diff --git a/README.md b/README.md
index 593f5872..a5a9d666 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,11 @@ Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. -->

- Friendli Logo
+
+
+
+ Friendli Logo
+

Supercharge Generative AI Serving with Friendli 🚀

diff --git a/friendli/cli/model.py b/friendli/cli/model.py index dff7bba3..6a50bd50 100644 --- a/friendli/cli/model.py +++ b/friendli/cli/model.py @@ -6,25 +6,12 @@ from __future__ import annotations -import os -from typing import Optional, cast - import typer -import yaml -from friendli.enums import CheckpointFileType, ModelDataType -from friendli.errors import ( - CheckpointConversionError, - InvalidConfigError, - NotFoundError, - NotSupportedQuantConfigError, - QuantizationError, -) from friendli.formatter import TableFormatter from friendli.sdk.client import Friendli -from friendli.utils.compat import model_dump, model_parse +from friendli.utils.compat import model_dump from friendli.utils.decorator import check_api -from friendli.utils.format import secho_error_and_exit app = typer.Typer( no_args_is_help=True, @@ -53,350 +40,3 @@ def list_models(): models = client.model.list() models_ = [model_dump(model) for model in iter(models)] table_formatter.render(models_) - - -@app.command() -def convert( - model_name_or_path: str = typer.Option( - ..., - "--model-name-or-path", - "-m", - help="Hugging Face pretrained model name or path to the saved model checkpoint.", - ), - output_dir: str = typer.Option( - ..., - "--output-dir", - "-o", - help=( - "Directory path to save the converted checkpoint and related configuration " - "files. Three files will be created in the directory: `model.h5`, " - "`tokenizer.json`, and `attr.yaml`. " - "The `model.h5` or `model.safetensors` is the converted checkpoint and can be renamed using " - "the `--output-model-filename` option. " - "The `tokenizer.json` is the Friendli-compatible tokenizer file, which should " - "be uploaded along with the checkpoint file to tokenize the model input " - "and output. " - "The `attr.yaml` is the checkpoint attribute file, to be used when uploading " - "the converted model to Friendli. You can designate the file name using " - "the `--output-attr-filename` option." - ), - ), - data_type: ModelDataType = typer.Option( - None, "--data-type", "-dt", help="The data type of converted checkpoint." - ), - cache_dir: Optional[str] = typer.Option( - None, "--cache-dir", help="Directory for downloading checkpoint." - ), - dry_run: bool = typer.Option( - False, "--dry-run", help="Only check conversion avaliability." - ), - output_model_file_name: str = typer.Option( - None, - "--output-model-filename", - help="Name of the converted checkpoint file." - "The default file name is `model.h5` when `--output-ckpt-file-type` is `hdf5` or `model.safetensors` when `--output-ckpt-file-type` is `safetensors`.", - ), - output_ckpt_file_type: CheckpointFileType = typer.Option( - CheckpointFileType.SAFETENSORS, - "--output-ckpt-file-type", - help="File format of the converted checkpoint file. The default output ckpt file type is `safetensors`.", - ), - output_attr_file_name: str = typer.Option( - "attr.yaml", - "--output-attr-filename", - help="Name of the checkpoint attribute file.", - ), - quantize: bool = typer.Option( - False, - "--quantize", - help="Quantize the model before conversion", - ), - quant_config_file: Optional[typer.FileText] = typer.Option( - None, - "--quant-config-file", - help="Path to the quantization configuration file.", - ), -): - """Convert huggingface's model checkpoint to Friendli format. - - When a checkpoint is in the Hugging Face format, it cannot be directly served. - It requires conversion to the Friendli format for serving. 
The conversion - process involves copying the original checkpoint and transforming it into a - checkpoint in the Friendli format (*.h5). - - :::caution - The `friendli model convert` is available only when the package is installed with - `pip install "friendli-client[mllib]"`. - ::: - - ### Apply quantization - - If you want to quantize the model along with the conversion, `--quantize` option - should be provided. You can customize the quantization configuration by describing - it in a YAML file and providing the path to the file to `--quant-config-file` - option. When `--quantize` option is used without providing `--quant-config-file`, - the following configuration is used by default. - - ```yaml - # Default quantization configuration - mode: awq - device: cuda:0 - seed: 42 - offload: true - calibration_dataset: - path_or_name: lambada - format: json - split: validation - lookup_column_name: text - num_samples: 128 - max_length: 512 - batch_size: 1 - awq_args: - quant_bit: 4 - quant_group_size: 64 - ``` - - - **`mode`**: Quantization scheme to apply. Defaults to "awq". - - **`device`**: Device to run the quantization process. Defaults to "cuda:0". - - **`seed`**: Random seed. Defaults to 42. - - **`offload`**: When enabled, this option significantly reduces GPU memory usage by offloading model layers onto CPU RAM. Defaults to true. - - **`calibration_dataset`** - - **`path_or_name`**: Path or name of the dataset. Datasets from either the Hugging Face Datasets Hub or local file system can be used. Defaults to "lambada". - - **`format`**: Format of datasets. Defaults to "json". - - **`split`**: Which split of the data to load. Defaults to "validation". - - **`lookup_column_name`**: The name of a column in the dataset to be used as calibration inputs. Defaults to "text". - - **`num_samples`**: The number of dataset samples to use for calibration. Note that the dataset will be shuffled before sampling. Defaults to 512. - - **`max_length`**: The maximum length of a calibration input sequence. Defauts to 512. - - **`batch_size`**: The number of samples to process in a single batch. Defaults to 1. - - **`awq_args`** (Fill in this field only for "awq" mode) - - **`quant_bit`** : Bit width of integers to represent weights. Possible values are `4` or `8`. Defaults to 4. - - **`quant_group_size`**: Group size of quantized matrices. 64 is the only supported value at this time. Defaults to 64. - - :::tip - If you encounter OOM issues when running with AWQ, try enabling the `offload` option. - ::: - - :::tip - If you set `percentile` in quant-config-file into 100, - the quantization range will be determined by the maximum absolute values of the activation tensors. - ::: - - :::info - Currently, [AWQ](https://arxiv.org/abs/2306.00978) is the only supported quantization scheme. 
- ::: - - :::info - AWQ is supported only for models with architecture listed as follows: - - - `GPTNeoXForCausalLM` - - `GPTJForCausalLM` - - `LlamaForCausalLM` - - `MPTForCausalLM` - ::: - - """ - # pylint: disable=too-many-branches - try: - # pylint: disable=import-outside-toplevel - from friendli.modules.converter.convert import convert_checkpoint - from friendli.modules.quantizer.schema.config import ( - AWQConfig, - OneOfQuantConfig, - QuantConfig, - ) - from friendli.modules.quantizer_v2.quantize import quantize_checkpoint - from friendli.modules.quantizer_v2.schema.config import Int8QuantConfig - - # pylint: enable=import-outside-toplevel - except ModuleNotFoundError as exc: - secho_error_and_exit(str(exc)) - - if not os.path.isdir(output_dir): - if os.path.exists(output_dir): - secho_error_and_exit(f"'{output_dir}' exists, but it is not a directory.") - os.mkdir(output_dir) - - quant_config: Optional[OneOfQuantConfig] = None - use_quantizer_v2 = False - if quantize: - if quant_config_file: - try: - quant_config_dict = cast(dict, yaml.safe_load(quant_config_file.read())) - except yaml.YAMLError as err: - secho_error_and_exit(f"Failed to load the quant config file: {err}") - if quant_config_dict["mode"] == "int8": - quant_config = model_parse( # type: ignore - Int8QuantConfig, quant_config_dict - ) - else: - quant_config = model_parse( - QuantConfig, {"config": quant_config_dict} - ).config - - # TODO(SA): All Quantization mode will be migrated to V2. After migration, please remove it. - else: - quant_config = AWQConfig() - - if isinstance(quant_config, Int8QuantConfig): - use_quantizer_v2 = True - - default_names = { - CheckpointFileType.HDF5: "model.h5", - CheckpointFileType.SAFETENSORS: "model.safetensors", - } - output_model_file_name = ( - output_model_file_name or default_names[output_ckpt_file_type] - ) - - if use_quantizer_v2: - if output_ckpt_file_type == CheckpointFileType.HDF5: - secho_error_and_exit( - f"int8 quantization only supports `safetensors` output_ckpt_file_type. Current output_ckpt_file_type: {output_ckpt_file_type}" - ) - try: - assert isinstance(quant_config, Int8QuantConfig) - quantize_checkpoint( - model_name_or_path=model_name_or_path, - output_dir=output_dir, - cache_dir=cache_dir, - dry_run=dry_run, - quant_config=quant_config, - ) - except (NotFoundError, QuantizationError, NotSupportedQuantConfigError) as exc: - secho_error_and_exit(str(exc)) - else: - try: - convert_checkpoint( - model_name_or_path=model_name_or_path, - output_model_file_name=output_model_file_name, - output_ckpt_file_type=output_ckpt_file_type, - output_attr_file_name=output_attr_file_name, - output_dir=output_dir, - data_type=data_type, - cache_dir=cache_dir, - dry_run=dry_run, - quantize=quantize, - quant_config=quant_config, - ) - except (NotFoundError, CheckpointConversionError, InvalidConfigError) as exc: - secho_error_and_exit(str(exc)) - - msg = ( - f"Checkpoint({model_name_or_path}) can be converted." - if dry_run - else f"Checkpoint({model_name_or_path}) has been converted successfully." - ) - typer.secho(msg) - - -@app.command() -def convert_adapter( - adapter_name_or_path: str = typer.Option( - ..., - "--adapter-name-or-path", - "-a", - help="Hugging Face pretrained adapter name or path to the saved adapter checkpoint.", - ), - output_dir: str = typer.Option( - ..., - "--output-dir", - "-o", - help=( - "Directory path to save the converted adapter checkpoint and related configuration " - "files. 
Two files will be created in the directory: `adapter.h5`, " - "and `attr.yaml`. " - "The `adapter.h5` is the converted checkpoint and can be renamed using " - "the `--output-adapter-filename` option. " - "The `attr.yaml` is the adapter checkpoint attribute file, to be used when uploading " - "the converted model to Friendli. You can designate the file name using " - "the `--output-attr-filename` option." - ), - ), - data_type: ModelDataType = typer.Option( - None, "--data-type", "-dt", help="The data type of converted checkpoint." - ), - base_model_name_or_path: Optional[str] = typer.Option( - None, - "--base-model-name-or-path", - "-b", - help=( - "Hugging Face model name or path to the saved backbone checkpoint. " - "By default, we use the `base_model_name_or_path` in adapter_config.json." - ), - ), - cache_dir: Optional[str] = typer.Option( - None, "--cache-dir", help="Directory for downloading checkpoint." - ), - dry_run: bool = typer.Option( - False, "--dry-run", help="Only check conversion avaliability." - ), - output_adapter_filename: str = typer.Option( - "adapter.h5", - "--output-adapter-filename", - help="Name of the converted adapter checkpoint file.", - ), - output_attr_filename: str = typer.Option( - "adapter_attr.yaml", - "--output-attr-filename", - help="Name of the adapter checkpoint attribute file.", - ), -) -> None: - """Convert huggingface's adapter checkpoint to Friendli format. - - When an adapter checkpoint is in the Hugging Face PEFT format, it cannot - be directly served in Friendli. It requires conversion to the Friendli format. - The conversion process involves copying the original adapter checkpoint and - transforming it into a checkpoint in the Friendli format (*.h5). - - This function does not include the `friendli model convert` command. i.e. - `friendli model convert-adapter` only converts adapter's parameters, not backbone's. - - :::caution - The `friendli model convert-adapter` is available only when the package is installed with - `pip install "friendli-client[mllib]"`. - ::: - - """ - try: - from friendli.modules.converter.convert import ( # pylint: disable=import-outside-toplevel - convert_adapter_checkpoint, - ) - except ModuleNotFoundError as exc: - secho_error_and_exit(str(exc)) - - if not os.path.isdir(output_dir): - if os.path.exists(output_dir): - secho_error_and_exit(f"'{output_dir}' exists, but it is not a directory.") - os.mkdir(output_dir) - - # Engine cannot load a Safetensors Lora ckpt yet. - output_adapter_file_type = CheckpointFileType.HDF5 - default_names = { - CheckpointFileType.HDF5: "adapter.h5", - CheckpointFileType.SAFETENSORS: "adapter.safetensors", - } - output_adapter_filename = ( - output_adapter_filename or default_names[output_adapter_file_type] - ) - - try: - convert_adapter_checkpoint( - adapter_name_or_path=adapter_name_or_path, - output_attr_filename=output_attr_filename, - output_dir=output_dir, - output_adapter_filename=output_adapter_filename, - base_model_name_or_path=base_model_name_or_path, - data_type=data_type, - output_adapter_file_type=output_adapter_file_type, - cache_dir=cache_dir, - dry_run=dry_run, - ) - except (NotFoundError, CheckpointConversionError, InvalidConfigError) as exc: - secho_error_and_exit(str(exc)) - - msg = ( - f"Checkpoint({adapter_name_or_path}) can be converted." - if dry_run - else f"Checkpoint({adapter_name_or_path}) has been converted successfully." 
- ) - typer.secho(msg) diff --git a/friendli/modules/__init__.py b/friendli/modules/__init__.py deleted file mode 100644 index e603ace1..00000000 --- a/friendli/modules/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli modules.""" diff --git a/friendli/modules/converter/__init__.py b/friendli/modules/converter/__init__.py deleted file mode 100644 index d0213cf4..00000000 --- a/friendli/modules/converter/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli model converter.""" diff --git a/friendli/modules/converter/base.py b/friendli/modules/converter/base.py deleted file mode 100644 index 9eaca2ec..00000000 --- a/friendli/modules/converter/base.py +++ /dev/null @@ -1,560 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Converter.""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from collections.abc import Generator -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union, cast - -import numpy as np -import torch -from peft import PeftType # type: ignore[import] # pylint: disable=import-error -from peft.config import PeftConfig -from peft.tuners.lora import ( # type: ignore[import] # pylint: disable=import-error - LoraConfig, -) -from transformers import GenerationConfig, PretrainedConfig # type: ignore[import] - -from friendli.enums import ModelDataType -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.interface import ( - DecoderTFBlockConversionInterface, - EncoderTFBlockConversionInterface, - ModelConversionInterface, - NonTFBlockConversionInterface, -) -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import get_model_data_type - -SUPPORTED_GELU_FAMILY = [ - "gelu", - "gelu_fast", - "gelu_new", - "gelu_python", - "gelu_pytorch_tanh", - "gelu_accurate", -] -SUPPORTED_HEAD_SIZE = [64, 80, 96, 128, 256] - -MODEL_TYPE_TO_SUPPORTED_LORA_TARGET_MODULES_MAP = { - "gptj": {"q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"}, - "llama": { - "q_proj", - "k_proj", - "v_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - }, - "mistral": { - "q_proj", - "k_proj", - "v_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - }, - "mpt": {"Wqkv", "out_proj", "up_proj", "down_proj"}, -} -# TODO: remove this const map when engine supports lm head LoRA -MODEL_TYPE_TO_UNSUPPORTED_LORA_TARGET_MODULES_MAP = { - "gptj": {"lm_head"}, - "llama": {"lm_head"}, - "mistral": {"lm_head"}, - "mpt": {"lm_head"}, -} - -ENCODER_PREFIX = "encoder" -DECODER_PREFIX = "decoder" - - -class AbstractConverter(ModelConversionInterface, ABC): - """Abstract class for converting Hugging Face checkpoint to Friendli checkpoint. - - Attributes: - config (PreTrainedConfig): Hugging Face model configuration. - generation_config (Optional[GenerationConfig]): Hugginface generation config. - When set to None, `config` is used for configuring generation. - data_type (Optional(ModelDataType)): Data type for the Friendli checkpoint. 
- - """ - - def __init__( - self, - config: PretrainedConfig, - generation_config: Optional[GenerationConfig], - data_type: Optional[ModelDataType], - ) -> None: - """Initialize converter.""" - self.config = config - self.generation_config = generation_config - self.data_type = ( - data_type if data_type else get_model_data_type(config.torch_dtype) - ) - - def get_eos_token_id(self) -> Optional[int]: - """Get ID of EOS token.""" - generation_eos_token_id = None - if self.generation_config is not None: - generation_eos_token_id = self.generation_config.eos_token_id - - config_eos_token_id = self.config.eos_token_id - - if generation_eos_token_id is None: - eos_token_id = config_eos_token_id - else: - if generation_eos_token_id != config_eos_token_id: - logger.warn( - "'eos_token' is different in generation_config (%s) and config (%s). " - "Please fill the correct value.", - generation_eos_token_id, - config_eos_token_id, - ) - eos_token_id = None - else: - eos_token_id = config_eos_token_id - - if eos_token_id is None: - logger.warn( - "'eos_token' cannot be automatically configured. " - "Please fill in the field by yourself." - ) - - return eos_token_id - - def token_embed_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape embedding layer's weight to Friendli format. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped embedding weight. - - """ - assert len(params) == 1 - return params[0] - - def pos_embed_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape position embedding layer's weight to Friendli format. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped position embedding weight. - """ - assert len(params) == 1 - return params[0] - - def head_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape head layer's weight to Friendli format. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped head weight. - - """ - assert len(params) == 1 - return params[0] - - def linear_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape linear layer's weight to Friendli format. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped linear weight. - - """ - assert len(params) == 1 - param = params[0].transpose(0, 1) - return param - - def linear_bias_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape linear layer's bias to Friendli format. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped linear bias. 
- - """ - assert len(params) == 1 - return params[0] - - def ln_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape layer norm layer's weight to Friendli format. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped layer norm weight. - - """ - assert len(params) == 1 - return params[0] - - def ln_bias_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape layer norm layer's bias to Friendli format. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped layer norm bias. - - """ - assert len(params) == 1 - return params[0] - - def qkv_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape qkv layer's weight to Friendli format. - - In the original checkpoint, the qkv weight is stored as a single tensor or - separated by three tensors. In the Friendli checkpoint, it is stored as a single tensor. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped qkv weight. - - """ - param = torch.cat(params, dim=0) - param = param.transpose(0, 1) - return param - - def qkv_bias_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape qkv layer's bias to Friendli format. - - In the original checkpoint, the qkv weight is stored as a single tensor or - separated by three tensors. In the Friendli checkpoint, it is stored as a single tensor. - - Args: - state_dict (Dict[str, torch.Tensor]): The state_dict of the original checkpoint. - layer (str): The layer name of the original checkpoint. - per_layer_postfixes (List[str]): The list of postfixes of the layer. - - Returns: - The tensor of reshaped qkv bias. 
- - """ - param = torch.cat(params, dim=0) - return param - - -class DecoderOnlyConverter( - AbstractConverter, - NonTFBlockConversionInterface, - DecoderTFBlockConversionInterface, -): - """Converter for Decoder-Only models.""" - - def check_config(self) -> None: - """Check if a convertible form of the checkpoint from the decoder-only model config.""" - super().check_config() - if self.decoder_head_size not in SUPPORTED_HEAD_SIZE: - raise NotSupportedCheckpointError( - invalid_option=f"decoder_head_size={self.decoder_head_size}", - valid_options=SUPPORTED_HEAD_SIZE, - ) - - def get_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Get List of conversion informations for Decoder-Only model.""" - return self.non_transformer_convert_info_list + self.decoder_convert_info_list - - -class EncoderDecoderConverter( - AbstractConverter, - NonTFBlockConversionInterface, - EncoderTFBlockConversionInterface, - DecoderTFBlockConversionInterface, -): - """Converter for Encoder-Decoder models.""" - - def check_config(self) -> None: - """Check if a convertible form of the checkpoint from the encoder-decoder model config.""" - if self.decoder_head_size not in SUPPORTED_HEAD_SIZE: - raise NotSupportedCheckpointError( - invalid_option=f"decoder_head_size={self.decoder_head_size}", - valid_options=SUPPORTED_HEAD_SIZE, - ) - - def get_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Get list of conversion informations for Encoder-Decoder model.""" - return ( - self.non_transformer_convert_info_list - + self.decoder_convert_info_list - + self.encoder_convert_info_list - ) - - def get_decoder_start_token_id(self) -> Optional[int]: - """Get ID of decoder start token.""" - generation_decoder_start_token_id = None - if self.generation_config is not None: - generation_decoder_start_token_id = ( - self.generation_config.decoder_start_token_id - ) - - config_decoder_start_token_id = self.config.decoder_start_token_id - - if generation_decoder_start_token_id is None: - decoder_start_token_id = config_decoder_start_token_id - else: - if generation_decoder_start_token_id != config_decoder_start_token_id: - logger.warn( - "'decoder_start_token_id' is different in generation_config " - "(%s) and config (%s). Please fill the correct value.", - generation_decoder_start_token_id, - config_decoder_start_token_id, - ) - decoder_start_token_id = None - else: - decoder_start_token_id = config_decoder_start_token_id - - if decoder_start_token_id is None: - logger.warn( - "'decoder_start_token' cannot be automatically configured. " - "Please fill in the field by yourself." 
- ) - - return decoder_start_token_id - - -class DecoderOnlyLoraConverter(AbstractConverter): - """Converter for LoRA modules in the models.""" - - def __init__( - self, - converter: AbstractConverter, - adapter_config: PeftConfig, - ) -> None: - """Initialize LoRA Converter.""" - super().__init__( - config=converter.config, - generation_config=converter.generation_config, - data_type=converter.data_type, - ) - self.converter = cast(DecoderOnlyConverter, converter) - self.adapter_config = cast(LoraConfig, adapter_config) - - def check_config(self) -> None: - """Check if a convertible form of the checkpoint from the LoRAconfig.""" - if self.adapter_config.peft_type != PeftType.LORA: - raise NotSupportedCheckpointError( - invalid_option=f"peft_type={self.adapter_config.peft_type}", - valid_options=[str(PeftType.LORA)], - ) - if ( - self.config.model_type - not in MODEL_TYPE_TO_SUPPORTED_LORA_TARGET_MODULES_MAP - ): - raise NotSupportedCheckpointError( - invalid_option=f"model_type={self.config.model_type} for LORA", - valid_options=list( - MODEL_TYPE_TO_SUPPORTED_LORA_TARGET_MODULES_MAP.keys() - ), - ) - if ( - self.adapter_config.layers_pattern is not None - and len(self.adapter_config.layers_pattern) > 0 - ): - raise NotSupportedCheckpointError( - invalid_option=f"layers_pattern={self.adapter_config.layers_pattern}", - valid_options=[None, [], ""], - ) - if ( - self.adapter_config.rank_pattern is not None - and len(self.adapter_config.rank_pattern) > 0 - ): - raise NotSupportedCheckpointError( - invalid_option=f"rank_pattern={self.adapter_config.rank_pattern}", - valid_options=[None, {}], - ) - if ( - self.adapter_config.alpha_pattern is not None - and len(self.adapter_config.alpha_pattern) > 0 - ): - raise NotSupportedCheckpointError( - invalid_option=f"alpha_pattern={self.adapter_config.alpha_pattern}", - valid_options=[None, {}], - ) - - if self.adapter_config.target_modules is not None: - for target_module in self.adapter_config.target_modules: - if ( - target_module - not in MODEL_TYPE_TO_SUPPORTED_LORA_TARGET_MODULES_MAP[ - self.config.model_type - ] - ): - if ( - target_module - in MODEL_TYPE_TO_UNSUPPORTED_LORA_TARGET_MODULES_MAP[ - self.config.model_type - ] - ): - raise NotSupportedCheckpointError( - invalid_option=f"target_module={target_module}", - valid_options=list( - MODEL_TYPE_TO_SUPPORTED_LORA_TARGET_MODULES_MAP[ - self.config.model_type - ] - ), - ) - - logger.warn( - "Target module %s does not exist in the base model (%s). 
Will be ignored.", - target_module, - self.adapter_config.base_model_name_or_path, - ) - - if (self.adapter_config.layers_to_transform is not None) and ( - self.adapter_config != list(range(self.converter.decoder_layer_num)) - ): - raise NotSupportedCheckpointError( - invalid_option=f"layers_to_transform={self.adapter_config.layers_to_transform}", - valid_options=[ - f"layers_to_transform=None" - f"layers_to_transform={list(range(self.converter.decoder_layer_num))}", - ], - ) - - def get_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Get convert dict for LoRA model.""" - return self.adapter_convert_info_list - - def _get_layers_to_transform(self) -> List[int]: - layers_to_transform = cast(LoraConfig, self.adapter_config).layers_to_transform - if layers_to_transform is None: - layers_to_transform = list(range(self.converter.decoder_layer_num)) - else: - if isinstance(layers_to_transform, int): - layers_to_transform = [layers_to_transform] - return layers_to_transform - - def lora_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """Reshape LoRA layer's weight to Friendli format.""" - assert len(params) == 1 - return params[0].transpose(0, 1) - - def pre_convert(self, model: torch.nn.Module) -> torch.nn.Module: - """Preprocess the adapter modules before converting. - - All the parameters of the LoRA low-rank matrixs are converted by `lora_weight_reshape`. - If the parameter can't be converted by `lora_weight_reshape`, - - """ - return model - - def convert( # pylint: disable=too-many-locals - self, - model: torch.nn.Module, - convert_info_list: List[ConvertInfo], - save_numpy_format: bool = True, - ) -> Generator[Tuple[str, Union[np.ndarray, torch.Tensor]], None, None]: - """Reshape Lora adapter model's all layer to Friendli format.""" - model = self.pre_convert(model) - yield from self.converter.convert(model, convert_info_list, save_numpy_format) - - def get_attributes(self) -> Dict[str, Any]: - """Get adapter checkpoint attributes.""" - return { - "name": "FILL ME", - "type": "lora", - "alpha": self.adapter_config.lora_alpha, - "rank": self.adapter_config.r, - "target-modules": list(self.adapter_target_modules), - "ckpt-path": "FILL ME", - } - - @property - def adapter_target_modules(self) -> Set[str]: - """Return the target modules that LoRA applies to.""" - if isinstance(self.adapter_config.target_modules, str): - hf_target_modules = {self.adapter_config.target_modules} - elif isinstance(self.adapter_config.target_modules, Iterable): - hf_target_modules = set(self.adapter_config.target_modules) - else: - raise CheckpointConversionError("`target_modules` should not be None") - - translated_target_modules = set() - for target in hf_target_modules: - if target in self.adapter_target_module_map: - translated_target_modules.add(self.adapter_target_module_map[target]) - - return translated_target_modules - - @property - @abstractmethod - def adapter_target_module_map(self) -> Dict[str, str]: - """Return the dictionary that maps Hugging Face's module name to Friendli's module name.""" - - @property - @abstractmethod - def adapter_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for LoRA modules of the model.""" - - -OneOfAdapterConverter = DecoderOnlyLoraConverter -OneOfConverter = Union[EncoderDecoderConverter, DecoderOnlyConverter] - - -class FP8OnlyConverter(DecoderOnlyConverter): - """FP8Only Architectures Converter Class.""" - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint 
attributes.""" - raise NotImplementedError("Not supported in FP8 Conversion.") - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks.""" - raise NotImplementedError("Not supported in FP8 Conversion.") - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks.""" - raise NotImplementedError("Not supported in FP8 Conversion.") diff --git a/friendli/modules/converter/convert.py b/friendli/modules/converter/convert.py deleted file mode 100644 index 4e4338e0..00000000 --- a/friendli/modules/converter/convert.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Converter.""" - -from __future__ import annotations - -import os -from typing import Optional - -import yaml -from peft import PeftModel # type: ignore[import] # pylint: disable=import-error - -from friendli.enums import CheckpointFileType, ModelDataType, QuantMode -from friendli.errors import TokenizerNotFoundError -from friendli.logging import logger -from friendli.modules.converter.saver import get_saver -from friendli.utils.validate import validate_convert_imports - -validate_convert_imports() -# pylint: disable=import-outside-toplevel, wrong-import-position, wrong-import-order, ungrouped-imports -import torch # type: ignore[import] -from accelerate import init_empty_weights # type: ignore[import] - -from friendli.modules.converter.maps import ( - get_adapter_converter_factory, - get_hf_converter_factory, -) -from friendli.modules.converter.utils import ( - get_adapter_config, - get_model_arch, - get_model_generation_config, - get_model_pretrained_config, - get_torch_data_type, - save_tokenizer, -) -from friendli.modules.quantizer.maps import get_quantized_converter -from friendli.modules.quantizer.schema.config import OneOfQuantConfig - -# pylint: enable=import-outside-toplevel, wrong-import-position, wrong-import-order, ungrouped-imports - - -def convert_checkpoint( # pylint: disable=too-many-branches - model_name_or_path: str, - output_model_file_name: str, - output_attr_file_name: str, - output_dir: str, - output_ckpt_file_type: CheckpointFileType, - *, - data_type: Optional[ModelDataType] = None, - cache_dir: Optional[str] = None, - dry_run: bool = False, - quantize: bool = False, - quant_config: Optional[OneOfQuantConfig] = None, -) -> None: - """Convert HuggingFace model checkpoint to Friendli format. - - Args: - model_name_or_path (str): Hugging Face model name or local path to the checkpoint. - output_model_file_name (str): File name of converted checkpoint to save. - output_attr_file_name (str): File name of the attribute YAML file for - the converted checkpoint. - output_dir (str) : Directory path to save the converted checkpoint and the attribute YAML, - and tokenizer configuration file. - output_ckpt_file_type (CheckpointFileType): The file type of converted checkpoint. - data_type (Optional[ModelDataType]): Converted checkpoint data type. - Defaults to torch_dtype in 'config.json' - attr_output_path (Optional[str], optional): Path to create the attribute YAML file for - the converted checkpoint. Defaults to None. - cache_dir (Optional[str], optional): Path for downloading checkpoint. Defaults to None. - dry_run (bool, optional): Check only if checkpoint is convertable. Defaults to False. - quantize (bool, optional): Enable quantization. Defaults to False. 
- quant_config (Optional[OneOfQuantConfig], optional): Quantization configuration. - Defaults to None. - - Raises: - InValidconfigError: Raised when data_type is not supported. - NotFoundError: Raised when `model_name_or_path` or `tokenizer_output_dir` is not found. - NotSupportedCheckpointError: Raised when model architecture is not supported to convert. - - """ - # pylint: disable=too-many-locals - model_output_path = os.path.join(output_dir, output_model_file_name) - model_config = get_model_pretrained_config( - model_name_or_path, model_output_path, cache_dir - ) - generation_config = get_model_generation_config(model_name_or_path, cache_dir) - - model_arch = get_model_arch(model_config) - hf_factory, converter_factory = get_hf_converter_factory(model_arch) - converter = converter_factory( - config=model_config, - generation_config=generation_config, - data_type=data_type, - ) - - if quantize: - assert quant_config is not None - # common quantization only supports `.safetensors`` output format. - if quant_config.mode == QuantMode.FP8: - assert output_ckpt_file_type == CheckpointFileType.SAFETENSORS - converter = get_quantized_converter( # type: ignore[assignment] - quant_config, converter - ) - - converter.check_config() - - if not dry_run: - logger.info( - "Start loading Hugging Face checkpoint(%s) for conversion...", - model_name_or_path, - ) - model = hf_factory.from_pretrained( - model_name_or_path, - torch_dtype=model_config.torch_dtype, - cache_dir=cache_dir, - trust_remote_code=True, - low_cpu_mem_usage=True, - # `low_cpu_mem_usage` is for model loading faster and using ~1x model size CPU memory. - # https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained.example - ) - - logger.info( - "Hugging Face checkpoint(%s) is successfully loaded!", - model_name_or_path, - ) - - convert_info_list = converter.get_convert_info_list() - with get_saver( - output_ckpt_file_type, output_dir, output_model_file_name - ) as saver: - for name, w in converter.convert( - model, - convert_info_list, - output_ckpt_file_type == CheckpointFileType.HDF5, - ): - saver.save_tensor(name, w) - - logger.info( - "Hugging Face checkpoint(%s) is successfully converted to Friendli format!", - model_name_or_path, - ) - - # Save attr.yaml - attr_output_path = os.path.join(output_dir, output_attr_file_name) - if quant_config and quant_config.mode == QuantMode.FP8 and ModelDataType.FP8_E4M3: - model_config.torch_dtype = ( - get_torch_data_type(data_type) if data_type else model_config.torch_dtype - ) - setattr(model_config, "use_fp8_e4m3", True) - model_config.to_json_file(os.path.join(output_dir, "config.json")) - else: - attr = converter.get_attributes() - with open(attr_output_path, "w", encoding="utf-8") as file: - yaml.dump(attr, file, sort_keys=False) - - # Save tokenizer files. 
- tokenizer_output_dir = output_dir - try: - saved_tokenizer_file_paths = save_tokenizer( - model_name_or_path=model_name_or_path, - cache_dir=cache_dir, - save_dir=tokenizer_output_dir, - ) - except TokenizerNotFoundError as exc: - logger.warn(str(exc)) - - if not ( - quant_config and quant_config.mode == QuantMode.FP8 and ModelDataType.FP8_E4M3 - ): - for path in saved_tokenizer_file_paths: - if "tokenizer.json" not in path: - try: - os.remove(path) - except FileNotFoundError: - logger.warn( - "Tried to delete unnecessary tokenizer file %s but the file " - "is not found.", - path, - ) - - -def convert_adapter_checkpoint( # pylint: disable=too-many-locals, too-many-arguments - adapter_name_or_path: str, - output_attr_filename: str, - output_dir: str, - output_adapter_filename: str, - base_model_name_or_path: Optional[str], - data_type: Optional[ModelDataType], - output_adapter_file_type: CheckpointFileType, - cache_dir: Optional[str], - dry_run: bool = False, -) -> None: - """Convert HuggingFace model checkpoint to Friendli format.""" - adapter_attr_output_path = os.path.join(output_dir, output_attr_filename) - adapter_config = get_adapter_config(adapter_name_or_path, cache_dir) - base_model_name_or_path = ( - base_model_name_or_path or adapter_config.base_model_name_or_path - ) - model_config = get_model_pretrained_config( - base_model_name_or_path, - adapter_attr_output_path, - cache_dir, - ) - model_arch = get_model_arch(model_config) - hf_factory, converter_factory = get_hf_converter_factory(model_arch) - converter = converter_factory( - config=model_config, - generation_config=None, - data_type=data_type, - ) - adapter_converter = get_adapter_converter_factory(model_arch)( - converter, adapter_config - ) - adapter_converter.check_config() - - if not dry_run: - logger.info( - "Start loading Hugging Face adapter checkpoint(%s's %s) for conversion...", - base_model_name_or_path, - adapter_name_or_path, - ) - with init_empty_weights(): - model = hf_factory.from_pretrained( - base_model_name_or_path, - torch_dtype=torch.float32, - cache_dir=cache_dir, - trust_remote_code=True, - low_cpu_mem_usage=True, - ) - # inplace model update - PeftModel.from_pretrained( - model, adapter_name_or_path, cache_dir=cache_dir, torch_dtype=torch.float32 - ) - logger.info( - "Hugging Face adapter checkpoint (%s) is successfully loaded!", - adapter_name_or_path, - ) - convert_dict = adapter_converter.get_convert_info_list() - with get_saver( - output_adapter_file_type, output_dir, output_adapter_filename - ) as saver: - for name, w in adapter_converter.convert( - model, convert_dict, output_adapter_file_type == CheckpointFileType.HDF5 - ): - saver.save_tensor(name, w) - - logger.info( - "Hugging Face checkpoint (%s) is successfully converted to Friendli format!", - adapter_name_or_path, - ) - - attr = adapter_converter.get_attributes() - with open(adapter_attr_output_path, "w", encoding="utf-8") as file: - yaml.dump([attr], file, sort_keys=False) diff --git a/friendli/modules/converter/interface.py b/friendli/modules/converter/interface.py deleted file mode 100644 index 6e7db352..00000000 --- a/friendli/modules/converter/interface.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Model Converter Interface.""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from collections.abc import Generator -from typing import Any, Dict, List, Tuple, Union - -import numpy as np -import torch -from tqdm import tqdm - -from friendli.enums import ModelDataType -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import ( - convert_tensor_dtype, - get_tensor_from_state_dict, -) - - -class ModelConversionInterface(ABC): - """Interface get information for converting models.""" - - @abstractmethod - def get_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Get list of conversion informations for the model.""" - - @abstractmethod - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - - @abstractmethod - def check_config(self) -> None: - """Check if the model is convertable.""" - - def convert( - self, - model: torch.nn.Module, - convert_info_list: List[ConvertInfo], - save_numpy_format: bool = True, - ) -> Generator[Tuple[str, Union[np.ndarray, torch.Tensor]], None, None]: - """Convert Huggingface Model to Friendli format(.h5). - - Args: - model (torch.nn.Module): Huggingface model. - output_path (str): Path to save the converted checkpoint. - convert_info_list (List[ConvertInfo]): - List of convert information of the parameter in huggingface checkpoint. - save_numpy_format (bool, optional): Save the converted tensor in numpy format. - Defaults to True. - """ - state_dict = model.state_dict() - total_layers = len(convert_info_list) - with tqdm(total=total_layers, desc="Converting", unit="tensor") as pbar: - for convert_info in convert_info_list: - converted_name, reshape_fn, param_names, data_type = ( - convert_info.converted_name, - convert_info.reshape_fn, - convert_info.param_names, - convert_info.data_type, - ) - params = [ - get_tensor_from_state_dict(state_dict, param_name) - for param_name in param_names - ] - reshaped_tensor = convert_tensor_dtype(reshape_fn(params), data_type) - if save_numpy_format: - yield ( - converted_name, - reshaped_tensor.view(torch.float16).numpy().view(np.uint16) - if data_type == ModelDataType.BF16 - else reshaped_tensor.numpy(), - ) - else: - yield ( - converted_name, - reshaped_tensor.contiguous(), - ) - - pbar.update() - - -class NonTFBlockConversionInterface(ABC): - """Interface get information for converting common layers.""" - - @property - @abstractmethod - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for the non-transformer blocks.""" - - -class DecoderTFBlockConversionInterface(ABC): - """Interface get information for converting decoder layers.""" - - @property - @abstractmethod - def decoder_layer_prefix(self) -> str: - """Return the layer name prefix used before the decoder's transformer block number.""" - - @property - @abstractmethod - def decoder_layer_num(self) -> int: - """Return the number of transformer blocks in the decoder.""" - - @property - @abstractmethod - def decoder_hidden_size(self) -> int: - """Return the hidden size of the decoder.""" - - @property - @abstractmethod - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads.""" - - @property - @abstractmethod - def decoder_num_attention_heads(self) -> int: - """Return the number of attention heads in the decoder.""" - - @property - @abstractmethod - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of 
conversion informations for transformer blocks in the decoder.""" - - @property - @abstractmethod - def decoder_head_size(self) -> int: - """Return the head size of the decoder.""" - - @property - @abstractmethod - def decoder_ff_intermediate_size(self) -> int: - """Return the intermediate size of the linear layer in decoder's MLP.""" - - -class EncoderTFBlockConversionInterface(ABC): - """Interface get information for converting encoder layers.""" - - @property - @abstractmethod - def encoder_layer_prefix(self) -> str: - """Return the layer name prefix used before the encoder's transformer block number.""" - - @property - @abstractmethod - def encoder_layer_num(self) -> int: - """Return the number of transformer blocks in the encoder.""" - - @property - @abstractmethod - def encoder_hidden_size(self) -> int: - """Return the hidden size of the encoder.""" - - @property - @abstractmethod - def encoder_num_attention_heads(self) -> int: - """Return the number of attention heads in the encoder.""" - - @property - @abstractmethod - def encoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for transformer blocks in the encoder.""" - - @property - @abstractmethod - def encoder_head_size(self) -> int: - """Return the head size of the encoder.""" - - @property - @abstractmethod - def encoder_ff_intermediate_size(self) -> int: - """Return the intermediate size of the linear layer in encoder's MLP.""" - - -class RotaryEmbeddingConversionInterface(ABC): - """Interface get information for converting rotary embeddings.""" - - @property - @abstractmethod - def rotary_dim(self) -> int: - """Return the dimension of rotary embeddings.""" - - @property - @abstractmethod - def rotary_emb_base(self) -> float: - """Return the base of rotary embeddings.""" diff --git a/friendli/modules/converter/maps.py b/friendli/modules/converter/maps.py deleted file mode 100644 index 7a8bcd37..00000000 --- a/friendli/modules/converter/maps.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Defining Friendli Model Converter maps.""" - -from __future__ import annotations - -from typing import Dict, Tuple, Type, Union - -from transformers import ( # type: ignore[import] - AutoModelForCausalLM, - BlenderbotForConditionalGeneration, - BloomForCausalLM, - CodeGenForCausalLM, - CohereForCausalLM, - DbrxForCausalLM, - FalconForCausalLM, - GPT2LMHeadModel, - GPTJForCausalLM, - GPTNeoXForCausalLM, - LlamaForCausalLM, - MistralForCausalLM, - MixtralForCausalLM, - MptForCausalLM, - OPTForCausalLM, - Phi3ForCausalLM, - PreTrainedModel, - T5ForConditionalGeneration, -) - -from friendli.errors import NotSupportedCheckpointError -from friendli.modules.converter.base import OneOfAdapterConverter, OneOfConverter -from friendli.modules.converter.models.arctic import ArcticForCausalLMConverter -from friendli.modules.converter.models.blenderbot import BlenderbotConverter -from friendli.modules.converter.models.bloom import BloomForCausalLMConverter -from friendli.modules.converter.models.codegen import CodegenForCausalLMConverter -from friendli.modules.converter.models.cohere import CohereForCausalLMConverter -from friendli.modules.converter.models.dbrx import DbrxForCausalLMConverter -from friendli.modules.converter.models.falcon import FalconForCausalLMConverter -from friendli.modules.converter.models.gpt2 import GPT2LMHeadModelConverter -from friendli.modules.converter.models.gpt_neox import GPTNeoXForCausalLMConverter -from friendli.modules.converter.models.gptj import ( - GPTJForCausalLMConverter, - GPTJForCausalLMLoraConverter, -) -from friendli.modules.converter.models.llama import ( - LlamaForCausalLMConverter, - LlamaForCausalLMLoraConverter, -) -from friendli.modules.converter.models.mistral import ( - MistralForCausalLMConverter, - MistralForCausalLMLoraConverter, -) -from friendli.modules.converter.models.mixtral import MixtralForCausalLMConverter -from friendli.modules.converter.models.mpt import ( - MPTForCausalLMConverter, - MptForCausalLMLoraConverter, -) -from friendli.modules.converter.models.opt import OPTForCausalLMConverter -from friendli.modules.converter.models.phi3 import Phi3ForCausalLMConverter -from friendli.modules.converter.models.phi_msft import PhiForCausalLMConverter -from friendli.modules.converter.models.t5 import T5Converter - -MODEL_ARCH_CONVERTER_MAP: Dict[ - str, Tuple[Union[PreTrainedModel, PreTrainedModel], Type[OneOfConverter]] -] = { - "BlenderbotForConditionalGeneration": ( - BlenderbotForConditionalGeneration, - BlenderbotConverter, - ), - "BloomForCausalLM": (BloomForCausalLM, BloomForCausalLMConverter), - "CodeGenForCausalLM": (CodeGenForCausalLM, CodegenForCausalLMConverter), - "FalconForCausalLM": (FalconForCausalLM, FalconForCausalLMConverter), - "GPTNeoXForCausalLM": (GPTNeoXForCausalLM, GPTNeoXForCausalLMConverter), - "GPT2LMHeadModel": (GPT2LMHeadModel, GPT2LMHeadModelConverter), - "GPTJForCausalLM": (GPTJForCausalLM, GPTJForCausalLMConverter), - "LlamaForCausalLM": (LlamaForCausalLM, LlamaForCausalLMConverter), - "LLaMAForCausalLM": (LlamaForCausalLM, LlamaForCausalLMConverter), - "MistralForCausalLM": (MistralForCausalLM, MistralForCausalLMConverter), - "MixtralForCausalLM": (MixtralForCausalLM, MixtralForCausalLMConverter), - "MPTForCausalLM": (MptForCausalLM, MPTForCausalLMConverter), - "OPTForCausalLM": (OPTForCausalLM, OPTForCausalLMConverter), - "T5ForConditionalGeneration": (T5ForConditionalGeneration, T5Converter), - "PhiForCausalLM": (AutoModelForCausalLM, PhiForCausalLMConverter), - "CohereForCausalLM": (CohereForCausalLM, 
CohereForCausalLMConverter), - "DbrxForCausalLM": (DbrxForCausalLM, DbrxForCausalLMConverter), - "Phi3ForCausalLM": (Phi3ForCausalLM, Phi3ForCausalLMConverter), - "ArcticForCausalLM": (AutoModelForCausalLM, ArcticForCausalLMConverter), -} - -MODEL_ARCH_ADAPTER_CONVERTER_MAP: Dict[ - str, - Type[OneOfAdapterConverter], -] = { - "GPTJForCausalLM": GPTJForCausalLMLoraConverter, - "LlamaForCausalLM": LlamaForCausalLMLoraConverter, - "LLaMAForCausalLM": LlamaForCausalLMLoraConverter, - "MPTForCausalLM": MptForCausalLMLoraConverter, - "MistralForCausalLM": MistralForCausalLMLoraConverter, -} - - -def get_hf_converter_factory( - model_arch: str, -) -> Tuple[PreTrainedModel, Type[OneOfConverter]]: - """Return the converter factory for the given model architecture. - - Args: - model_arch (str): Model architecture name. - - Returns: - Tuple[PretrainedModel, Type[OneOfConverter]]: Tuple of - model class and converter class. - - Raises: - NotSupportedCheckpointError: Raised when the given model architecture is not supported. - - """ - if model_arch not in MODEL_ARCH_CONVERTER_MAP: - raise NotSupportedCheckpointError( - invalid_option=f"Model architecture='{model_arch}'", - valid_options=list(MODEL_ARCH_CONVERTER_MAP.keys()), - ) - - return MODEL_ARCH_CONVERTER_MAP[model_arch] - - -def get_adapter_converter_factory( - model_arch: str, -) -> Type[OneOfAdapterConverter]: - """Return the converter factory for the given model architecture. - - Args: - model_arch (str): Model architecture name. - - Returns: - Type[LoraConverter]: Adapter Converter class. - - Raises: - NotSupportedCheckpointError: Raised when the given model architecture is not supported. - """ - try: - adapter_converter_type = MODEL_ARCH_ADAPTER_CONVERTER_MAP[model_arch] - except KeyError as exc: - raise NotSupportedCheckpointError( - invalid_option=f"adapter for model architecture='{model_arch}'", - valid_options=list(MODEL_ARCH_ADAPTER_CONVERTER_MAP.keys()), - ) from exc - return adapter_converter_type diff --git a/friendli/modules/converter/models/arctic.py b/friendli/modules/converter/models/arctic.py deleted file mode 100644 index 293d21d9..00000000 --- a/friendli/modules/converter/models/arctic.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli Arctic Checkpoint Converter.""" - - -from __future__ import annotations - -from typing import cast - -from transformers import PretrainedConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import FP8OnlyConverter -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface - - -class ArcticConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an - Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config.. - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Arctic model. 
Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`ArcticModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 14336): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to `4096*32`): - The maximum sequence length that this model might ever be used with. Arctic's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*): - Sliding window attention window size. If not specified, will default to `4096`. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - num_experts_per_tok (`int`, *optional*, defaults to 2): - The number of experts to root per-token, can be also interpreted as the `top-p` routing - parameter - num_local_experts (`int`, *optional*, defaults to 8): - Number of experts per Sparse MLP layer. - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): - The aux loss factor for the total loss. - ```python - >>> from transformers import ArcticModel, ArcticConfig - >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to. 
- >>> configuration = ArcticConfig() - >>> # Initializing a model from the Arctic 7B style configuration - >>> model = ArcticModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "arctic" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=1e6, - sliding_window=None, - attention_dropout=0.0, - num_experts_per_tok=1, - num_local_experts=8, - router_aux_loss_coef=0.001, - moe_layer_frequency=2, - parallel_attn_mlp_res=False, - moe_train_capacity_factor=1, - moe_eval_capacity_factor=1, - enable_expert_tensor_parallelism=False, - moe_min_capacity=0, - moe_token_dropping=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.router_aux_loss_coef = router_aux_loss_coef - self.moe_layer_frequency = moe_layer_frequency - self.moe_train_capacity_factor = moe_train_capacity_factor - self.moe_eval_capacity_factor = moe_eval_capacity_factor - self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism - self.moe_min_capacity = moe_min_capacity - self.moe_token_dropping = moe_token_dropping - self.parallel_attn_mlp_res = parallel_attn_mlp_res - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - -class ArcticForCausalLMConverter(FP8OnlyConverter, RotaryEmbeddingConversionInterface): - """ArcticForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if Arctic architectures' config can be converted to Friendli format.""" - super().check_config() - config = cast(ArcticConfig, self.config) - try: - if config.tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if config.hidden_act not in ["silu"]: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={config.hidden_act}'", - valid_options=["silu"], - ) - if config.moe_layer_frequency != 1: - raise NotSupportedCheckpointError( - invalid_option=f"'moe_layer_frequency={config.moe_layer_frequency}'", - valid_options=[1], - ) - if not config.parallel_attn_mlp_res: - raise NotSupportedCheckpointError( - invalid_option=f"'parallel_attn_mlp_res={config.parallel_attn_mlp_res}'", - valid_options=[True], - ) - - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - @property - def 
model_type(self) -> str: - """Model type.""" - return "arctic" - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before Arctic's transformer block number.""" - return "model.layers." - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in Arctic.""" - return cast(ArcticConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in Arctic.""" - return cast(ArcticConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in Arctic.""" - return cast(ArcticConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in Arctic.""" - config = cast(ArcticConfig, self.config) - if config.num_key_value_heads is None: - return self.decoder_num_attention_heads - return config.num_key_value_heads - - @property - def decoder_head_size(self) -> int: - """The head size of Arctic.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in Arctic MLP.""" - return cast(ArcticConfig, self.config).intermediate_size - - @property - def rotary_dim(self) -> int: - """The rotary embedding dimension of Arctic.""" - return self.decoder_head_size - - @property - def rotary_emb_base(self) -> float: - """The rotary embedding base of Arctic.""" - return cast(ArcticConfig, self.config).rope_theta - - @property - def num_experts(self) -> int: - """The number of moe experts per transformer block in Arctic.""" - return cast(ArcticConfig, self.config).num_local_experts - - @property - def num_selected_moe_experts(self) -> int: - """The number of selected moe experts per transformer block in Arctic.""" - return cast(ArcticConfig, self.config).num_experts_per_tok diff --git a/friendli/modules/converter/models/blenderbot.py b/friendli/modules/converter/models/blenderbot.py deleted file mode 100644 index 224ded48..00000000 --- a/friendli/modules/converter/models/blenderbot.py +++ /dev/null @@ -1,472 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
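As a side note on the Arctic MoE attributes above (`num_local_experts` and `num_experts_per_tok`, exposed by the converter as `num_experts` and `num_selected_moe_experts`): the sketch below is a minimal, illustrative top-k router, not code from this repository, intended only to show what those two numbers control.

```python
import torch

# Illustrative values mirroring the ArcticConfig defaults shown above.
num_local_experts = 8      # experts per sparse MLP layer
num_experts_per_tok = 2    # top-k experts selected for each token

hidden_size = 16
tokens = torch.randn(4, hidden_size)                      # 4 tokens
router = torch.nn.Linear(hidden_size, num_local_experts, bias=False)

logits = router(tokens)                                   # (4, num_local_experts)
weights, expert_ids = torch.topk(
    logits.softmax(dim=-1), k=num_experts_per_tok, dim=-1
)
weights = weights / weights.sum(dim=-1, keepdim=True)     # renormalize selected experts

print(expert_ids)  # which experts each token is routed to
print(weights)     # per-token mixing weights for the selected experts
```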
- -"""Friendli Blenderbot Checkpoint Converter.""" - -from __future__ import annotations - -import math -from typing import Any, Dict, List, cast - -import numpy as np -import torch -from transformers import BlenderbotConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - ENCODER_PREFIX, - SUPPORTED_GELU_FAMILY, - EncoderDecoderConverter, -) -from friendli.modules.converter.schema import ConvertInfo - - -class BlenderbotConverter(EncoderDecoderConverter): - """BlenderbotForConditionalGeneration Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if Blenderbot architectures's config can be converted to Friendli format.""" - super().check_config() - config = cast(BlenderbotConfig, self.config) - try: - if config.activation_function not in SUPPORTED_GELU_FAMILY: - raise NotSupportedCheckpointError( - invalid_option="'activation_function=" - f"{cast(BlenderbotConfig, self.config).activation_function}'", - valid_options=SUPPORTED_GELU_FAMILY, - ) - if not config.tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=False'", - valid_options=[True], - ) - if self.encoder_num_attention_heads != self.decoder_num_attention_heads: - raise NotSupportedCheckpointError( - invalid_option=( - f"encoder_num_attention_heads={self.encoder_num_attention_heads} " - f"decoder_num_attention_heads={self.decoder_num_attention_heads}" - ), - valid_options=[ - "encoder_num_attention_heads == decoder_num_attention_heads" - ], - ) - if config.decoder_ffn_dim != config.encoder_ffn_dim: - raise NotSupportedCheckpointError( - invalid_option=( - f"encoder_ffn_dim={config.encoder_ffn_dim} " - f"decoder_ffn_dim={config.decoder_ffn_dim}" - ), - valid_options=["encoder_ffn_dim == decoder_ffn_dim"], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def token_embed_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """Reshape token embedding weight for Blenderbot's embedding layer.""" - assert len(params) == 1 - embed_dim = cast(BlenderbotConfig, self.config).d_model - embed_scale = ( - math.sqrt(embed_dim) - if cast(BlenderbotConfig, self.config).scale_embedding - else 1.0 - ) - embed_weight = params[0] - embed_weight = embed_weight * embed_scale - return embed_weight - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(BlenderbotConfig, self.config) - - logger.warn( - "Since Blenderbot uses absolute position embedding, 'max_input_length' and " - "'max_output_length' cannot be larger than %d.", - config.max_position_embeddings, - ) - - eos_token_id = self.get_eos_token_id() - decoder_start_token_id = self.get_decoder_start_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.encoder_head_size, - "num_heads": self.encoder_num_attention_heads, - "hidden_size": self.encoder_hidden_size, - "ff_intermediate_size": self.decoder_ff_intermediate_size, - "num_encoder_layers": self.encoder_layer_num, - "num_decoder_layers": self.decoder_layer_num, - "max_input_length": config.max_position_embeddings, - "max_output_length": config.max_position_embeddings, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "decoder_start_token": ( - decoder_start_token_id - if decoder_start_token_id 
is not None - else "FILL ME" - ), - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "blenderbot" - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in Blenderbot.""" - return [ - ConvertInfo( - param_names=["model.shared.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["model.shared.weight"], - data_type=self.data_type, - converted_name="head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ), - ConvertInfo( - param_names=["model.encoder.embed_positions.weight"], - data_type=self.data_type, - converted_name=f"{ENCODER_PREFIX}/wpe/weight:0", - reshape_fn=self.pos_embed_weight_reshape, - ), - ConvertInfo( - param_names=["model.decoder.embed_positions.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/wpe/weight:0", - reshape_fn=self.pos_embed_weight_reshape, - ), - ConvertInfo( - param_names=["model.encoder.layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{ENCODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["model.encoder.layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{ENCODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=["model.decoder.layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["model.decoder.layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ] - - @property - def encoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in Blenderbot's encoder.""" - convert_info_list = [] - for i in range(self.encoder_layer_num): - layer_prefix = f"{self.encoder_layer_prefix}{i}." 
- converted_prefix = f"{ENCODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}self_attn_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.weight", - f"{layer_prefix}self_attn.k_proj.weight", - f"{layer_prefix}self_attn.v_proj.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.bias", - f"{layer_prefix}self_attn.k_proj.bias", - f"{layer_prefix}self_attn.v_proj.bias", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/bias:0", - reshape_fn=self.qkv_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn.out_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn.out_proj.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}final_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}final_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc1.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc1.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc2.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc2.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ] - ) - return convert_info_list - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in Blenderbot's decoder.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}self_attn_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.weight", - f"{layer_prefix}self_attn.k_proj.weight", - f"{layer_prefix}self_attn.v_proj.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.bias", - f"{layer_prefix}self_attn.k_proj.bias", - f"{layer_prefix}self_attn.v_proj.bias", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/bias:0", - reshape_fn=self.qkv_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn.out_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn.out_proj.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}encoder_attn_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}encoder_attn_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}final_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_3/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}final_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_3/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}encoder_attn.q_proj.weight", - f"{layer_prefix}encoder_attn.k_proj.weight", - f"{layer_prefix}encoder_attn.v_proj.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}cross_attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}encoder_attn.q_proj.bias", - f"{layer_prefix}encoder_attn.k_proj.bias", - f"{layer_prefix}encoder_attn.v_proj.bias", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}cross_attn/c_attn/bias:0", - reshape_fn=self.qkv_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}encoder_attn.out_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}cross_attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}encoder_attn.out_proj.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}cross_attn/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc1.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - 
param_names=[f"{layer_prefix}fc1.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc2.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc2.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ] - ) - return convert_info_list - - @property - def encoder_layer_prefix(self) -> str: - """The layer name prefix used before Blenderbot encoder's transformer block number.""" - return "model.encoder.layers." - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before Blenderbot decoder's transformer block number.""" - return "model.decoder.layers." - - @property - def encoder_layer_num(self) -> int: - """The number of transformer blocks in Blenderbot encoder.""" - return cast(BlenderbotConfig, self.config).encoder_layers - - @property - def encoder_hidden_size(self) -> int: - """The hidden size of Blenderbot encoder.""" - return cast(BlenderbotConfig, self.config).d_model - - @property - def encoder_num_attention_heads(self) -> int: - """The number of attention heads of Blenderbot encoder.""" - return cast(BlenderbotConfig, self.config).encoder_attention_heads - - @property - def encoder_head_size(self) -> int: - """The size of each attention head of Blenderbot encoder.""" - return self.encoder_hidden_size // self.encoder_num_attention_heads - - @property - def encoder_ff_intermediate_size(self) -> int: - """The intermediate of the linear layer in Blenderbot encoder's MLP.""" - return cast(BlenderbotConfig, self.config).encoder_ffn_dim - - @property - def decoder_layer_num(self) -> int: - """The number of transformer blocks in Blenderbot decoder.""" - return cast(BlenderbotConfig, self.config).decoder_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size of Blenderbot decoder.""" - return cast(BlenderbotConfig, self.config).d_model - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads of Blenderbot decoder.""" - return cast(BlenderbotConfig, self.config).decoder_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads of blenderbot decoder.""" - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The size of each attention head of Blenderbot decoder.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate of the linear layer in Blenderbot decoder's MLP.""" - return cast(BlenderbotConfig, self.config).decoder_ffn_dim diff --git a/friendli/modules/converter/models/bloom.py b/friendli/modules/converter/models/bloom.py deleted file mode 100644 index 7ce615ad..00000000 --- a/friendli/modules/converter/models/bloom.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Bloom Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -import numpy as np -import torch -from transformers import BloomConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import DECODER_PREFIX, DecoderOnlyConverter -from friendli.modules.converter.schema import ConvertInfo - - -class BloomForCausalLMConverter(DecoderOnlyConverter): - """BloomForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if Bloom architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if cast(BloomConfig, self.config).apply_residual_connection_post_layernorm: - raise NotSupportedCheckpointError( - invalid_option="apply_residual_connection_post_layernorm=True", - valid_options=[False], - ) - if cast(BloomConfig, self.config).slow_but_exact: - raise NotSupportedCheckpointError( - invalid_option="slow_but_exact=True", valid_options=[False] - ) - if not cast(BloomConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="tie_word_embeddings=False", valid_options=[True] - ) - if cast(BloomConfig, self.config).layer_norm_epsilon != 1e-5: - raise NotSupportedCheckpointError( - invalid_option="layer_norm_epsilon=" - f"{cast(BloomConfig, self.config).layer_norm_epsilon}", - valid_options=[1e-5], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def qkv_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """qkv_weight_reshape for Bloom's attention layer.""" - assert len(params) == 1 - qkv_weight = params[0] - split_qkv_weight_list = torch.split(qkv_weight, self.decoder_head_size, dim=0) - qkv_weight_list = [ - torch.cat( - [ - split_qkv_weight_list[j * 3 + i] - for j in range(self.decoder_num_attention_heads) - ], - dim=0, - ).reshape(-1, self.decoder_hidden_size) - for i in range(3) - ] - - qkv_weight = torch.cat(qkv_weight_list, dim=0).transpose(0, 1) - return qkv_weight - - def qkv_bias_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """qkv_bias_reshape for Bloom's attention layer.""" - assert len(params) == 1 - qkv_bias = params[0] - split_qkv_bias_list = torch.split(qkv_bias, self.decoder_head_size, dim=0) - qkv_bias_list = [ - torch.cat( - [ - split_qkv_bias_list[j * 3 + i] - for j in range(self.decoder_num_attention_heads) - ], - dim=0, - ) - for i in range(3) - ] - - qkv_bias = torch.cat(qkv_bias_list, dim=0) - return qkv_bias - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(BloomConfig, self.config) - - logger.warn( - "The 'max_length' field is left blank as it cannot be automatically configured. " - "You must determine the 'max_length' according to your needs. The Bloom model does " - "not rely on absolute position embeddings, allowing you to choose any " - "suitable value." 
- ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "num_heads": self.decoder_num_attention_heads, - "num_layers": self.decoder_layer_num, - "max_length": "FILL ME", - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "bloom" - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in Bloom.""" - return [ - ConvertInfo( - param_names=["transformer.word_embeddings.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.word_embeddings_layernorm.weight"], - data_type=self.data_type, - converted_name="wte/ln/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.word_embeddings_layernorm.bias"], - data_type=self.data_type, - converted_name="wte/ln/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ] - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in Bloom.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}input_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}input_layernorm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attention.query_key_value.bias" - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/bias:0", - reshape_fn=self.qkv_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attention.dense.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}post_attention_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}post_attention_layernorm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_h_to_4h.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_4h_to_h.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attention.query_key_value.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attention.dense.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_h_to_4h.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_4h_to_h.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - return convert_info_list - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before Bloom's transformer block number.""" - return "transformer.h." 
- - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in Bloom.""" - return cast(BloomConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """Return the hidden size in Bloom.""" - return cast(BloomConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in Bloom.""" - return cast(BloomConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in bloom.""" - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The size of each attention head in Bloom.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in Bloom MLP.""" - return self.decoder_hidden_size * 4 diff --git a/friendli/modules/converter/models/codegen.py b/friendli/modules/converter/models/codegen.py deleted file mode 100644 index a6f1ef03..00000000 --- a/friendli/modules/converter/models/codegen.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli CodeGen Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -import torch -from transformers import CodeGenConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - SUPPORTED_GELU_FAMILY, - DecoderOnlyConverter, -) -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface -from friendli.modules.converter.schema import ConvertInfo - - -class CodegenForCausalLMConverter( - DecoderOnlyConverter, RotaryEmbeddingConversionInterface -): - """CodegenForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if CodeGen architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if ( - cast(CodeGenConfig, self.config).activation_function - not in SUPPORTED_GELU_FAMILY - ): - raise NotSupportedCheckpointError( - invalid_option="'activation_function=" - f"{cast(CodeGenConfig, self.config).activation_function}'", - valid_options=SUPPORTED_GELU_FAMILY, - ) - if cast(CodeGenConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if cast(CodeGenConfig, self.config).layer_norm_epsilon != 1e-5: - raise NotSupportedCheckpointError( - invalid_option="'layer_norm_epsilon=" - f"{cast(CodeGenConfig, self.config).layer_norm_epsilon}'", - valid_options=[1e-5], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def qkv_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """qkv_weight_reshape for CodeGen's attention layer.""" - assert len(params) == 1 - original_qkv_weight = params[0] - reshaped_qkv_weight = original_qkv_weight.reshape( - (4, original_qkv_weight.size(0) // 4, original_qkv_weight.size(1)) - ) - q_weight, v_weight, k_weight = torch.split( - reshaped_qkv_weight, reshaped_qkv_weight.size(1) // 3, dim=1 - ) - q_weight = q_weight.reshape((-1, q_weight.size(2))) - k_weight = k_weight.reshape((-1, k_weight.size(2))) - v_weight = v_weight.reshape((-1, 
v_weight.size(2))) - - qkv_weight = torch.cat((q_weight, k_weight, v_weight), dim=0) - qkv_weight = qkv_weight.transpose(0, 1) - - return qkv_weight - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(CodeGenConfig, self.config) - - logger.info( - "The generated attributes set 'max_length' to %d, but you can change the " - "'max_length' according to your needs. The CodeGen model does not rely on " - "absolute position embeddings, allowing you to choose any suitable value.", - config.n_positions, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "rotary_dim": self.rotary_dim, - "num_heads": self.decoder_num_attention_heads, - "num_layers": self.decoder_layer_num, - "max_length": config.n_positions, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "rope_theta": self.rotary_emb_base, - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "gpt-j" - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in CodeGen.""" - return [ - ConvertInfo( - param_names=["transformer.wte.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=["lm_head.weight"], - data_type=self.data_type, - converted_name="head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ), - ConvertInfo( - param_names=["lm_head.bias"], - data_type=self.data_type, - converted_name="head_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ] - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in CodeGen.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}ln_1.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_1.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc_in.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc_out.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.qkv_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.out_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc_in.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc_out.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - return convert_info_list - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before CodeGen's transformer block number.""" - return "transformer.h." - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in CodeGen.""" - return cast(CodeGenConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in CodeGen.""" - return cast(CodeGenConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in CodeGen.""" - return cast(CodeGenConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in the codegen.""" - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The head siez of CodeGen.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in codegen MLP.""" - return self.decoder_hidden_size * 4 - - @property - def rotary_dim(self) -> int: - """The rotary dim in CodeGen.""" - return cast(CodeGenConfig, self.config).rotary_dim - - @property - def rotary_emb_base(self) -> float: - """The rotary emb base in CodeGen.""" - return 10000.0 diff --git a/friendli/modules/converter/models/cohere.py b/friendli/modules/converter/models/cohere.py deleted file mode 100644 index 47217e23..00000000 --- a/friendli/modules/converter/models/cohere.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Cohere Checkpoint Converter.""" - - -from __future__ import annotations - -from typing import cast - -from transformers import CohereConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.modules.converter.base import FP8OnlyConverter -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface -from friendli.modules.converter.schema import ConvertInfo - - -class CohereForCausalLMConverter(FP8OnlyConverter, RotaryEmbeddingConversionInterface): - """CohereForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if LLaMA architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if cast(CohereConfig, self.config).hidden_act not in ["silu"]: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={cast(CohereConfig, self.config).hidden_act}'", - valid_options=["silu"], - ) - if not cast(CohereConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=False'", - valid_options=[True], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - @property - def model_type(self) -> str: - """Model type.""" - return "cohere" - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before LLaMA's transformer block number.""" - return "model.layers." - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in LLaMA.""" - return cast(CohereConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in LLaMA.""" - return cast(CohereConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in LLaMA.""" - return cast(CohereConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in LLaMA.""" - config = cast(CohereConfig, self.config) - if config.num_key_value_heads is None: - return self.decoder_num_attention_heads - return config.num_key_value_heads - - @property - def decoder_head_size(self) -> int: - """The head size of LLaMA.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in LLaMA MLP.""" - return self.config.intermediate_size - - @property - def rotary_dim(self) -> int: - """The rotary embedding dimension of LLaMA.""" - return self.decoder_head_size - - @property - def rotary_emb_base(self) -> float: - """The rotary embedding base of LLaMA.""" - return cast(CohereConfig, self.config).rope_theta diff --git a/friendli/modules/converter/models/dbrx.py b/friendli/modules/converter/models/dbrx.py deleted file mode 100644 index 88c9094f..00000000 --- a/friendli/modules/converter/models/dbrx.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Dbrx Checkpoint Converter.""" - - -from __future__ import annotations - -from typing import cast - -from transformers.models.dbrx.configuration_dbrx import ( # type: ignore[import] - DbrxConfig, - DbrxFFNConfig, -) - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.modules.converter.base import FP8OnlyConverter -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface - - -class DbrxForCausalLMConverter(FP8OnlyConverter, RotaryEmbeddingConversionInterface): - """DbrxForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if Dbrx architectures' config can be converted to Friendli format.""" - super().check_config() - config = cast(DbrxConfig, self.config) - try: - if config.tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if config.ffn_config.moe_top_k not in [1, 2, 4]: - raise NotSupportedCheckpointError( - invalid_option=f"'moe_top_k={config.ffn_config.moe_top_k}'", - valid_options=[1, 2, 4], - ) - if config.ffn_config.moe_num_experts not in [1, 2, 4, 8, 16]: - raise NotSupportedCheckpointError( - invalid_option=f"'moe_num_experts={config.ffn_config.moe_num_experts}'", - valid_options=[1, 2, 4, 8, 16], - ) - - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - @property - def model_type(self) -> str: - """Model type.""" - return "dbrx" - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before LLaMA's transformer block number.""" - return "transformer.blocks." - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in LLaMA.""" - return cast(DbrxConfig, self.config).n_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in LLaMA.""" - return cast(DbrxConfig, self.config).d_model - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in LLaMA.""" - return cast(DbrxConfig, self.config).n_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in LLaMA.""" - config = cast(DbrxConfig, self.config) - if config.attn_config.kv_n_heads is None: - return self.decoder_num_attention_heads - return config.attn_config.kv_n_heads - - @property - def decoder_head_size(self) -> int: - """The head size of LLaMA.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in LLaMA MLP.""" - return cast(DbrxConfig, self.config).ffn_config.ffn_hidden_size - - @property - def rotary_dim(self) -> int: - """The rotary embedding dimension of LLaMA.""" - return self.decoder_head_size - - @property - def rotary_emb_base(self) -> float: - """The rotary embedding base of LLaMA.""" - return cast(DbrxConfig, self.config).attn_config.rope_theta diff --git a/friendli/modules/converter/models/falcon.py b/friendli/modules/converter/models/falcon.py deleted file mode 100644 index 3dfdede2..00000000 --- a/friendli/modules/converter/models/falcon.py +++ /dev/null @@ -1,329 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Falcon Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -import torch -from transformers import FalconConfig # type: ignore[import] - -from friendli.errors import NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import DECODER_PREFIX, DecoderOnlyConverter -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import convert_to_gpt_j_params - - -class FalconForCausalLMConverter( - DecoderOnlyConverter, RotaryEmbeddingConversionInterface -): - """FalconForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if Falcon architectures' config can be converted to Friendli format.""" - super().check_config() - config = cast(FalconConfig, self.config) - - if config.layer_norm_epsilon != 1e-5: - raise NotSupportedCheckpointError( - invalid_option=f"'layer_norm_epsilon={config.layer_norm_epsilon}'", - valid_options=[1e-5], - ) - - if config.alibi: - raise NotSupportedCheckpointError( - invalid_option=f"'alibi'={config.alibi}'", - valid_options=[False], - ) - - if not config.rotary: - raise NotSupportedCheckpointError( - invalid_option=f"'rotary'={config.rotary}'", - valid_options=[True], - ) - - if config.bias: - raise NotSupportedCheckpointError( - invalid_option=f"'bias'={config.bias}'", - valid_options=[False], - ) - - if not config.new_decoder_architecture and not config.parallel_attn: - raise NotSupportedCheckpointError( - invalid_option=( - f"'new_decoder_architecture'={config.new_decoder_architecture}" - f"'parallel_attn'={config.parallel_attn}" - ), - valid_options=[ - "'new_decoder_architecture'=True", - "'new_decoder_architecture'=False, 'parallel_attn'=True", - ], - ) - - def qkv_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """qkv_weight_reshape for Falcon's attention layer.""" - assert len(params) == 1 - qkv_weight = params[0] - - num_queries_per_kv = ( - self.decoder_num_attention_heads // self.decoder_num_kv_attention_heads - ) - - qkv_weight = qkv_weight.reshape( - self.decoder_num_kv_attention_heads, - num_queries_per_kv + 2, - self.decoder_head_size, - self.decoder_hidden_size, - ) - - q_weight = qkv_weight[:, :num_queries_per_kv].reshape( - self.decoder_num_kv_attention_heads * num_queries_per_kv, - self.decoder_head_size, - self.decoder_hidden_size, - ) - k_weight = qkv_weight[:, [-2]].reshape( - self.decoder_num_kv_attention_heads, - self.decoder_head_size, - self.decoder_hidden_size, - ) - v_weight = qkv_weight[:, [-1]].reshape( - self.decoder_num_kv_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - - q_weight = convert_to_gpt_j_params(q_weight, self.rotary_dim) - k_weight = convert_to_gpt_j_params(k_weight, self.rotary_dim) - - q_weight = q_weight.reshape( - self.decoder_num_kv_attention_heads - * num_queries_per_kv - * self.decoder_head_size, - self.decoder_hidden_size, - ) - k_weight = k_weight.reshape( - self.decoder_num_kv_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - - qkv_weight = torch.cat((q_weight, k_weight, v_weight), dim=0) - qkv_weight = qkv_weight.transpose(0, 1) - - return qkv_weight - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(FalconConfig, self.config) - - logger.warn( - "The 'max_length' field is left blank as it cannot be 
automatically configured. " - "You must determine the 'max_length' according to your needs. The Falcon model does " - "not rely on absolute position embeddings, allowing you to choose any " - "suitable value." - ) - - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "rotary_dim": self.rotary_dim, - "num_heads": self.decoder_num_attention_heads, - "num_kv_heads": self.decoder_num_kv_attention_heads, - "num_layers": self.decoder_layer_num, - "max_length": "FILL ME", - "vocab_size": config.vocab_size, - "eos_token": self.get_eos_token_id() or "FILL ME", - "rope_theta": self.rotary_emb_base, - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - if cast(FalconConfig, self.config).new_decoder_architecture: - return "falcon" - return "falcon-7b" - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in Falcon.""" - return [ - ConvertInfo( - param_names=["transformer.word_embeddings.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=["lm_head.weight"], - data_type=self.data_type, - converted_name="head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ), - ] - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in Falcon.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attention.query_key_value.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attention.dense.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_h_to_4h.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_4h_to_h.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - - if cast(FalconConfig, self.config).new_decoder_architecture: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}ln_attn.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_attn.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_mlp.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_mlp.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ] - ) - else: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}input_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}input_layernorm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ] - ) - - return convert_info_list - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before the Falcon's transformer block number.""" - return "transformer.h." 
- - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in Falcon.""" - return cast(FalconConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in Falcon.""" - return cast(FalconConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in Falcon.""" - return cast(FalconConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in Falcon.""" - config = cast(FalconConfig, self.config) - - if config.new_decoder_architecture: - if config.num_kv_heads is not None: - return config.num_kv_heads - return config.num_attention_heads - - if config.multi_query: - return 1 - - if config.num_kv_heads is not None: - return config.num_kv_heads - return config.num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The head size of Falcon.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in Falcon MLP.""" - return self.decoder_hidden_size * 4 - - @property - def rotary_dim(self) -> int: - """The rotary embedding dimension of Falcon.""" - return self.decoder_head_size - - @property - def rotary_emb_base(self) -> float: - """The rotary embedding base of Falcon.""" - return cast(FalconConfig, self.config).rope_theta diff --git a/friendli/modules/converter/models/gpt2.py b/friendli/modules/converter/models/gpt2.py deleted file mode 100644 index d2e2de5d..00000000 --- a/friendli/modules/converter/models/gpt2.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved.
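Both the Falcon and GPT-NeoX converters call `convert_to_gpt_j_params` to move rotary weights from NeoX-style interleaved ordering to GPT-J-style half-split ordering. The helper's implementation is not shown in this diff; the sketch below is one common way such a permutation is written and is meant only to illustrate the idea, not to reproduce the repository's helper.

```python
import torch

def interleaved_to_half_split(x: torch.Tensor, rotary_dim: int) -> torch.Tensor:
    """Reorder the rotary slice of a per-head weight from interleaved pairs
    (x0, x1, x2, x3, ...) to half-split ordering ((x0, x2, ...), (x1, x3, ...)).
    `x` has shape (num_heads, head_size, hidden); only the first rotary_dim rows
    of each head participate in RoPE."""
    rot, rest = x[:, :rotary_dim], x[:, rotary_dim:]
    rot = torch.cat([rot[:, 0::2], rot[:, 1::2]], dim=1)
    return torch.cat([rot, rest], dim=1)

w = torch.randn(4, 8, 32)                                  # 4 heads, head_size 8, hidden 32
print(interleaved_to_half_split(w, rotary_dim=8).shape)    # torch.Size([4, 8, 32])
```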
- -"""Friendli GPT2 Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -import torch -from transformers import GPT2Config # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - SUPPORTED_GELU_FAMILY, - DecoderOnlyConverter, -) -from friendli.modules.converter.schema import ConvertInfo - - -class GPT2LMHeadModelConverter(DecoderOnlyConverter): - """GPT2LMHeadModel Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if GPT2 architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if ( - cast(GPT2Config, self.config).activation_function - not in SUPPORTED_GELU_FAMILY - ): - raise NotSupportedCheckpointError( - invalid_option="'activation_function=" - f"{cast(GPT2Config, self.config).activation_function}'", - valid_options=SUPPORTED_GELU_FAMILY, - ) - if cast(GPT2Config, self.config).scale_attn_by_inverse_layer_idx: - raise NotSupportedCheckpointError( - invalid_option="'scale_attn_by_inverse_layer_idx=True'", - valid_options=[False], - ) - if not cast(GPT2Config, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=False'", - valid_options=[True], - ) - if cast(GPT2Config, self.config).layer_norm_epsilon != 1e-5: - raise NotSupportedCheckpointError( - invalid_option="'layer_norm_epsilon=" - f"{cast(GPT2Config, self.config).layer_norm_epsilon}'", - valid_options=[1e-5], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(GPT2Config, self.config) - - logger.warn( - "Since GPT2 uses absolute position embedding, 'max_length' cannot be " - "larger than %d.", - config.n_positions, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "num_heads": self.decoder_num_attention_heads, - "num_layers": self.decoder_layer_num, - "max_length": config.n_positions, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "gpt" - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in GPT2.""" - return [ - ConvertInfo( - param_names=["transformer.wte.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.wpe.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/wpe/weight:0", - reshape_fn=self.pos_embed_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ] - - def linear_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """Reshape linear weight in GPT2, which does not need weight transpose.""" - assert len(params) == 1 - return 
params[0] - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in GPT2.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}ln_1.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_1.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.c_attn.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.c_proj.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_2.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_2.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.c_fc.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.c_proj.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.c_attn.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.c_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.c_fc.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.c_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - return convert_info_list - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before GPT2's transformer block number.""" - return "transformer.h." 
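The GPT2 converter's `linear_weight_reshape` above returns the weight unchanged because Hugging Face GPT-2 uses `Conv1D` modules, whose weights are already stored as `(in_features, out_features)`, the opposite of `nn.Linear`. A quick check, assuming a recent `transformers` release where `Conv1D` lives in `transformers.pytorch_utils`:

```python
from torch import nn
from transformers.pytorch_utils import Conv1D

linear = nn.Linear(4, 8)
conv1d = Conv1D(nf=8, nx=4)   # nf = out_features, nx = in_features

print(linear.weight.shape)    # torch.Size([8, 4])  -> would need a transpose
print(conv1d.weight.shape)    # torch.Size([4, 8])  -> already in the desired orientation
```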
- - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in GPT2.""" - return cast(GPT2Config, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in GPT2.""" - return cast(GPT2Config, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in GPT2.""" - return cast(GPT2Config, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in gpt2.""" - return self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in GPT2 MLP.""" - return self.decoder_hidden_size * 4 - - @property - def decoder_head_size(self) -> int: - """The head size of GPT2.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads diff --git a/friendli/modules/converter/models/gpt_neox.py b/friendli/modules/converter/models/gpt_neox.py deleted file mode 100644 index 47fe88a1..00000000 --- a/friendli/modules/converter/models/gpt_neox.py +++ /dev/null @@ -1,328 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli GPT NeoX Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -import torch -from transformers import GPTNeoXConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - SUPPORTED_GELU_FAMILY, - DecoderOnlyConverter, -) -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import convert_to_gpt_j_params - - -class GPTNeoXForCausalLMConverter( - DecoderOnlyConverter, RotaryEmbeddingConversionInterface -): - """GPTNeoXForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if GPTNeoX architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if cast(GPTNeoXConfig, self.config).hidden_act not in SUPPORTED_GELU_FAMILY: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={cast(GPTNeoXConfig, self.config).hidden_act}'", - valid_options=SUPPORTED_GELU_FAMILY, - ) - if not cast(GPTNeoXConfig, self.config).use_parallel_residual: - raise NotSupportedCheckpointError( - invalid_option="'use_parallel_residual=False'", - valid_options=[True], - ) - if cast(GPTNeoXConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if cast(GPTNeoXConfig, self.config).layer_norm_eps != 1e-5: - raise NotSupportedCheckpointError( - invalid_option="'layer_norm_eps=" - f"{cast(GPTNeoXConfig, self.config).layer_norm_eps}'", - valid_options=[1e-5], - ) - if cast(GPTNeoXConfig, self.config).rotary_emb_base != 10000: - raise NotSupportedCheckpointError( - invalid_option=( - f"'rotary_emb_base={cast(GPTNeoXConfig, self.config).rotary_emb_base}'" - ), - valid_options=[10000], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def qkv_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """qkv_weight_reshape for GPTNeoX's attention layer.""" - assert len(params) == 1 - qkv_weight = params[0] -
qkv_weight = qkv_weight.reshape( - self.decoder_num_attention_heads, - 3, - self.decoder_head_size, - self.decoder_hidden_size, - ) - - q_weight = qkv_weight[:, 0].reshape( - self.decoder_num_attention_heads, - self.decoder_head_size, - self.decoder_hidden_size, - ) - k_weight = qkv_weight[:, 1].reshape( - self.decoder_num_attention_heads, - self.decoder_head_size, - self.decoder_hidden_size, - ) - v_weight = qkv_weight[:, 2].reshape( - self.decoder_num_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - - q_weight = convert_to_gpt_j_params(param=q_weight, rotary_dim=self.rotary_dim) - k_weight = convert_to_gpt_j_params(param=k_weight, rotary_dim=self.rotary_dim) - q_weight = q_weight.reshape( - self.decoder_num_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - k_weight = k_weight.reshape( - self.decoder_num_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - - qkv_weight = torch.cat((q_weight, k_weight, v_weight), dim=0) - qkv_weight = qkv_weight.transpose(0, 1) - - return qkv_weight - - def qkv_bias_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """qkv_bias_reshape for GPTNeoX's attention layer.""" - assert len(params) == 1 - qkv_bias = params[0] - qkv_bias = qkv_bias.reshape( - self.decoder_num_attention_heads, - 3, - self.decoder_head_size, - ) - - q_bias = qkv_bias[:, 0].reshape( - self.decoder_num_attention_heads, self.decoder_head_size - ) - k_bias = qkv_bias[:, 1].reshape( - self.decoder_num_attention_heads, self.decoder_head_size - ) - v_bias = qkv_bias[:, 2].reshape( - self.decoder_num_attention_heads * self.decoder_head_size - ) - - q_bias = convert_to_gpt_j_params(q_bias, self.rotary_dim).flatten() - k_bias = convert_to_gpt_j_params(k_bias, self.rotary_dim).flatten() - - qkv_bias = torch.cat((q_bias, k_bias, v_bias), dim=0) - return qkv_bias - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(GPTNeoXConfig, self.config) - - logger.info( - "The generated attributes set 'max_length' to %d, but you can change the " - "'max_length' according to your needs. 
The GPTNeoX model does not rely on " - "absolute position embeddings, allowing you to choose any suitable value.", - config.max_position_embeddings, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "rotary_dim": self.rotary_dim, - "num_heads": self.decoder_num_attention_heads, - "num_layers": self.decoder_layer_num, - "max_length": config.max_position_embeddings, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "rope_theta": self.rotary_emb_base, - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "gpt-neox" - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in GPTNeoX.""" - return [ - ConvertInfo( - param_names=["gpt_neox.embed_in.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["gpt_neox.final_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["gpt_neox.final_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=["embed_out.weight"], - data_type=self.data_type, - converted_name="head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ), - ] - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in GPTNeoX.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}input_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}input_layernorm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attention.query_key_value.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/bias:0", - reshape_fn=self.qkv_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attention.dense.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}post_attention_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}post_attention_layernorm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_h_to_4h.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_4h_to_h.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attention.query_key_value.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attention.dense.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_h_to_4h.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.dense_4h_to_h.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - return convert_info_list - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before GPTNeoX's transformer block number.""" - return "gpt_neox.layers." 
- - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in GPTNeoX.""" - return cast(GPTNeoXConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in GPTNeoX.""" - return cast(GPTNeoXConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in GPTNeoX.""" - return cast(GPTNeoXConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in gpt_neox.""" - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The head size of GPTNeoX.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in GPTNeoX MLP.""" - return self.decoder_hidden_size * 4 - - @property - def rotary_dim(self) -> int: - """The rotary embedding dimension of GPTNeoX.""" - return int(self.decoder_head_size * cast(GPTNeoXConfig, self.config).rotary_pct) - - @property - def rotary_emb_base(self) -> float: - """The rotary embedding base of GPTNeoX.""" - return float(cast(GPTNeoXConfig, self.config).rotary_emb_base) diff --git a/friendli/modules/converter/models/gptj.py b/friendli/modules/converter/models/gptj.py deleted file mode 100644 index 7e0c464f..00000000 --- a/friendli/modules/converter/models/gptj.py +++ /dev/null @@ -1,430 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli GPTJ Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -import torch -from transformers import GPTJConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - SUPPORTED_GELU_FAMILY, - DecoderOnlyConverter, - DecoderOnlyLoraConverter, -) -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface -from friendli.modules.converter.schema import ConvertInfo - - -class GPTJForCausalLMLoraConverter(DecoderOnlyLoraConverter): - """GPTJForCausalLM LoRA Converter Class.""" - - @property - def adapter_target_module_map(self) -> Dict[str, str]: - """Return the dictionary that maps Hugging Face's module name to Friendli's module name.""" - return { - "q_proj": "query", - "k_proj": "key", - "v_proj": "value", - "out_proj": "attn_fc", - "fc_in": "ff1", - "fc_out": "ff2", - "wte": "wte", - } - - @property - def adapter_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for LoRA adapter modules in GPTJ.""" - convert_info_list = [] - target_modules = self.adapter_target_modules - - # Non-transformer modules - if "wte" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=["transformer.wte.lora_embedding_A.default"], - data_type=self.converter.data_type, - converted_name="wte/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.wte.lora_embedding_B.default"], - data_type=self.converter.data_type, - converted_name="wte/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - # Transformer modules - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.converter.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - - if "query" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.q_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/query_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.q_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/query_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "key" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.k_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/key_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.k_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/key_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "value" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.v_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/value_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.v_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/value_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "attn_fc" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.out_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_proj/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.out_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_proj/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "ff1" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.fc_in.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.fc_in.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "ff2" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.fc_out.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.fc_out.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - return convert_info_list - - -class GPTJForCausalLMConverter( - DecoderOnlyConverter, RotaryEmbeddingConversionInterface -): - 
"""GPTJForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if GPTJ architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if ( - cast(GPTJConfig, self.config).activation_function - not in SUPPORTED_GELU_FAMILY - ): - raise NotSupportedCheckpointError( - invalid_option="'activation_function=" - f"{cast(GPTJConfig, self.config).activation_function}'", - valid_options=SUPPORTED_GELU_FAMILY, - ) - if cast(GPTJConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if cast(GPTJConfig, self.config).layer_norm_epsilon != 1e-5: - raise NotSupportedCheckpointError( - invalid_option="'layer_norm_epsilon=" - f"{cast(GPTJConfig, self.config).layer_norm_epsilon}'", - valid_options=[1e-5], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def qkv_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """qkv_weight_reshape for GPTJ's attention layer.""" - assert len(params) == 3 - qkv_weight = torch.cat( - params, - dim=0, - ) - qkv_weight = qkv_weight.transpose(0, 1) - return qkv_weight - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(GPTJConfig, self.config) - - logger.info( - "The generated attributes set 'max_length' to %d, but you can change the " - "'max_length' according to your needs. The GPTJ model does not rely on " - "absolute position embeddings, allowing you to choose any suitable value.", - config.n_positions, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "rotary_dim": self.rotary_dim, - "num_heads": self.decoder_num_attention_heads, - "num_layers": self.decoder_layer_num, - "max_length": config.n_positions, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "rope_theta": self.rotary_emb_base, - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "gpt-j" - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in GPTJ.""" - return [ - ConvertInfo( - param_names=["transformer.wte.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.ln_f.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=["lm_head.weight"], - data_type=self.data_type, - converted_name="head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ), - ConvertInfo( - param_names=["lm_head.bias"], - data_type=self.data_type, - converted_name="head_fc/bias:0", - reshape_fn=self.head_weight_reshape, - ), - ] - - @property - def decoder_convert_info_list(self) -> List[ConvertInfo]: - """The list of conversion informations for transformer modules in GPTJ.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}ln_1.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_1.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc_in.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc_out.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.q_proj.weight", - f"{layer_prefix}attn.k_proj.weight", - f"{layer_prefix}attn.v_proj.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.out_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc_in.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc_out.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - - return convert_info_list - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before GPTJ's transformer module number.""" - return "transformer.h." - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in GPTJ.""" - return cast(GPTJConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in GPTJ.""" - return cast(GPTJConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in GPTJ.""" - return cast(GPTJConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in gpt-j.""" - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The head size of GPTJ.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in GPTJ MLP.""" - return self.decoder_hidden_size * 4 - - @property - def rotary_dim(self) -> int: - """The rotary dim in GPTJ.""" - return cast(GPTJConfig, self.config).rotary_dim - - @property - def rotary_emb_base(self) -> float: - """The rotary emb base in GPTJ.""" - return 10000.0 diff --git a/friendli/modules/converter/models/llama.py b/friendli/modules/converter/models/llama.py deleted file mode 100644 index 19381d5a..00000000 --- a/friendli/modules/converter/models/llama.py +++ /dev/null @@ -1,494 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli LLaMA Checkpoint Converter.""" - - -from __future__ import annotations - -from typing import Any, Dict, Iterable, List, Set, cast - -import torch -from transformers import LlamaConfig, LlamaForCausalLM # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - DecoderOnlyConverter, - DecoderOnlyLoraConverter, -) -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import convert_to_gpt_j_params - - -class LlamaForCausalLMLoraConverter(DecoderOnlyLoraConverter): - """LlamaForCausalLM LoRA Converter Class.""" - - def pre_convert( - self, - model: torch.nn.Module, - ) -> torch.nn.Module: - """Adjust the LoRA Adapter module's params in Llama before converting.""" - converter = cast(LlamaForCausalLMConverter, self.converter) - for layer in cast(LlamaForCausalLM, model).model.layers: - if "query" in self.adapter_target_modules: - query_b = layer.self_attn.q_proj.lora_B.default.weight - query_b = query_b.reshape( - converter.decoder_num_attention_heads, - converter.decoder_head_size, - -1, - ) - query_b = convert_to_gpt_j_params(query_b, converter.decoder_head_size) - query_b = query_b.reshape( - converter.decoder_num_attention_heads * converter.decoder_head_size, - -1, - ) - layer.self_attn.q_proj.lora_B.default.weight.data = query_b - - if "key" in self.adapter_target_modules: - key_b = layer.self_attn.k_proj.lora_B.default.weight - key_b = key_b.reshape( - converter.decoder_num_kv_attention_heads, - converter.decoder_head_size, - -1, - ) - key_b = convert_to_gpt_j_params(key_b, converter.decoder_head_size) - key_b = key_b.reshape( - converter.decoder_num_attention_heads * converter.decoder_head_size, - -1, - ) - layer.self_attn.k_proj.lora_B.default.weight.data = key_b - - return model - - @property - def adapter_target_module_map(self) -> Dict[str, str]: - """Return the dictionary that maps Hugging Face's module name to Friendli's module name.""" - return { - "q_proj": "query", - "k_proj": "key", - "v_proj": "value", - "o_proj": "attn_fc", - "up_proj": "ff1", - "gate_proj": "ff_gate", - "down_proj": "ff2", - "embed_tokens": "wte", - } - - @property - def adapter_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for LoRA adapter modules in Llama.""" - convert_info_list = [] - target_modules = self.adapter_target_modules - - # Non-transformer modules - if "wte" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=["model.embed_tokens.lora_embedding_A.default"], - data_type=self.converter.data_type, - converted_name="wte/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=["model.embed_tokens.lora_embedding_B.default"], - data_type=self.converter.data_type, - converted_name="wte/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - # Transformer modules - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.converter.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - assert self.adapter_config.target_modules is not None - - if "query" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/query_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/query_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "key" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.k_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/key_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.k_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/key_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "value" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.v_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/value_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.v_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/value_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "attn_fc" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.o_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_proj/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.o_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_proj/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "ff1" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.up_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.up_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "ff_gate" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.gate_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_gate/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.gate_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_gate/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "ff2" in target_modules: - 
convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.down_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.down_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - return convert_info_list - - -class LlamaForCausalLMConverter( - DecoderOnlyConverter, RotaryEmbeddingConversionInterface -): - """LlamaForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if LLaMA architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if cast(LlamaConfig, self.config).hidden_act not in ["silu"]: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={cast(LlamaConfig, self.config).hidden_act}'", - valid_options=["silu"], - ) - if cast(LlamaConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if cast(LlamaConfig, self.config).rms_norm_eps not in (1e-5, 1e-6): - raise NotSupportedCheckpointError( - invalid_option=f"'rms_norm_eps={cast(LlamaConfig, self.config).rms_norm_eps}'", - valid_options=[1e-5, 1e-6], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def qkv_weight_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """qkv_weight_reshape for LLaMA's attention layer.""" - assert len(params) == 3 - q_weight = params[0] - k_weight = params[1] - v_weight = params[2] - - q_weight = q_weight.reshape( - self.decoder_num_attention_heads, - self.decoder_head_size, - self.decoder_hidden_size, - ) - k_weight = k_weight.reshape( - self.decoder_num_kv_attention_heads, - self.decoder_head_size, - self.decoder_hidden_size, - ) - q_weight = convert_to_gpt_j_params(q_weight, self.rotary_dim) - k_weight = convert_to_gpt_j_params(k_weight, self.rotary_dim) - q_weight = q_weight.reshape( - self.decoder_num_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - k_weight = k_weight.reshape( - self.decoder_num_kv_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - - qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) - qkv_weight = qkv_weight.transpose(0, -1) - return qkv_weight - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(LlamaConfig, self.config) - - logger.info( - "The generated attributes set 'max_length' to %d, but you can change the " - "'max_length' according to your needs. 
The Llama model does not rely on " - "absolute position embeddings, allowing you to choose any suitable value.", - config.max_position_embeddings, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "rotary_dim": self.rotary_dim, - "num_heads": self.decoder_num_attention_heads, - "num_kv_heads": self.decoder_num_kv_attention_heads, - "num_layers": self.decoder_layer_num, - "ff_intermediate_size": self.decoder_ff_intermediate_size, - "max_length": config.max_position_embeddings, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "rope_theta": self.rotary_emb_base, - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "llama" - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in LLaMA.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}input_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.weight", - f"{layer_prefix}self_attn.k_proj.weight", - f"{layer_prefix}self_attn.v_proj.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn.o_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}post_attention_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.gate_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_gate/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.up_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.down_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - return convert_info_list - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in LLaMA.""" - return [ - ConvertInfo( - param_names=["model.embed_tokens.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["model.norm.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["lm_head.weight"], - data_type=self.data_type, - converted_name=f"head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ), - ] - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before LLaMA's transformer block 
number.""" - return "model.layers." - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in LLaMA.""" - return cast(LlamaConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in LLaMA.""" - return cast(LlamaConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in LLaMA.""" - return cast(LlamaConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in LLaMA.""" - config = cast(LlamaConfig, self.config) - if config.num_key_value_heads is None: - return self.decoder_num_attention_heads - return config.num_key_value_heads - - @property - def decoder_head_size(self) -> int: - """The head size of LLaMA.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in LLaMA MLP.""" - return self.config.intermediate_size - - @property - def rotary_dim(self) -> int: - """The rotary embedding dimension of LLaMA.""" - return self.decoder_head_size - - @property - def rotary_emb_base(self) -> float: - """The rotary embedding base of LLaMA.""" - return cast(LlamaConfig, self.config).rope_theta diff --git a/friendli/modules/converter/models/mistral.py b/friendli/modules/converter/models/mistral.py deleted file mode 100644 index bfc9e75b..00000000 --- a/friendli/modules/converter/models/mistral.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Mistral Checkpoint Converter.""" - - -from __future__ import annotations - -from typing import Any, Dict, cast - -from transformers import MistralConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.models.llama import ( - LlamaForCausalLMConverter, - LlamaForCausalLMLoraConverter, -) - - -class MistralForCausalLMLoraConverter(LlamaForCausalLMLoraConverter): - """MistralForCausalLM LoRA Converter Class.""" - - -class MistralForCausalLMConverter(LlamaForCausalLMConverter): - """MistralForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if Mistral architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if cast(MistralConfig, self.config).hidden_act not in ["silu"]: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={cast(MistralConfig, self.config).hidden_act}'", - valid_options=["silu"], - ) - if cast(MistralConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - - if cast(MistralConfig, self.config).rms_norm_eps not in (1e-5, 1e-6): - raise NotSupportedCheckpointError( - invalid_option=f"'rms_norm_eps={cast(MistralConfig, self.config).rms_norm_eps}'", - valid_options=[1e-5, 1e-6], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(MistralConfig, self.config) - - logger.info( - "The generated attributes set 'max_length' to %d, but you can change the " - "'max_length' according to your needs. 
The Mistral model does not rely on " - "absolute position embeddings, allowing you to choose any suitable value.", - config.max_position_embeddings, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "rotary_dim": self.rotary_dim, - "num_heads": self.decoder_num_attention_heads, - "num_kv_heads": self.decoder_num_kv_attention_heads, - "num_layers": self.decoder_layer_num, - "ff_intermediate_size": self.decoder_ff_intermediate_size, - "max_length": config.max_position_embeddings, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "attention_window_size": self.attention_window_size, # for sliding window, - "rope_theta": self.rotary_emb_base, - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "mistral" - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in Mistral.""" - return cast(MistralConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in Mistral.""" - return cast(MistralConfig, self.config).hidden_size - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of linear layer in Mistral MLP.""" - return cast(MistralConfig, self.config).intermediate_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in Mistral.""" - return cast(MistralConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in Mistral.""" - config = cast(MistralConfig, self.config) - if config.num_key_value_heads is None: - return self.decoder_num_attention_heads - return config.num_key_value_heads - - @property - def attention_window_size(self) -> int: - """The size of sliding window attention in Mistral.""" - return cast(MistralConfig, self.config).sliding_window diff --git a/friendli/modules/converter/models/mixtral.py b/friendli/modules/converter/models/mixtral.py deleted file mode 100644 index 5cf5a366..00000000 --- a/friendli/modules/converter/models/mixtral.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Mixtral Checkpoint Converter.""" - - -from __future__ import annotations - -from typing import Any, Dict, List, Optional, cast - -from transformers import MixtralConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.models.llama import LlamaForCausalLMConverter -from friendli.modules.converter.schema import ConvertInfo - - -class MixtralForCausalLMConverter(LlamaForCausalLMConverter): - """MixtralForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if Mixtral architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if cast(MixtralConfig, self.config).hidden_act not in ["silu"]: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={cast(MixtralConfig, self.config).hidden_act}'", - valid_options=["silu"], - ) - if cast(MixtralConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if cast(MixtralConfig, self.config).num_local_experts != 8: - raise NotSupportedCheckpointError( - invalid_option=f"'num_local_experts={cast(MixtralConfig, self.config).num_local_experts}", - valid_options=[8], - ) - if cast(MixtralConfig, self.config).num_experts_per_tok != 2: - raise NotSupportedCheckpointError( - invalid_option=f"'num_experts_per_tok={cast(MixtralConfig, self.config).num_experts_per_tok}", - valid_options=[2], - ) - - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(MixtralConfig, self.config) - - logger.info( - "The generated attributes set 'max_length' to %d, but you can change the " - "'max_length' according to your needs. The Mixtral model does not rely on " - "absolute position embeddings, allowing you to choose any suitable value.", - config.max_position_embeddings, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "rotary_dim": self.rotary_dim, - "num_heads": self.decoder_num_attention_heads, - "num_kv_heads": self.decoder_num_kv_attention_heads, - "num_layers": self.decoder_layer_num, - "ff_intermediate_size": self.decoder_ff_intermediate_size, - "max_length": config.max_position_embeddings, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "rope_theta": self.rotary_emb_base, - "num_experts": self.num_experts, - } - if isinstance(self.attention_window_size, int): - # for sliding window - attr["attention_window_size"] = self.attention_window_size - return attr - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in LLaMA.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}input_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.weight", - f"{layer_prefix}self_attn.k_proj.weight", - f"{layer_prefix}self_attn.v_proj.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn.o_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}post_attention_layernorm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}block_sparse_moe.gate.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}moe/router/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - for i in range(self.num_experts): - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}block_sparse_moe.experts.{i}.w1.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}moe/{i}/mlp/c_gate/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}block_sparse_moe.experts.{i}.w2.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}moe/{i}/mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}block_sparse_moe.experts.{i}.w3.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}moe/{i}/mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - return convert_info_list - - @property - def model_type(self) -> str: - """Model type.""" - return "mixtral" - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in Mixtral.""" - return cast(MixtralConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in Mixtral.""" - return cast(MixtralConfig, self.config).hidden_size - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of linear layer in Mixtral MoEs.""" - return cast(MixtralConfig, self.config).intermediate_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in Mixtral.""" - return cast(MixtralConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in Mixtral.""" - config = cast(MixtralConfig, self.config) - if config.num_key_value_heads is None: - return self.decoder_num_attention_heads - return config.num_key_value_heads - - @property - def attention_window_size(self) -> Optional[int]: - """The size of sliding window attention in Mixtral.""" - return cast(MixtralConfig, self.config).sliding_window - - @property - def num_experts(self) -> int: - """The number of moe experts per transformer block in Mixtral.""" - return cast(MixtralConfig, self.config).num_local_experts - - @property - def num_selected_moe_experts(self) -> int: - """The number of selected moe experts per transformer block in Mixtral.""" - return 
cast(MixtralConfig, self.config).num_experts_per_tok diff --git a/friendli/modules/converter/models/mpt.py b/friendli/modules/converter/models/mpt.py deleted file mode 100644 index 48c332b6..00000000 --- a/friendli/modules/converter/models/mpt.py +++ /dev/null @@ -1,397 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli MPT Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -from transformers import ( # type: ignore[import] - GenerationConfig, - MptConfig, - PretrainedConfig, -) - -from friendli.enums import ModelDataType # type: ignore[import] -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - DecoderOnlyConverter, - DecoderOnlyLoraConverter, -) -from friendli.modules.converter.schema import ConvertInfo - - -def safe_attn_config_get(attn_config: Dict[str, Any], key: str) -> Any: - """Safe getter from MptAttentionConfig. - - This function is a temporary function because MptAttentionConfig - is not supported `attn_type="grouped_query_attention"` yet. - """ - if key not in attn_config: - raise CheckpointConversionError( - f"{key} does not exist in MptAttentionConfig {attn_config}" - ) - - return attn_config[key] - - -class MptForCausalLMLoraConverter(DecoderOnlyLoraConverter): - """MptForCausalLM LoRA Converter Class.""" - - @property - def adapter_target_module_map(self) -> Dict[str, str]: - """Return the dictionary that maps Hugging Face's module name to Friendli's module name.""" - return { - "Wqkv": "merged-qkv", - "out_proj": "attn_fc", - "up_proj": "ff1", - "down_proj": "ff2", - "wte": "wte", - } - - @property - def adapter_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for LoRA adapter modules in Mpt.""" - convert_info_list = [] - target_modules = self.adapter_target_modules - - # Non-transformer modules - if "wte" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=["transformer.wte.lora_embedding_A.default"], - data_type=self.converter.data_type, - converted_name="wte/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.wte.lora_embedding_B.default"], - data_type=self.converter.data_type, - converted_name="wte/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - # Transformer modules - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.converter.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - - if "merged-qkv" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.Wqkv.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.Wqkv.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "attn_fc" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.out_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_proj/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}attn.out_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_proj/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "ff1" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.up_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.up_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - if "ff2" in target_modules: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.down_proj.lora_A.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/lora/lora_A/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}mlp.down_proj.lora_B.default.weight" - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/lora/lora_B/weight:0", - reshape_fn=self.lora_weight_reshape, - ), - ] - ) - - return convert_info_list - - -class MPTForCausalLMConverter(DecoderOnlyConverter): - """MPTForCausalLM Architectures Converter Class.""" - - def __init__( - self, - config: PretrainedConfig, - generation_config: GenerationConfig | None, - data_type: ModelDataType, - ) -> None: - """Initialize MPTForCausalLMConverter.""" - super().__init__(config, generation_config, data_type) - attn_config = cast(MptConfig, config).attn_config - if isinstance(attn_config, PretrainedConfig): - attn_config = attn_config.to_dict() # type: ignore - self.attn_config = attn_config - - def check_config(self) -> None: - """Check if MPT architectures' config can be converted to Friendli format.""" - super().check_config() - - if not safe_attn_config_get(self.attn_config, "alibi"): - raise NotSupportedCheckpointError( - invalid_option=f"'alibi={safe_attn_config_get(self.attn_config, 'alibi')}'", - valid_options=[True], - ) - - if safe_attn_config_get(self.attn_config, "alibi_bias_max") != 8: - raise NotSupportedCheckpointError( - invalid_option=f"'alibi={safe_attn_config_get(self.attn_config, 'alibi_bias_max')}'", - valid_options=[8], - ) - - if safe_attn_config_get(self.attn_config, "attn_type") != "multihead_attention": - if ( - 
safe_attn_config_get(self.attn_config, "attn_type") - == "grouped_query_attention" - ): - raise CheckpointConversionError( - msg="MptAttentionConfig does not support `attn_type=`grouped_query_attention`` yet (as of transformers==4.35.2).", - ) - raise NotSupportedCheckpointError( - invalid_option=f"'attn_type={safe_attn_config_get(self.attn_config, 'attn_type')}'", - valid_options=["multihead_attention"], - ) - - if safe_attn_config_get(self.attn_config, "prefix_lm"): - raise NotSupportedCheckpointError( - invalid_option=f"'prefix_lm={safe_attn_config_get(self.attn_config, 'prefix_lm')}'", - valid_options=[False], - ) - - if safe_attn_config_get(self.attn_config, "qk_ln"): - raise NotSupportedCheckpointError( - invalid_option=f"'qk_ln={safe_attn_config_get(self.attn_config, 'qk_ln')}'", - valid_options=[False], - ) - - if safe_attn_config_get(self.attn_config, "softmax_scale") is not None: - raise NotSupportedCheckpointError( - invalid_option=f"'softmax_scale={safe_attn_config_get(self.attn_config, 'softmax_scale')}'", - valid_options=[None], - ) - - if cast(MptConfig, self.config).expansion_ratio != 4: - raise NotSupportedCheckpointError( - invalid_option=( - f"'expansion_ratio={cast(MptConfig, self.config).expansion_ratio}'" - ), - valid_options=[4], - ) - - if not cast(MptConfig, self.config).no_bias: - raise NotSupportedCheckpointError( - invalid_option=f"'no_bias={cast(MptConfig, self.config).no_bias}'", - valid_options=[True], - ) - - if cast(MptConfig, self.config).logit_scale is not None: - raise NotSupportedCheckpointError( - invalid_option=( - f"'logit_scale={cast(MptConfig, self.config).logit_scale}'" - ), - valid_options=[None], - ) - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in MPT.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}norm_1.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}norm_2.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.Wqkv.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn.out_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ffn.up_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ffn.down_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - - return convert_info_list - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in MPT.""" - return [ - ConvertInfo( - param_names=["transformer.wte.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["transformer.norm_f.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ] - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - logger.info( - "The generated attributes set 'max_length' to %d, but you can change the " - "'max_length' according to your needs. The MPT model does not rely on " - "absolute position embeddings, allowing you to choose any suitable value.", - cast(MptConfig, self.config).max_seq_len, - ) - - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "num_heads": self.decoder_num_attention_heads, - "num_kv_heads": self.decoder_num_kv_attention_heads, - "num_layers": self.decoder_layer_num, - "max_length": cast(MptConfig, self.config).max_seq_len, - "vocab_size": cast(MptConfig, self.config).vocab_size, - "clip_qkv": safe_attn_config_get(self.attn_config, "clip_qkv") or 0.0, - "eos_token": self.get_eos_token_id() or "FILL ME", - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "mpt" - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before the MPT's transformer block number.""" - return "transformer.blocks." 
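# NOTE (editor): an illustrative sketch, not part of the original mpt.py, of how the
# ConvertInfo entries built above are consumed. For each entry the listed Hugging Face
# parameter names are looked up in the checkpoint state_dict, merged/reshaped by reshape_fn,
# and written out under converted_name (e.g. f"{DECODER_PREFIX}/h_._0/attn/c_attn/weight:0").
# The actual driver loop lives in the converter base class, which is not shown in this diff;
# the `state_dict` and `saver` arguments and the simplified dtype handling here are assumptions.
import torch

def apply_convert_info_list(convert_info_list, state_dict, saver, torch_dtype=torch.float16):
    for info in convert_info_list:
        # Gather the source tensors named by this entry (e.g. separate q/k/v projections).
        params = [state_dict[name] for name in info.param_names]
        # Reshape them into the Friendli layout; the real code maps info.data_type
        # (a ModelDataType) to a torch dtype instead of taking the dtype as an argument.
        converted = info.reshape_fn(params).to(torch_dtype)
        saver.save_tensor(info.converted_name, converted)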
- - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in MPT.""" - return cast(MptConfig, self.config).n_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in MPT.""" - return cast(MptConfig, self.config).d_model - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in MPT.""" - return cast(MptConfig, self.config).n_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in MPT.""" - if "kv_n_heads" in self.attn_config: - return self.attn_config["kv_n_heads"] - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The head size of MPT.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in MPT MLP.""" - return self.decoder_hidden_size * 4 diff --git a/friendli/modules/converter/models/opt.py b/friendli/modules/converter/models/opt.py deleted file mode 100644 index 6d8ad8aa..00000000 --- a/friendli/modules/converter/models/opt.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli OPT Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -import torch -from transformers import OPTConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import DECODER_PREFIX, DecoderOnlyConverter -from friendli.modules.converter.schema import ConvertInfo - - -class OPTForCausalLMConverter(DecoderOnlyConverter): - """OPTForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if OPT architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if cast(OPTConfig, self.config).activation_function not in ["relu"]: - raise NotSupportedCheckpointError( - invalid_option="'activation_function=" - f"{cast(OPTConfig, self.config).activation_function}'", - valid_options=["relu"], - ) - if not cast(OPTConfig, self.config).do_layer_norm_before is True: - raise NotSupportedCheckpointError( - invalid_option=f"'do_layer_norm_before={False}'", - valid_options=[True], - ) - if ( - cast(OPTConfig, self.config).word_embed_proj_dim - != cast(OPTConfig, self.config).hidden_size - ): - raise NotSupportedCheckpointError( - invalid_option="'word_embed_proj_dim" - f"({cast(OPTConfig, self.config).word_embed_proj_dim}) " - f"!= hidden_size({cast(OPTConfig, self.config).hidden_size})'", - valid_options=[ - f"'word_embed_proj_dim({cast(OPTConfig, self.config).hidden_size}) " - f"== hidden_size({cast(OPTConfig, self.config).hidden_size})'" - ], - ) - if cast( # pylint: disable=protected-access - OPTConfig, self.config - )._remove_final_layer_norm: - raise NotSupportedCheckpointError( - invalid_option=f"'_remove_final_layer_norm={True}'", - valid_options=[False], - ) - if not cast(OPTConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option=f"'tie_word_embeddings={False}'", - valid_options=[True], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def pos_embed_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """Positional embedding weight convert for OPT's decoder.""" - assert 
len(params) == 1 - pos_emb = params[0] - pos_emb = pos_emb[2:, :] # offset pos emb - - return pos_emb - - def qkv_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """qkv_weight_reshape for OPT's attention layer.""" - qkv_weight = torch.cat( - params, - dim=0, - ) - qkv_weight = qkv_weight.transpose(0, 1) - return qkv_weight - - def qkv_bias_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """qkv_bias_reshape for OPT's attention layer.""" - qkv_bias = torch.cat( - params, - dim=0, - ) - return qkv_bias - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(OPTConfig, self.config) - - logger.warn( - "Since OPT uses absolute position embedding, 'max_length' cannot be " - "larger than %d.", - config.max_position_embeddings, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "num_heads": self.decoder_num_attention_heads, - "num_layers": self.decoder_layer_num, - "max_length": config.max_position_embeddings, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "opt" - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in OPT.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}self_attn_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.bias", - f"{layer_prefix}self_attn.k_proj.bias", - f"{layer_prefix}self_attn.v_proj.bias", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/bias:0", - reshape_fn=self.qkv_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn.out_proj.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}final_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}final_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc1.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc2.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}fc2.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - 
param_names=[f"{layer_prefix}fc1.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}self_attn.out_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}self_attn.q_proj.weight", - f"{layer_prefix}self_attn.k_proj.weight", - f"{layer_prefix}self_attn.v_proj.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ] - ) - return convert_info_list - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in OPT.""" - return [ - ConvertInfo( - param_names=["model.decoder.embed_tokens.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["model.decoder.embed_positions.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/wpe/weight:0", - reshape_fn=self.pos_embed_weight_reshape, - ), - ConvertInfo( - param_names=["model.decoder.final_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["model.decoder.final_layer_norm.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=["lm_head.weight"], - data_type=self.data_type, - converted_name="head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ), - ] - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before OPT's transformer block number.""" - return "model.decoder.layers." - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in OPT.""" - return cast(OPTConfig, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in OPT.""" - return cast(OPTConfig, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in OPT.""" - return cast(OPTConfig, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in opt.""" - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The head size of OPT.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in codegen OPT.""" - return self.decoder_hidden_size * 4 diff --git a/friendli/modules/converter/models/phi3.py b/friendli/modules/converter/models/phi3.py deleted file mode 100644 index 3f05b8bd..00000000 --- a/friendli/modules/converter/models/phi3.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Cohere Checkpoint Converter.""" - - -from __future__ import annotations - -from typing import cast - -from transformers.models.phi3 import Phi3Config # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.modules.converter.base import FP8OnlyConverter -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface - - -class Phi3ForCausalLMConverter(FP8OnlyConverter, RotaryEmbeddingConversionInterface): - """Phi3ForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if phi3 architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if cast(Phi3Config, self.config).hidden_act not in ["silu"]: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={cast(Phi3Config, self.config).hidden_act}'", - valid_options=["silu"], - ) - if cast(Phi3Config, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - @property - def model_type(self) -> str: - """Model type.""" - return "phi3" - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before phi3's transformer block number.""" - return "model.layers." - - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in phi3.""" - return cast(Phi3Config, self.config).num_hidden_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in phi3.""" - return cast(Phi3Config, self.config).hidden_size - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in phi3.""" - return cast(Phi3Config, self.config).num_attention_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in phi3.""" - config = cast(Phi3Config, self.config) - if config.num_key_value_heads is None: - return self.decoder_num_attention_heads - return config.num_key_value_heads - - @property - def decoder_head_size(self) -> int: - """The head size of phi3.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in phi3 MLP.""" - return self.config.intermediate_size - - @property - def rotary_dim(self) -> int: - """The rotary embedding dimension of phi3.""" - return self.decoder_head_size - - @property - def rotary_emb_base(self) -> float: - """The rotary embedding base of phi3.""" - return cast(Phi3Config, self.config).rope_theta diff --git a/friendli/modules/converter/models/phi_msft.py b/friendli/modules/converter/models/phi_msft.py deleted file mode 100644 index 493c4402..00000000 --- a/friendli/modules/converter/models/phi_msft.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright (c) 2023-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Phi Checkpoint Converter.""" - - -from __future__ import annotations - -import math -from typing import Any, Dict, List, Optional, cast - -import torch -from transformers import PretrainedConfig # type: ignore[import] - -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - SUPPORTED_GELU_FAMILY, - DecoderOnlyConverter, -) -from friendli.modules.converter.interface import RotaryEmbeddingConversionInterface -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import convert_to_gpt_j_params - - -class PhiMsftConfig(PretrainedConfig): - """Phi msft configuration. Different from the HuggingFace PhiConfig.""" - - model_type = "phi" - attribute_map = { - "max_position_embeddings": "n_positions", - "hidden_size": "n_embd", - "num_attention_heads": "n_head", - "num_hidden_layers": "n_layer", - } - - def __init__( - self, - vocab_size: int = 50304, - n_positions: int = 2048, - n_embd: int = 1024, - n_layer: int = 20, - n_inner: Optional[int] = None, - n_head: int = 16, - n_head_kv: Optional[int] = None, - rotary_dim: Optional[int] = 32, - activation_function: Optional[str] = "gelu_new", - flash_attn: bool = False, - flash_rotary: bool = False, - fused_dense: bool = False, - attn_pdrop: float = 0.0, - embd_pdrop: float = 0.0, - resid_pdrop: float = 0.0, - layer_norm_epsilon: float = 1e-5, - initializer_range: float = 0.02, - tie_word_embeddings: bool = False, - pad_vocab_size_multiple: int = 64, - **kwargs, - ) -> None: - """Initalize the configuration for a phi-msft model.""" - self.vocab_size = int( - math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple - ) - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_inner = n_inner - self.n_head = n_head - self.n_head_kv = n_head_kv - self.rotary_dim = min(rotary_dim, n_embd // n_head) # type: ignore[type-var] - self.activation_function = activation_function - self.flash_attn = flash_attn - self.flash_rotary = flash_rotary - self.fused_dense = fused_dense - self.attn_pdrop = attn_pdrop - self.embd_pdrop = embd_pdrop - self.resid_pdrop = resid_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) - - -class PhiForCausalLMConverter(DecoderOnlyConverter): - """PhiForCausalLM Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if Phi architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if ( - cast(PhiMsftConfig, self.config).activation_function - not in SUPPORTED_GELU_FAMILY - ): - raise NotSupportedCheckpointError( - invalid_option="'activation_function=" - f"{cast(PhiMsftConfig, self.config).activation_function}'", - valid_options=SUPPORTED_GELU_FAMILY, - ) - if cast(PhiMsftConfig, self.config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def qkv_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """qkv_weight_reshape for Phi's attention layer.""" - assert len(params) == 1 - qkv_weight = params[0] - - q_size = self.decoder_num_attention_heads * self.decoder_head_size - kv_size = self.decoder_num_kv_attention_heads * 
self.decoder_head_size - q_weight, k_weight, v_weight = torch.split( - qkv_weight, [q_size, kv_size, kv_size], dim=0 - ) - - q_weight = q_weight.reshape( - self.decoder_num_attention_heads, - self.decoder_head_size, - self.decoder_hidden_size, - ) - k_weight = k_weight.reshape( - self.decoder_num_kv_attention_heads, - self.decoder_head_size, - self.decoder_hidden_size, - ) - - q_weight = convert_to_gpt_j_params(q_weight, self.rotary_dim) - k_weight = convert_to_gpt_j_params(k_weight, self.rotary_dim) - - q_weight = q_weight.reshape( - self.decoder_num_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - k_weight = k_weight.reshape( - self.decoder_num_kv_attention_heads * self.decoder_head_size, - self.decoder_hidden_size, - ) - - qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) - qkv_weight = qkv_weight.transpose(0, -1) - return qkv_weight - - def qkv_bias_reshape(self, params: List[torch.Tensor]) -> torch.Tensor: - """qkv_bias_reshape for Phi's attention layer.""" - assert len(params) == 1 - qkv_bias = params[0] - - q_size = self.decoder_num_attention_heads * self.decoder_head_size - kv_size = self.decoder_num_kv_attention_heads * self.decoder_head_size - - q_bias, k_bias, v_bias = torch.split( - qkv_bias, [q_size, kv_size, kv_size], dim=0 - ) - - q_bias = q_bias.reshape( - self.decoder_num_attention_heads, self.decoder_head_size - ) - k_bias = k_bias.reshape( - self.decoder_num_kv_attention_heads, self.decoder_head_size - ) - - q_bias = convert_to_gpt_j_params(q_bias, self.rotary_dim).flatten() - k_bias = convert_to_gpt_j_params(k_bias, self.rotary_dim).flatten() - - qkv_bias = torch.cat((q_bias, k_bias, v_bias), dim=0) - return qkv_bias - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(PhiMsftConfig, self.config) - - logger.info( - "The generated attributes set 'max_length' to %d, but you can change the " - "'max_length' according to your needs. The Phi model does not rely on " - "absolute position embeddings, allowing you to choose any suitable value.", - config.n_positions, - ) - - eos_token_id = self.get_eos_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.decoder_head_size, - "rotary_dim": self.rotary_dim, - "num_heads": self.decoder_num_attention_heads, - "num_kv_heads": self.decoder_num_kv_attention_heads, - "num_layers": self.decoder_layer_num, - "ff_intermediate_size": self.decoder_ff_intermediate_size, - "max_length": config.n_positions, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "rope_theta": self.rotary_emb_base, - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - return "phi" - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in Phi.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}ln.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}ln_1/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc1.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc1.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc2.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.fc2.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mixer.Wqkv.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mixer.Wqkv.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_attn/bias:0", - reshape_fn=self.qkv_bias_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mixer.out_proj.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mixer.out_proj.bias"], - data_type=self.data_type, - converted_name=f"{converted_prefix}attn/c_proj/bias:0", - reshape_fn=self.linear_bias_reshape, - ), - ] - ) - return convert_info_list - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for non-transformer blocks in Phi.""" - return [ - ConvertInfo( - param_names=["transformer.embd.wte.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=["lm_head.ln.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["lm_head.ln.bias"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/beta:0", - reshape_fn=self.ln_bias_reshape, - ), - ConvertInfo( - param_names=["lm_head.linear.weight"], - data_type=self.data_type, - converted_name="head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ), - ConvertInfo( - param_names=["lm_head.linear.bias"], - data_type=self.data_type, - converted_name="head_fc/bias:0", - reshape_fn=self.head_weight_reshape, - ), - ] - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before Phi's transformer module number.""" - return "transformer.h." 
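# NOTE (editor): a small illustrative sketch, not part of the original phi_msft.py, of the
# head-channel reordering that qkv_weight_reshape above delegates to convert_to_gpt_j_params
# (defined later in this diff, in converter/utils.py). Within the first `rotary_dim` channels
# of each head, the two halves are interleaved so the channel order becomes
# (0, rotary_dim/2, 1, rotary_dim/2 + 1, ...), i.e. the GPT-J rotary layout. The tensor below
# is a toy example.
import torch

head = torch.arange(8).view(1, 8, 1)  # one head, head_size=8, trailing dim of 1
rotary_dim = 4
rot, rest = head[:, :rotary_dim], head[:, rotary_dim:]
rot = torch.stack((rot[:, :rotary_dim // 2], rot[:, rotary_dim // 2:]), dim=2).reshape(1, rotary_dim, 1)
print(torch.cat((rot, rest), dim=1).flatten().tolist())  # [0, 2, 1, 3, 4, 5, 6, 7]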
- - @property - def decoder_layer_num(self) -> int: - """The number of decoder layers in Phi.""" - return cast(PhiMsftConfig, self.config).n_layer - - @property - def decoder_hidden_size(self) -> int: - """The hidden size in Phi.""" - return cast(PhiMsftConfig, self.config).n_embd - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads in Phi.""" - return cast(PhiMsftConfig, self.config).n_head - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads in Phi.""" - config = cast(PhiMsftConfig, self.config) - if config.n_head_kv is not None: - return config.n_head_kv - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The head size of Phi.""" - return self.decoder_hidden_size // self.decoder_num_attention_heads - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate size of the linear layer in codegen MLP.""" - config = cast(PhiMsftConfig, self.config) - if config.n_inner is None: - return self.decoder_hidden_size * 4 - return config.n_inner - - @property - def rotary_dim(self) -> int: - """The rotary dim in Phi.""" - return cast(PhiMsftConfig, self.config).rotary_dim # type: ignore[return-value] - - @property - def rotary_emb_base(self) -> float: - """The rotary emb base in Phi.""" - return 10000.0 diff --git a/friendli/modules/converter/models/t5.py b/friendli/modules/converter/models/t5.py deleted file mode 100644 index ba188bd2..00000000 --- a/friendli/modules/converter/models/t5.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli T5 Checkpoint Converter.""" - -from __future__ import annotations - -from typing import Any, Dict, List, cast - -import torch -from transformers import T5Config # type: ignore[import] - -from friendli.enums import ModelDataType -from friendli.errors import CheckpointConversionError, NotSupportedCheckpointError -from friendli.logging import logger -from friendli.modules.converter.base import ( - DECODER_PREFIX, - ENCODER_PREFIX, - EncoderDecoderConverter, -) -from friendli.modules.converter.schema import ConvertInfo - - -class T5Converter(EncoderDecoderConverter): - """T5ForConditionalGeneration Architectures Converter Class.""" - - def check_config(self) -> None: - """Check if T5 architectures' config can be converted to Friendli format.""" - super().check_config() - try: - if not ( - cast(T5Config, self.config).is_gated_act - ^ cast(T5Config, self.config).tie_word_embeddings - ): - raise NotSupportedCheckpointError( - invalid_option=f"'is_gated_act={cast(T5Config, self.config).is_gated_act}'and " - f"'tie_word_embeddings={cast(T5Config, self.config).tie_word_embeddings}'", - valid_options=[ - "'is_gated_act' and 'tie_word_embeddings' should be different." 
- ], - ) - - if cast(T5Config, self.config).layer_norm_epsilon != 1e-6: - raise NotSupportedCheckpointError( - invalid_option="'layer_norm_epsilon=" - f"{cast(T5Config, self.config).layer_norm_epsilon}'", - valid_options=[1e-6], - ) - except AttributeError as exc: - raise CheckpointConversionError(str(exc)) from exc - - def _decoder_final_ln_weight_reshape( - self, params: List[torch.Tensor] - ) -> torch.Tensor: - """Special handle for T5.""" - assert len(params) == 1 - param = params[0] - - if cast(T5Config, self.config).tie_word_embeddings: - param = param * (cast(T5Config, self.config).d_model ** -0.5) - - return param - - def pos_embed_weight_reshape( - self, - params: List[torch.Tensor], - ) -> torch.Tensor: - """Reshape positional embedding weights in T5.""" - assert len(params) == 1 - return params[0] - - def get_attributes(self) -> Dict[str, Any]: - """Get checkpoint attributes.""" - config = cast(T5Config, self.config) - - logger.warn( - "The 'max_input_length' and 'max_output_length' fields are left blank as " - "they cannot be automatically configured. " - "Determine the 'max_input_length' and 'max_output_length' according to your " - "needs. The T5 model does not rely on absolute position embeddings, " - "allowing you to choose any suitable value." - ) - - eos_token_id = self.get_eos_token_id() - decoder_start_token_id = self.get_decoder_start_token_id() - attr = { - "model_type": self.model_type, - "dtype": self.data_type.value, - "head_size": self.encoder_head_size, - "num_heads": self.encoder_num_attention_heads, - "hidden_size": self.encoder_hidden_size, - "ff_intermediate_size": self.decoder_ff_intermediate_size, - "num_encoder_layers": self.encoder_layer_num, - "num_decoder_layers": self.decoder_layer_num, - "max_input_length": "FILL ME", - "max_output_length": "FILL ME", - "num_pos_emb_buckets": config.relative_attention_num_buckets, - "max_pos_distance": config.relative_attention_max_distance, - "vocab_size": config.vocab_size, - "eos_token": eos_token_id if eos_token_id is not None else "FILL ME", - "decoder_start_token": ( - decoder_start_token_id - if decoder_start_token_id is not None - else "FILL ME" - ), - } - return attr - - @property - def model_type(self) -> str: - """Model type.""" - if cast(T5Config, self.config).is_gated_act: - return "t5-v1_1" - return "t5" - - @property - def encoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in T5's encoder.""" - convert_info_list = [] - for i in range(self.encoder_layer_num): - layer_prefix = f"{self.encoder_layer_prefix}{i}." 
- converted_prefixe = f"{ENCODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}layer.0.layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefixe}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}layer.1.layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefixe}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.0.SelfAttention.q.weight", - f"{layer_prefix}layer.0.SelfAttention.k.weight", - f"{layer_prefix}layer.0.SelfAttention.v.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}layer.0.SelfAttention.o.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefixe}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - - if cast(T5Config, self.config).is_gated_act: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.1.DenseReluDense.wi_0.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_gate/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.1.DenseReluDense.wi_1.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.1.DenseReluDense.wo.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - else: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.1.DenseReluDense.wi.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.1.DenseReluDense.wo.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - - return convert_info_list - - @property - def decoder_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The list of conversion informations for transformer blocks in T5's decoder.""" - convert_info_list = [] - for i in range(self.decoder_layer_num): - layer_prefix = f"{self.decoder_layer_prefix}{i}." 
- converted_prefixe = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}layer.0.layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefixe}ln_1/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}layer.1.layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefixe}ln_2/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}layer.2.layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefixe}ln_3/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.0.SelfAttention.q.weight", - f"{layer_prefix}layer.0.SelfAttention.k.weight", - f"{layer_prefix}layer.0.SelfAttention.v.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}layer.0.SelfAttention.o.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefixe}attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.1.EncDecAttention.q.weight", - f"{layer_prefix}layer.1.EncDecAttention.k.weight", - f"{layer_prefix}layer.1.EncDecAttention.v.weight", - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}cross_attn/c_attn/weight:0", - reshape_fn=self.qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}layer.1.EncDecAttention.o.weight"], - data_type=self.data_type, - converted_name=f"{converted_prefixe}cross_attn/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - - if cast(T5Config, self.config).is_gated_act: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.2.DenseReluDense.wi_0.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_gate/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.2.DenseReluDense.wi_1.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.2.DenseReluDense.wo.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - else: - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.2.DenseReluDense.wi.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_fc/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}layer.2.DenseReluDense.wo.weight" - ], - data_type=self.data_type, - converted_name=f"{converted_prefixe}mlp/c_proj/weight:0", - reshape_fn=self.linear_weight_reshape, - ), - ] - ) - - return convert_info_list - - @property - def non_transformer_convert_info_list( - self, - ) -> List[ConvertInfo]: - """The convert_info_list for non-transformer blocks in T5.""" - convert_info_list = [ - ConvertInfo( - param_names=[f"shared.weight"], - data_type=self.data_type, - converted_name="wte/weight:0", - reshape_fn=self.token_embed_weight_reshape, - ), - ConvertInfo( - param_names=[ - "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight" - ], - data_type=ModelDataType.FP32, - 
converted_name=f"{ENCODER_PREFIX}/wpe/weight:0", - reshape_fn=self.pos_embed_weight_reshape, - ), - ConvertInfo( - param_names=[ - "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight" - ], - data_type=ModelDataType.FP32, - converted_name=f"{DECODER_PREFIX}/wpe/weight:0", - reshape_fn=self.pos_embed_weight_reshape, - ), - ConvertInfo( - param_names=["encoder.final_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{ENCODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self.ln_weight_reshape, - ), - ConvertInfo( - param_names=["decoder.final_layer_norm.weight"], - data_type=self.data_type, - converted_name=f"{DECODER_PREFIX}/ln_f/gamma:0", - reshape_fn=self._decoder_final_ln_weight_reshape, - ), - ] - - if not cast(T5Config, self.config).tie_word_embeddings: - convert_info_list.append( - ConvertInfo( - param_names=["lm_head.weight"], - data_type=self.data_type, - converted_name="head_fc/weight:0", - reshape_fn=self.head_weight_reshape, - ) - ) - - return convert_info_list - - @property - def encoder_layer_prefix(self) -> str: - """The layer name prefix used before T5 encoder's transformer block number.""" - return "encoder.block." - - @property - def decoder_layer_prefix(self) -> str: - """The layer name prefix used before T5 decoder's transformer block number.""" - return "decoder.block." - - @property - def encoder_layer_num(self) -> int: - """The number of transformer blocks in T5 encoder.""" - return cast(T5Config, self.config).num_layers - - @property - def encoder_hidden_size(self) -> int: - """The hidden size of T5 encoder.""" - return cast(T5Config, self.config).d_model - - @property - def encoder_num_attention_heads(self) -> int: - """The number of attention heads of T5 encoder.""" - return cast(T5Config, self.config).num_heads - - @property - def encoder_head_size(self) -> int: - """The head size of T5 encoder.""" - return cast(T5Config, self.config).d_kv - - @property - def encoder_ff_intermediate_size(self) -> int: - """The intermediate of the linear layer in T5 encoder's MLP.""" - return cast(T5Config, self.config).d_ff - - @property - def decoder_layer_num(self) -> int: - """The number of transformer blocks in T5 decoder.""" - return cast(T5Config, self.config).num_decoder_layers - - @property - def decoder_hidden_size(self) -> int: - """The hidden size of T5 decoder.""" - return cast(T5Config, self.config).d_model - - @property - def decoder_num_attention_heads(self) -> int: - """The number of attention heads of T5 decoder.""" - return cast(T5Config, self.config).num_heads - - @property - def decoder_num_kv_attention_heads(self) -> int: - """The number of key-value attention heads of t5 decoder.""" - return self.decoder_num_attention_heads - - @property - def decoder_head_size(self) -> int: - """The head size of T5 decoder.""" - return cast(T5Config, self.config).d_kv - - @property - def decoder_ff_intermediate_size(self) -> int: - """The intermediate of the linear layer in T5 decoder's MLP.""" - return cast(T5Config, self.config).d_ff diff --git a/friendli/modules/converter/saver.py b/friendli/modules/converter/saver.py deleted file mode 100644 index e9d6d2ae..00000000 --- a/friendli/modules/converter/saver.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright (c) 2023-present, FriendliAI Inc. All rights reserved. 
- -"""Savers to save a converted checkpoints into various file types.""" - -from __future__ import annotations - -import json -import os -from abc import abstractmethod -from contextlib import AbstractContextManager -from typing import Dict, Generic, List, TypeVar, Union - -import h5py # type: ignore[import] -import numpy as np -import safetensors.numpy # type: ignore[import] -import safetensors.torch # type: ignore[import] -import torch -from typing_extensions import Self - -from friendli.enums import CheckpointFileType -from friendli.errors import CheckpointConversionError -from friendli.logging import logger - - -def get_saver( - ckpt_file_type: CheckpointFileType, output_dir: str, output_file_name: str -) -> CheckpointSaver: - """Create a saver that corresponds to the file type.""" - if ckpt_file_type == CheckpointFileType.HDF5: - return HDF5Saver(output_dir, output_file_name) - if ckpt_file_type == CheckpointFileType.SAFETENSORS: - return SafetensorsSaver(output_dir, output_file_name) - raise CheckpointConversionError( - f"Output file type {ckpt_file_type} is not supported." - ) - - -class CheckpointSaver(AbstractContextManager): - """Abstract for savers.""" - - def __init__( - self, output_dir: Union[str, os.PathLike], output_file_name: str - ) -> None: - """Check that the output file already exists.""" - super().__init__() - self._output_dir = output_dir - self._output_file_name = output_file_name - - @abstractmethod - def save_tensor(self, tensor_id: str, t: Union[np.ndarray, torch.Tensor]) -> None: - """Save the tensor in the file.""" - raise NotImplementedError - - @abstractmethod - def close(self) -> None: - """Close the output checkpoint file.""" - raise NotImplementedError - - def __enter__(self) -> Self: - """Enter for context manager.""" - return self - - def __exit__(self, *exc) -> None: - """Exit for context manager.""" - self.close() - - -class HDF5Saver(CheckpointSaver): - """Saver for HDF5.""" - - def __init__(self, output_dir: str, output_file_name: str) -> None: - """Create a HDF5 file.""" - super().__init__(output_dir, output_file_name) - self._out_f = h5py.File(os.path.join(output_dir, output_file_name), "w") - - def save_tensor(self, tensor_id: str, t: Union[np.ndarray, torch.Tensor]) -> None: - """Create a group if not exists, and save the tensor in the file.""" - assert isinstance(t, np.ndarray) - self._out_f[tensor_id] = t - - def close(self) -> None: - """Close the HDF5 file.""" - self._out_f.close() - - -T = TypeVar("T") - - -class SafetensorsSaverInterface(Generic[T]): - """Interface for saving safetensor format.""" - - def get_weight_size(self, tensor: T) -> int: - """Get total weight size in `Byte` unit.""" - raise NotImplementedError - - def save_file(self, tensor: Dict[str, T], path: str) -> None: - """Save given tensor to path.""" - raise NotImplementedError - - -class TorchSafetensorsSaverInterface(SafetensorsSaverInterface[torch.Tensor]): - """Interface for saving safetensor format.""" - - def get_weight_size(self, tensor: torch.Tensor) -> int: - """Get total weight size in `Byte` unit.""" - return tensor.itemsize * tensor.numel() - - def save_file(self, tensor: Dict[str, torch.Tensor], path: str) -> None: - """Save given tensor to path.""" - safetensors.torch.save_file(tensor, path) - - -class NumpySafetensorsSaverInterface(SafetensorsSaverInterface[np.ndarray]): - """Interface for saving safetensor format.""" - - def get_weight_size(self, tensor: np.ndarray) -> int: - """Get total weight size in `Byte` unit.""" - return tensor.itemsize * tensor.size 
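# NOTE (editor): an illustrative usage sketch, not part of the original saver.py, showing how
# the saver classes above were meant to be driven. get_saver() returns an HDF5Saver or a
# SafetensorsSaver depending on the CheckpointFileType, and both are context managers, so a
# converter can stream tensors through save_tensor() and rely on close() running on exit
# (SafetensorsSaver buffers everything and writes sharded files when closed). The output
# path and tensor values below are placeholders.
import numpy as np

from friendli.enums import CheckpointFileType
from friendli.modules.converter.saver import get_saver  # module removed by this diff

with get_saver(CheckpointFileType.SAFETENSORS, "/tmp/converted", "model.safetensors") as saver:
    saver.save_tensor("wte/weight:0", np.zeros((8, 4), dtype=np.float16))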
- - def save_file(self, tensor: Dict[str, np.ndarray], path: str) -> None: - """Save given tensor to path.""" - safetensors.numpy.save_file(tensor, path) - - -class UnionSafetensorsSaverInterface( - SafetensorsSaverInterface[Union[torch.Tensor, np.ndarray]] -): - """Interface for saving safetensor format.""" - - def __init__(self) -> None: - """Initialize UnionSafetensorsSaverInterface.""" - self._sub_itfcs = { - np.ndarray: NumpySafetensorsSaverInterface(), - torch.Tensor: TorchSafetensorsSaverInterface(), - } - super().__init__() - - def get_weight_size(self, tensor: Union[torch.Tensor, np.ndarray]) -> int: - """Get total weight size in `Byte` unit.""" - return self._sub_itfcs[type(tensor)].get_weight_size(tensor) # type: ignore[attr-defined] - - def save_file( - self, tensor: Dict[str, Union[torch.Tensor, np.ndarray]], path: str - ) -> None: - """Save given tensor to path.""" - if len(tensor) == 0: - logger.warn("No tensor to save. Skip saving tensors..") - return - # NOTE: Assume that all tensors are the same type - tensor_type = type(next(iter(tensor.values()))) - itfc = self._sub_itfcs[tensor_type] - itfc.save_file(tensor, path) # type: ignore[attr-defined] - - -class SafetensorsSaver(CheckpointSaver): - """Saver for Safetensors. - - This temporally saves the converted tensors in local memory. - Then, all of the tensors are saved in the file at a time when close() is called, - because Safetensors does not support stream saving. - """ - - def __init__( - self, output_dir: Union[str, os.PathLike], output_file_name: str - ) -> None: - """Initialize a saver.""" - super().__init__(output_dir, output_file_name) - self._tensors: Dict[str, Union[np.ndarray, torch.Tensor]] = {} - self._saver: UnionSafetensorsSaverInterface = UnionSafetensorsSaverInterface() - - def save_tensor(self, tensor_id: str, t: Union[np.ndarray, torch.Tensor]) -> None: - """Save the tensor in the local memory.""" - self._tensors[tensor_id] = t - - def shard_checkpoint(self, max_shard_size: str): - """Shard the checkpoint with index.""" - # pylint: disable=too-many-locals - int_max_shard_size = int(max_shard_size[:-2]) * (10**9) - sharded_tensors: List[Dict[str, Union[np.ndarray, torch.Tensor]]] = [{}] - last_block_size = 0 - total_size = 0 - - for key, weight in self._tensors.items(): - weight_size = self._saver.get_weight_size(weight) - if ( - last_block_size + weight_size > int_max_shard_size - and len(sharded_tensors[-1]) > 0 - ): - sharded_tensors.append({}) - last_block_size = 0 - - sharded_tensors[-1][key] = weight - last_block_size += weight_size - total_size += weight_size - - if len(sharded_tensors) == 1: - return {self._output_file_name: sharded_tensors[0]}, None - - weight_map = {} - shards = {} - for idx, shard in enumerate(sharded_tensors): - shard_file = self._output_file_name.replace( - ".safetensors", - f"-{idx + 1:05d}-of-{len(sharded_tensors):05d}.safetensors", - ) - shards[shard_file] = shard - for key in shard.keys(): - weight_map[key] = shard_file - - metadata = {"total_size": total_size} - index = {"metadata": metadata, "weight_map": weight_map} - return shards, index - - def _save_to_file(self) -> None: - """Save the tensors in the file.""" - logger.info("Saving the converted checkpoint...") - - max_shard_size = "10GB" - shards, index = self.shard_checkpoint(max_shard_size) - - for shard_file, shard in shards.items(): - self._saver.save_file(shard, os.path.join(self._output_dir, shard_file)) - - if index is None: - path_to_weights = os.path.join(self._output_dir, self._output_file_name) - 
logger.info("Model weights saved in (%s)", path_to_weights) - else: - save_index_file = os.path.join( - self._output_dir, "model.safetensors.index.json" - ) - # Save the index as well - with open(save_index_file, "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - logger.info( - "The model is bigger than the maximum size per checkpoint %s " - " and is going to be split in %s checkpoint shards. You can find " - "where each parameters has been saved in the index located at (%s).", - max_shard_size, - str(len(shards)), - save_index_file, - ) - - def close(self) -> None: - """Save the tensors in the file.""" - self._save_to_file() diff --git a/friendli/modules/converter/schema.py b/friendli/modules/converter/schema.py deleted file mode 100644 index 21f034b1..00000000 --- a/friendli/modules/converter/schema.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Converter Schema.""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Callable, List - -import torch - -from friendli.enums import ModelDataType - - -@dataclass -class ConvertInfo: - """Dataclass for convert information of the parameter in huggingface checkpoint. - - Args: - param_names(List[str]): List of parameter names in the huggingface checkpoint. - data_type(ModelDataType): Data type of the parameter. - converted_name(str): Name of the converted parameter. - reshape_fn(Callable[[List[torch.tensor]], np.ndarray]): - Function to reshape the tensor from the huggignface checkpoint. - """ - - param_names: List[str] - data_type: ModelDataType - converted_name: str - reshape_fn: Callable[[List[torch.Tensor]], torch.Tensor] diff --git a/friendli/modules/converter/utils.py b/friendli/modules/converter/utils.py deleted file mode 100644 index 4b9588f4..00000000 --- a/friendli/modules/converter/utils.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Converter Utils.""" - -from __future__ import annotations - -import os -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, cast - -import torch -from peft import PeftConfig # type: ignore[import] # pylint: disable=import-error -from transformers import ( # type: ignore[import] - AutoConfig, - AutoTokenizer, - GenerationConfig, - PretrainedConfig, - PreTrainedTokenizer, -) - -from friendli.enums import ModelDataType -from friendli.errors import ( - CheckpointConversionError, - NotFoundError, - NotSupportedCheckpointError, - TokenizerNotFoundError, -) - - -def convert_to_gpt_j_params(param: torch.Tensor, rotary_dim: int) -> torch.Tensor: - """Reshape weight or bias tensor with rotary embedding to gpt-j format. - - Args: - param (torch.Tensor): Target tensor to convert. Shape must be (num_heads, head_size, ...) - rotary_dim (int): Degree of rotary embedding - - Returns: - Torch tensor that heads are rotated. - - Raises: - CheckpointConversionError: If arguments do not satisfy the requirements. 
- - """ - if param.ndim < 2: - raise CheckpointConversionError( - "Tensor dimension should be greater or equal than 2 for rotary conversion, " - f"but got {param.ndim}" - ) - - head_size = param.shape[1] - if rotary_dim > head_size: - raise CheckpointConversionError( - f"'rotary_dim' ({rotary_dim}) should be less or equal than 'head_size' ({head_size})" - ) - - param_rot = param[:, :rotary_dim] - param_pass = param[:, rotary_dim:] - - origin_shape = param_rot.shape - param_rot_1 = param_rot[:, : rotary_dim // 2] - param_rot_2 = param_rot[:, rotary_dim // 2 :] - param_rot = torch.stack((param_rot_1, param_rot_2), dim=2).reshape(*origin_shape) - - return torch.cat((param_rot, param_pass), dim=1) - - -def get_tensor_from_state_dict( - state_dict: Dict[str, Any], tensor_name: str -) -> torch.Tensor: - """Get the tensor whose name is 'tensor_name' from 'state_dict'. - - Args: - state_dict (Dict[str, Any]): Model checkpoint's state_dict. - tensor_name (str): Name of tensor to get. - - Returns: - Corresponding torch Tensor. - - Raises: - CheckpointConversionError: If 'tensor_name' does not exist in 'state_dict' - - """ - if tensor_name not in state_dict: - raise CheckpointConversionError( - f"Cannot find '{tensor_name}' in the model checkpoint" - ) - - return state_dict[tensor_name] - - -def get_torch_data_type(data_type: str) -> torch.dtype: - """Get torch data type from Enum.""" - if data_type == ModelDataType.FP16: - return torch.float16 - if data_type == ModelDataType.FP32: - return torch.float32 - if data_type == ModelDataType.BF16: - return torch.bfloat16 - raise CheckpointConversionError( - f"Can't not converted original param to {data_type}." - ) - - -def get_model_data_type(torch_dtype: torch.dtype) -> ModelDataType: - """Get torch data type from Enum.""" - if torch_dtype == torch.float16: - return ModelDataType.FP16 - if torch_dtype == torch.float32: - return ModelDataType.FP32 - if torch_dtype == torch.bfloat16: - return ModelDataType.BF16 - raise CheckpointConversionError(f"{torch_dtype} is not valid dtype.") - - -def convert_tensor_dtype( - param: torch.Tensor, - data_type: Union[ModelDataType, torch.dtype], -) -> torch.Tensor: - """Convert tensor format to the given data type. - - Args: - param (torch.Tensor): The tensor to be converted. - data_type (ModelDataType): The data type of the tensor. - - Returns: - torch.Tensor: The converted tensor. 
- - """ - dtype_map = { - ModelDataType.FP8_E4M3: torch.float8_e4m3fn, - ModelDataType.BF16: torch.bfloat16, - ModelDataType.FP16: torch.float16, - ModelDataType.FP32: torch.float32, - ModelDataType.INT4: torch.int8, - ModelDataType.INT8: torch.int8, - } - - dtype = dtype_map[data_type] if isinstance(data_type, ModelDataType) else data_type - - if dtype is torch.float8_e4m3fn: - return param.detach().to(dtype).view(dtype=torch.int8).to("cpu") - - if dtype is torch.bfloat16: - return param.detach().to(dtype).to("cpu") - - if data_type is ModelDataType.INT4: - pack_num = 8 // 4 - int4_param = torch.zeros( - (param.shape[0], param.shape[1] // pack_num), - dtype=torch.uint8, - device=param.device, - ) - for col in range(int4_param.shape[1]): - for i in range(pack_num): - int4_param[:, col] |= param[:, col * pack_num + i] << (i * 4) - param = int4_param - - return param.detach().to(dtype).to("cpu") - - -def get_tokenizer( - model_name_or_path: str, - *, - cache_dir: Optional[str] = None, -) -> PreTrainedTokenizer: - """Try to get tokenizer of a pretrained model.""" - try: - tokenizer = AutoTokenizer.from_pretrained( - model_name_or_path, - cache_dir=cache_dir, - trust_remote_code=True, - ) - except OSError as exc: - raise TokenizerNotFoundError(str(exc)) from exc - - if not tokenizer.is_fast: - raise TokenizerNotFoundError( - "This model does not support Friendli-compatible tokenizer" - ) - - if tokenizer.pad_token != "": - tokenizer.pad_token = tokenizer.eos_token - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - return tokenizer - - -def save_tokenizer( - model_name_or_path: str, - *, - cache_dir: Optional[str] = None, - save_dir: str, -) -> Tuple[str, ...]: - """Try to save `tokenizer.json` of a pretrained model.""" - if not os.path.isdir(save_dir): - raise NotFoundError(f"Directory '{save_dir}' is not found.") - - tokenizer = get_tokenizer(model_name_or_path, cache_dir=cache_dir) - saved_file_paths = tokenizer.save_pretrained(save_directory=save_dir) - tokenizer_json_path = None - for path in saved_file_paths: - if "tokenizer.json" == os.path.basename(path): - tokenizer_json_path = path - break - - if tokenizer_json_path is None: - raise TokenizerNotFoundError( - "This model has the Friendli-compatible tokenizer implementation, but " - "'tokenizer.json' file is not found." - ) - return saved_file_paths - - -def get_model_generation_config( - model_name_or_path: str, cache_dir: Optional[str] = None -) -> Optional[GenerationConfig]: - """Get HuggingFace model generation config.""" - try: - generation_config = GenerationConfig.from_pretrained( - model_name_or_path, cache_dir=cache_dir, trust_remote_code=True - ) - except (OSError, TypeError): - generation_config = None - - return generation_config - - -def get_model_pretrained_config( - model_name_or_path: str, model_output_path: str, cache_dir: Optional[str] = None -) -> PretrainedConfig: - """Get HuggingFace model configs.""" - try: - config = AutoConfig.from_pretrained( - model_name_or_path, cache_dir=cache_dir, trust_remote_code=True - ) - except OSError as exc: # from AutoConfig.from_pretrained() - config_dir = Path(model_name_or_path) - model_output_dir = Path(model_output_path).parent - if config_dir.exists() and model_output_dir.absolute() == config_dir.absolute(): - raise NotFoundError( - f"'output_dir' ({model_output_dir.as_posix()}) and " - f"'model_name_or_path' ({model_name_or_path}) are the same. " - "In such a case, checkpoints should be prepared in 'output_dir'." 
- ) from exc - raise NotFoundError(str(exc)) from exc - - return config - - -def get_model_arch(config: PretrainedConfig) -> str: - """Get HuggingFace model architecture from config.""" - model_arch_list = cast(List[str], cast(PretrainedConfig, config).architectures) - if len(model_arch_list) == 0: - raise NotSupportedCheckpointError( - invalid_option=f"'architectures={model_arch_list}'", - valid_options=["non empty list of architectures"], - ) - model_arch = model_arch_list[0] - return model_arch - - -def get_adapter_config( - adapter_name_or_path: str, cache_dir: Optional[str] -) -> PeftConfig: - """Get PeftConfig for Adapter.""" - try: - adapter_config = PeftConfig.from_pretrained( - adapter_name_or_path, cache_dir=cache_dir, trust_remote_code=True - ) - except ValueError as exc: - raise NotFoundError(str(exc)) from exc - return adapter_config diff --git a/friendli/modules/quantizer/__init__.py b/friendli/modules/quantizer/__init__.py deleted file mode 100644 index 9d1a3117..00000000 --- a/friendli/modules/quantizer/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli model quantizer.""" diff --git a/friendli/modules/quantizer/awq/__init__.py b/friendli/modules/quantizer/awq/__init__.py deleted file mode 100644 index 50a1020d..00000000 --- a/friendli/modules/quantizer/awq/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model AWQ Quantizer.""" diff --git a/friendli/modules/quantizer/awq/base.py b/friendli/modules/quantizer/awq/base.py deleted file mode 100644 index 172d214c..00000000 --- a/friendli/modules/quantizer/awq/base.py +++ /dev/null @@ -1,513 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli AWQ Quantizer Base.""" - -from __future__ import annotations - -import gc -from abc import abstractmethod -from dataclasses import fields -from typing import Any, Dict, Iterator, List, Tuple, Type, cast - -import datasets # type: ignore[import] -import torch -from datasets.utils.logging import disable_progress_bar # type: ignore[import] -from tqdm import tqdm - -from friendli.enums import ModelDataType -from friendli.errors import QuantizationError -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import get_tokenizer -from friendli.modules.quantizer.awq.utils import ( - apply_module_clip, - apply_module_scale, - search_module_clip, - search_module_scale, -) -from friendli.modules.quantizer.base import AbstractQuantHook, CommonQuantizer -from friendli.modules.quantizer.layers import WeightOnlyQuantizedLinearLayer -from friendli.modules.quantizer.schema.config import AWQConfig -from friendli.modules.quantizer.schema.data import ( - ModuleName, - QuantInput, - TFQuantInputs, - TFQuantResults, - WeightOnlyQuantResult, -) -from friendli.modules.quantizer.utils import ( - collect_inps, - get_weight_only_quant_scales, - quantized_linear_weight_reshape, - quantized_qkv_weight_reshape, - safe_load_datasets, - scale_reshape, -) - - -class AWQScaler(torch.nn.Module): - """Store AWQ scale before linear layers. - - If the linear layer is quantized, but the previous layer can't be scaled, - then we need to store the AWQ scale in a separate module. This module - is used to store the AWQ scale. 
- """ - - def __init__(self, in_dim: int): - """Initialize AWQScaler.""" - super().__init__() - self.scale = torch.nn.Parameter(torch.ones(in_dim)) - - def forward(self, x): - """Scale input by AWQ scale.""" - return (x / self.scale.view(1, 1, -1)).to(x.dtype) - - -class AWQHook(AbstractQuantHook): - """Quantization Hook for AWQ.""" - - @abstractmethod - def iter_inspect_modules( - self, - block: torch.nn.Module, - ) -> Iterator[ - Tuple[ - List[torch.nn.Module], - List[Tuple[ModuleName, torch.nn.Linear]], - torch.nn.Module, - ModuleName, - ] - ]: - """Returns iterator of modules to inspect for AWQ scale.""" - - @abstractmethod - def add_pre_scaler( - self, - model: torch.nn.Module, - ) -> torch.nn.Module: - """Add scaler for storing AWQ scale in modules.""" - - @abstractmethod - def get_inspect_module_types( - self, block: torch.nn.Module - ) -> Tuple[Type[torch.nn.Module], ...]: - """Returns the type of inspect modules in transformer block.""" - - def _register_pre_scaler( - self, - linear: torch.nn.Module, - ) -> AWQScaler: - """Register pre-scaler for storing AWQ scale in modules.""" - scaler = AWQScaler(linear.in_features) # type: ignore - - def pre_scaler_hook(_, x: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]: - return (scaler(x[0]),) - - linear.register_forward_pre_hook(pre_scaler_hook) - return scaler - - def get_quant_result( - self, - quant_inputs: TFQuantInputs, - **kwargs: Any, - ) -> TFQuantResults: - """Get quantization result for AWQ.""" - awq_config = cast(AWQConfig, self.quant_config) - - def get_scale( - quant_input: QuantInput, - ) -> WeightOnlyQuantResult: - weight, name, start, end = ( - quant_input.weight, - quant_input.name, - quant_input.start_offset, - quant_input.end_offset, - ) - weight = weight.to(awq_config.device) - - return get_weight_only_quant_scales( - layer_name=name, - w=weight[start:end], - q_bit=awq_config.awq_args.quant_bit, - q_group_size=awq_config.awq_args.quant_group_size, - ) - - return TFQuantResults( - layer_prefix_with_index=f"{self.quantized_layer_prefix}{quant_inputs.layer_index}.", - block=quant_inputs.block, - q=get_scale(quant_inputs.q), - k=get_scale(quant_inputs.k), - v=get_scale(quant_inputs.v), - attn_fc=get_scale(quant_inputs.attn_fc), - ff1=get_scale(quant_inputs.ff1), - ff2=get_scale(quant_inputs.ff2), - ) - - @property - def quant_dtype(self) -> ModelDataType: - """Return the quantization dtype.""" - quant_config = cast(AWQConfig, self.quant_config) - awq_args = quant_config.awq_args - if awq_args.quant_bit == 4: - return ModelDataType.INT4 - return ModelDataType.INT8 - - @property - @abstractmethod - def avoid_clipping_layer_names(self) -> List[str]: - """Return the layer names to avoid clipping.""" - - @property - @abstractmethod - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for modified layers.""" - - @property - def quantized_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for quantized layers.""" - convert_info_list = [] - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[ - f"{layer_prefix}q.weight_scale", - f"{layer_prefix}k.weight_scale", - f"{layer_prefix}v.weight_scale", - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/awq/scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}q.zeros", - f"{layer_prefix}k.zeros", - f"{layer_prefix}v.zeros", - ], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_attn/awq/zero:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}q.weight", - f"{layer_prefix}k.weight", - f"{layer_prefix}v.weight", - ], - data_type=self.quant_dtype, - converted_name=f"{converted_prefix}attn/c_attn/awq/weight:0", - reshape_fn=quantized_qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn_fc.weight_scale"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_proj/awq/scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn_fc.zeros"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}attn/c_proj/awq/zero:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn_fc.weight"], - data_type=self.quant_dtype, - converted_name=f"{converted_prefix}attn/c_proj/awq/weight:0", - reshape_fn=quantized_linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff1.weight_scale"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/awq/scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff1.zeros"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_fc/awq/zero:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff1.weight"], - data_type=self.quant_dtype, - converted_name=f"{converted_prefix}mlp/c_fc/awq/weight:0", - reshape_fn=quantized_linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff2.weight_scale"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/awq/scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff2.zeros"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_proj/awq/zero:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff2.weight"], - data_type=self.quant_dtype, - converted_name=f"{converted_prefix}mlp/c_proj/awq/weight:0", - reshape_fn=quantized_linear_weight_reshape, - ), - ] - ) - return convert_info_list - - -class AWQQuantizer(CommonQuantizer): - """Quantizer for AWQ.""" - - def check_config(self) -> None: - """Check if the AWQ quantization config is valid.""" - super().check_config() - quant_config = cast(AWQConfig, self.quant_config) - awq_args = quant_config.awq_args - if awq_args.quant_bit not in [4, 8]: - raise QuantizationError( - f"Invalid quant_bit {awq_args.quant_bit} for AWQ." - "You can only use 4 or 8 bit for AWQ." - ) - if awq_args.quant_group_size not in [64]: - raise QuantizationError( - f"Invalid quant_group_size {awq_args.quant_group_size} for AWQ." - "You can only use 64 for AWQ." 
- ) - - def get_calib_dataset(self) -> datasets.Dataset: - """Get calibration dataset for AWQ.""" - data_cfg = self.quant_config.calibration_dataset - tokenizer = get_tokenizer(self.converter.config.name_or_path) - dataset = safe_load_datasets(data_cfg) - - def preprocess(sample) -> Dict[str, Any]: - """Preprocess dataset for AWQ.""" - return {"input_ids": tokenizer(sample).input_ids} - - disable_progress_bar() - dataset = ( - dataset.shuffle(self.quant_config.seed) - .select(range(data_cfg.num_samples)) - .map(function=preprocess, input_columns=data_cfg.lookup_column_name) - .filter( - lambda sample: torch.tensor(sample).numel() != 0, - input_columns="input_ids", - ) - ) - - return dataset - - def get_batched_samples(self): - """Get batched samples from dataset.""" - dataset = self.get_calib_dataset() - seqlen = self.quant_config.calibration_dataset.max_length - samples = [] - for sample in dataset["input_ids"]: - samples.append(torch.tensor(sample[:seqlen])) - - batched_samples = torch.cat(samples) - if len(batched_samples) // seqlen == 0: - return batched_samples.unsqueeze(0) - - batched_samples = [ - batched_samples[i * seqlen : (i + 1) * seqlen].unsqueeze(0) - for i in range(len(batched_samples) // seqlen) - ] - batched_samples = torch.cat(batched_samples, dim=0) - return batched_samples - - def _apply_awq_scale_clip_block( - self, - block: torch.nn.Module, - block_args: Tuple[Any, ...], - block_kwargs: Dict[str, Any], - ) -> None: - """Search AWQ scale, clipping range and Apply them into a transformer block.""" - # pylint: disable=too-many-locals - - inpsected_mod_types = cast(AWQHook, self.hook).get_inspect_module_types(block) - args_dict, kwargs_dict = collect_inps( - block, - block_args, - block_kwargs, - self.quant_config.device, - tuple([*self.hook.get_linear_layer_types(), *inpsected_mod_types]), - ) - awq_args = cast(AWQConfig, self.quant_config).awq_args - for prev_ops, linear_tuples, module2inspect, module2inspect_name in cast( - AWQHook, self.hook - ).iter_inspect_modules(block): - linear_inp = args_dict[linear_tuples[0][0]][0] - linear_layers = [linear for _, linear in linear_tuples] - - scales = search_module_scale( - module2inspect, - args_dict[module2inspect_name], - kwargs_dict[module2inspect_name], - linear_layers, - linear_inp, - awq_args.quant_group_size, - awq_args.quant_bit, - ) - - apply_module_scale( - prev_ops, - linear_layers, - scales.to(self.quant_config.device), - ) - - for name, _ in linear_tuples: - assert len(args_dict[name]) == 1 - assert torch.equal(args_dict[name][0], linear_inp) - args_dict[name] = (args_dict[name][0].div(scales.view(1, -1)),) - - named_linears = { - name: m - for name, m in block.named_modules() - if isinstance(m, torch.nn.Linear) - } - for name, linear in named_linears.items(): - if any( - ( - avoid in name - for avoid in cast(AWQHook, self.hook).avoid_clipping_layer_names - ) - ): - continue - max_val = search_module_clip( - linear.weight, - args_dict[name][0], - awq_args.quant_group_size, - awq_args.quant_bit, - n_sample_token=self.quant_config.calibration_dataset.num_samples, - ) - apply_module_clip( - max_val.to(self.quant_config.device), - linear, - ) - - def get_input_kwargs_tf_blocks( - self, - model: torch.nn.Module, - ) -> Tuple[List[Tuple[Any, ...]], List[Dict[str, Any]]]: - """Gather input tensor and kwargs from the designated pytorch module.""" - block_args = [] - block_kwargs = [] - - num_tf_blocks = len(self.hook.get_tf_blocks(model)) - progress_bar = tqdm( - range(num_tf_blocks), - total=num_tf_blocks, - 
desc="Collect args for transformer blocks..", - ) - - def hook(m, args, kwargs): # pylint: disable=unused-argument - block_args.append( - tuple( - (t.detach().cpu() if isinstance(t, torch.Tensor) else t) - for t in args - ) - ) - block_kwargs.append( - { - k: (v.detach().cpu() if isinstance(v, torch.Tensor) else v) - for k, v in kwargs.items() - } - ) - progress_bar.update() - - removables = [] - for tf_block in self.hook.get_tf_blocks(model): - removables.append( - tf_block.register_forward_pre_hook(hook, with_kwargs=True) - ) - - batched_samples = self.get_batched_samples() - model(batched_samples.to(self.quant_config.device), use_cache=False) - - for removable in removables: - removable.remove() - - return block_args, block_kwargs - - def get_attributes(self) -> Dict[str, Any]: - """Return the attributes of the converted model.""" - attributes = self.converter.get_attributes() - awq_args = cast(AWQConfig, self.quant_config).awq_args - attributes["quant_scheme"] = self.quant_config.mode.value # awq - attributes["quant_group_size"] = awq_args.quant_group_size - attributes["quant_bit"] = awq_args.quant_bit - return attributes - - @torch.no_grad() - def _apply_awq_scale_clip( - self, - model: torch.nn.Module, - ) -> None: - """Search AWQ scale, clipping range and Apply them into model.""" - # pylint: disable=too-many-locals - model.eval() - with self._try_offload_model(model): - tf_blocks = self.hook.get_tf_blocks(model) - block_args, block_kwargs = self.get_input_kwargs_tf_blocks(model) - - gc.collect() - torch.cuda.empty_cache() - - for block, args, kwargs in tqdm( - zip( - tf_blocks, - block_args, - block_kwargs, - ), - total=len(tf_blocks), - desc="Search and Apply AWQ Scale, Clip range..", - ): - self._apply_awq_scale_clip_block(block, args, kwargs) - gc.collect() - torch.cuda.empty_cache() - - @torch.no_grad() - def pre_quantize( - self, - model: torch.nn.Module, - ) -> None: - """Pre-procedure that should be called before quantize() is called.""" - model = cast(AWQHook, self.hook).add_pre_scaler(model) - self._apply_awq_scale_clip(model) - - @torch.no_grad() - def quantize( - self, - model: torch.nn.Module, - ) -> torch.nn.Module: - """Quantize model with AWQ.""" - model.eval() - for quant_input in tqdm( - self.hook.iter_tf_quant_inputs(model), - total=len(self.hook.get_tf_blocks(model)), - desc="Quantize model..", - ): - assert isinstance(quant_input, TFQuantInputs) - quant_result = cast(AWQHook, self.hook).get_quant_result( - quant_input, quant_config=cast(AWQConfig, self.quant_config) - ) - for field in fields(quant_result): - layer_quant_result = getattr(quant_result, field.name) - if isinstance(layer_quant_result, WeightOnlyQuantResult): - layer = model.get_submodule(layer_quant_result.module_name) - q_layer = WeightOnlyQuantizedLinearLayer.from_layer( - layer, layer_quant_result - ) - quant_result.block.add_module(field.name, q_layer) - - return model diff --git a/friendli/modules/quantizer/awq/models/gpt_neox.py b/friendli/modules/quantizer/awq/models/gpt_neox.py deleted file mode 100644 index 8d48328a..00000000 --- a/friendli/modules/quantizer/awq/models/gpt_neox.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli GPTNeoXForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type - -import torch - -from friendli.enums import ModelDataType -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.quantizer.awq.base import AWQHook -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.utils import scale_reshape - - -class AWQGPTNeoXHook(AWQHook): - """AWQ Hook for GPTNeoXForCausalLM.""" - - def __init__(self, quant_config, converter): - """Initialize AWQGPTNeoXHook.""" - super().__init__(quant_config, converter) - config = converter.config - self.data_type = converter.data_type - self.num_attention_heads = config.num_attention_heads - self.num_kv_attention_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.num_attention_heads - self.rotary_dim = int(self.head_size * config.rotary_pct) - assert config.use_parallel_residual == True - - def add_pre_scaler(self, model: torch.nn.Module) -> torch.nn.Module: - """Adds scaler to GPTNeoXForCausalLM.""" - for tf_block in self.get_tf_blocks(model): - attn_fc_scaler = self._register_pre_scaler( - tf_block.attention.dense, - ) - tf_block.attention.add_module("scaler", attn_fc_scaler) - ff2_scaler = self._register_pre_scaler(tf_block.mlp.dense_4h_to_h) - tf_block.mlp.add_module("scaler", ff2_scaler) - return model - - def get_inspect_module_types( - self, block: torch.nn.Module - ) -> Tuple[Type[torch.nn.Module], ...]: - """Returns the type of linear layer (etc. qkv, linear layer) in transformer block.""" - return (type(block.attention), type(block.mlp)) - - def iter_inspect_modules( - self, - block: torch.nn.Module, - ) -> Iterator[ - Tuple[ - List[torch.nn.Module], - List[Tuple[ModuleName, torch.nn.Linear]], - torch.nn.Module, - ModuleName, - ] - ]: - """Returns iterator of layers in modules.""" - # qkv proj - yield ( - [block.input_layernorm], - [("attention.query_key_value", block.attention.query_key_value)], - block.attention, - "attention", - ) - # attn out proj - yield ( - [block.attention.scaler], - [("attention.dense", block.attention.dense)], - block.attention.dense, - "attention.dense", - ) - # ff1 - yield ( - [block.post_attention_layernorm], - [("mlp.dense_h_to_4h", block.mlp.dense_h_to_4h)], - block.mlp, - "mlp", - ) - # ff2 - yield ( - [block.mlp.scaler], - [("mlp.dense_4h_to_h", block.mlp.dense_4h_to_h)], - block.mlp.dense_4h_to_h, - "mlp.dense_4h_to_h", - ) - - def iter_tf_quant_inputs(self, model: torch.nn.Module) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of GPTNeoXForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - qkv_weight = self.converter.qkv_weight_reshape( - [decoder_layer.attention.query_key_value.weight] - ).transpose( - 0, 1 - ) # [OutDim, InDim] - attn_weight_outdim = qkv_weight.size(0) # OutDim - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.attention.query_key_value", - 0, - attn_weight_outdim // 3, - ), - k=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.attention.query_key_value", - attn_weight_outdim // 3, - attn_weight_outdim // 3 * 2, - ), - v=QuantInput( - qkv_weight, - 
f"{self.quantized_layer_prefix}{index}.attention.query_key_value", - attn_weight_outdim // 3 * 2, - attn_weight_outdim, - ), - attn_fc=QuantInput( - decoder_layer.attention.dense.weight, - f"{self.quantized_layer_prefix}{index}.attention.dense", - None, - None, - ), - ff1=QuantInput( - decoder_layer.mlp.dense_h_to_4h.weight, - f"{self.quantized_layer_prefix}{index}.mlp.dense_h_to_4h", - None, - None, - ), - ff2=QuantInput( - decoder_layer.mlp.dense_4h_to_h.weight, - f"{self.quantized_layer_prefix}{index}.mlp.dense_4h_to_h", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in GPTNeoXForCausalLM.""" - return (torch.nn.Linear,) - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in GPTNeoXForCausalLM.""" - return model.gpt_neox.layers # type: ignore - - @property - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for modified layers.""" - convert_info_list = [] - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}attention.scaler.scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_proj/awq/pre_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}mlp.scaler.scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_proj/awq/pre_scale:0", - reshape_fn=scale_reshape, - ), - ] - ) - return convert_info_list - - @property - def avoid_clipping_layer_names(self) -> List[str]: - """Returns the layer names which should be avoided for AWQ clipping.""" - return ["query_key_value"] diff --git a/friendli/modules/quantizer/awq/models/gptj.py b/friendli/modules/quantizer/awq/models/gptj.py deleted file mode 100644 index da2e81dc..00000000 --- a/friendli/modules/quantizer/awq/models/gptj.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli GPTJForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type - -import torch - -from friendli.enums import ModelDataType -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.quantizer.awq.base import AWQHook -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.utils import scale_reshape - - -class AWQGPTJHook(AWQHook): - """AWQ Hook for GPTJForCausalLM.""" - - def __init__(self, quant_config, converter): - """Initialize AWQGPTJHook.""" - super().__init__(quant_config, converter) - config = converter.config - self.data_type = converter.data_type - self.num_attention_heads = config.num_attention_heads - self.num_kv_attention_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.num_attention_heads - self.rotary_dim = config.rotary_dim - - def add_pre_scaler(self, model: torch.nn.Module) -> torch.nn.Module: - """Adds scaler to GPTJForCausalLM.""" - for tf_block in self.get_tf_blocks(model): - ff2_scaler = self._register_pre_scaler(tf_block.mlp.fc_out) - tf_block.mlp.add_module("ff2_scaler", ff2_scaler) - return model - - def get_inspect_module_types( - self, block: torch.nn.Module - ) -> Tuple[Type[torch.nn.Module], ...]: - """Returns the type of linear layer (etc. qkv, linear layer) in transformer block.""" - return (type(block.attn), type(block.mlp), type(block)) - - def iter_inspect_modules( - self, - block: torch.nn.Module, - ) -> Iterator[ - Tuple[ - List[torch.nn.Module], - List[Tuple[ModuleName, torch.nn.Linear]], - torch.nn.Module, - ModuleName, - ] - ]: - """Returns iterator of layers in modules.""" - # qkv proj - yield ( - [block.ln_1], - [ - ("attn.q_proj", block.attn.q_proj), - ("attn.k_proj", block.attn.k_proj), - ("attn.v_proj", block.attn.v_proj), - ("mlp.fc_in", block.mlp.fc_in), - ], - block, - "", - ) - # attn out proj - yield ( - [block.attn.v_proj], - [("attn.out_proj", block.attn.out_proj)], - block.attn.out_proj, - "attn.out_proj", - ) - # ff2 - yield ( - [block.mlp.ff2_scaler], - [("mlp.fc_out", block.mlp.fc_out)], - block.mlp.fc_out, - "mlp.fc_out", - ) - - def iter_tf_quant_inputs(self, model: torch.nn.Module) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of GPTJForCausalLM.""" - for index, tf_block in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - yield TFQuantInputs( - layer_index=index, - block=tf_block, - q=QuantInput( - tf_block.attn.q_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.q_proj", - None, - None, - ), - k=QuantInput( - tf_block.attn.k_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.k_proj", - None, - None, - ), - v=QuantInput( - tf_block.attn.v_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.v_proj", - None, - None, - ), - attn_fc=QuantInput( - tf_block.attn.out_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - None, - None, - ), - ff1=QuantInput( - tf_block.mlp.fc_in.weight, - f"{self.quantized_layer_prefix}{index}.mlp.fc_in", - None, - None, - ), - ff2=QuantInput( - tf_block.mlp.fc_out.weight, - f"{self.quantized_layer_prefix}{index}.mlp.fc_out", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in 
GPTJForCausalLM.""" - return (torch.nn.Linear,) - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in GPTJForCausalLM.""" - return model.transformer.h # type: ignore - - @property - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for modified layers.""" - convert_info_list = [] - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.append( - ConvertInfo( - param_names=[f"{layer_prefix}mlp.ff2_scaler.scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_proj/awq/pre_scale:0", - reshape_fn=scale_reshape, - ) - ) - return convert_info_list - - @property - def avoid_clipping_layer_names(self) -> List[str]: - """Returns the layer names which should be avoided for AWQ clipping.""" - return ["q_proj", "k_proj"] diff --git a/friendli/modules/quantizer/awq/models/llama.py b/friendli/modules/quantizer/awq/models/llama.py deleted file mode 100644 index f59bc0cf..00000000 --- a/friendli/modules/quantizer/awq/models/llama.py +++ /dev/null @@ -1,301 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli LlamaForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any, Iterator, List, Tuple, Type, cast - -import torch - -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.quantizer.awq.base import AWQHook -from friendli.modules.quantizer.schema.config import AWQConfig -from friendli.modules.quantizer.schema.data import ( - ModuleName, - QuantInput, - TFQuantInputs, - TFQuantResults, - WeightOnlyQuantResult, -) -from friendli.modules.quantizer.utils import ( - get_weight_only_quant_scales, - quantized_linear_weight_reshape, - scale_reshape, -) - - -@dataclass -class LlamaTFQuantInputs(TFQuantInputs): - """Dataclass for quantization input per layer in LlamaForCausalLM.""" - - ff_gate: QuantInput - - -@dataclass -class LlamaTFQuantResults(TFQuantResults): - """Dataclass for quantization result per layer in LlamaForCausalLM.""" - - ff_gate: WeightOnlyQuantResult - - -class AWQLlamaHook(AWQHook): - """AWQ Hook for LlamaForCausalLM.""" - - def __init__(self, quant_config, converter): - """Initialize AWQLlamaHook.""" - super().__init__(quant_config, converter) - config = converter.config - self.data_type = converter.data_type - self.num_attention_heads = config.num_attention_heads - if config.num_key_value_heads is None: - self.num_kv_attention_heads = self.num_attention_heads - else: - self.num_kv_attention_heads = config.num_key_value_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.num_attention_heads - self.rotary_dim = self.head_size - self.scale_attn_fc = self.num_attention_heads == self.num_kv_attention_heads - - def add_pre_scaler(self, model: torch.nn.Module) -> torch.nn.Module: - """Adds scaler to LlamaForCausalLM.""" - return model - - def get_inspect_module_types( - self, block: torch.nn.Module - ) -> Tuple[type[torch.nn.Module], ...]: - """Returns the layer types in inspected blocks.""" - return (type(block.self_attn), type(block.mlp)) - - def iter_inspect_modules( - self, - block: torch.nn.Module, - ) -> Iterator[ - Tuple[ - List[torch.nn.Module], - 
List[Tuple[ModuleName, torch.nn.Linear]], - torch.nn.Module, - ModuleName, - ] - ]: - """Returns iterator of layers in blocks.""" - # qkv proj - yield ( - [block.input_layernorm], - [ - ("self_attn.q_proj", block.self_attn.q_proj), - ("self_attn.k_proj", block.self_attn.k_proj), - ("self_attn.v_proj", block.self_attn.v_proj), - ], - block.self_attn, - "self_attn", - ) - # attn out proj - if self.scale_attn_fc: - yield ( - [block.self_attn.v_proj], - [("self_attn.o_proj", block.self_attn.o_proj)], - block.self_attn.o_proj, - "self_attn.o_proj", - ) - # ff1 - yield ( - [block.post_attention_layernorm], - [ - ("mlp.up_proj", block.mlp.up_proj), - ("mlp.gate_proj", block.mlp.gate_proj), - ], - block.mlp, - "mlp", - ) - # ff2 - yield ( - [block.mlp.up_proj], - [("mlp.down_proj", block.mlp.down_proj)], - block.mlp.down_proj, - "mlp.down_proj", - ) - - def iter_tf_quant_inputs(self, model: torch.nn.Module) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of LlamaForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.self_attn - q_weight, k_weight, v_weight = ( - self.converter.qkv_weight_reshape( - [ - self_attn.q_proj.weight, - self_attn.k_proj.weight, - self_attn.v_proj.weight, - ] - ) - .transpose(0, 1) - .split( - [ - self.converter.decoder_num_attention_heads - * self.converter.decoder_head_size, - self.converter.decoder_num_kv_attention_heads - * self.converter.decoder_head_size, - self.converter.decoder_num_kv_attention_heads - * self.converter.decoder_head_size, - ], - dim=0, - ) - ) - fc1 = decoder_layer.mlp.up_proj - ff_gate = decoder_layer.mlp.gate_proj - fc2 = decoder_layer.mlp.down_proj - - yield LlamaTFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - q_weight, - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", - None, - None, - ), - k=QuantInput( - k_weight, - f"{self.quantized_layer_prefix}{index}.self_attn.k_proj", - None, - None, - ), - v=QuantInput( - v_weight, - f"{self.quantized_layer_prefix}{index}.self_attn.v_proj", - None, - None, - ), - attn_fc=QuantInput( - self_attn.o_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.mlp.up_proj", - None, - None, - ), - ff_gate=QuantInput( - ff_gate.weight, - f"{self.quantized_layer_prefix}{index}.mlp.gate_proj", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.mlp.down_proj", - None, - None, - ), - ) - - def get_quant_result( - self, - quant_input: TFQuantInputs, - **kwargs: Any, - ) -> TFQuantResults: - """Get quantization result for a specific layer in LlamaForCausalLM.""" - awq_config = cast(AWQConfig, self.quant_config) - - def get_scale(quant_input: QuantInput) -> WeightOnlyQuantResult: - weight, name, start, end = ( - quant_input.weight, - quant_input.name, - quant_input.start_offset, - quant_input.end_offset, - ) - weight = weight.to(awq_config.device) - - return get_weight_only_quant_scales( - layer_name=name, - w=weight[start:end], - q_bit=awq_config.awq_args.quant_bit, - q_group_size=awq_config.awq_args.quant_group_size, - ) - - quant_input = cast(LlamaTFQuantInputs, quant_input) - return LlamaTFQuantResults( - layer_prefix_with_index=f"{self.quantized_layer_prefix}{quant_input.layer_index}.", - block=quant_input.block, - q=get_scale(quant_input.q), - k=get_scale(quant_input.k), - 
v=get_scale(quant_input.v), - attn_fc=get_scale(quant_input.attn_fc), - ff1=get_scale(quant_input.ff1), - ff_gate=get_scale(quant_input.ff_gate), - ff2=get_scale(quant_input.ff2), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in LlamaForCausalLM.""" - return (torch.nn.Linear,) - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in LlamaForCausalLM.""" - return model.model.layers - - @property - def quantized_param_names(self) -> List[str]: - """Returns the parameter names in LlamaForCausalLM.""" - param_names = super().quantized_param_names - for i in range(self.converter.decoder_layer_num): - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - param_names.append( - f"{converted_prefix}mlp/c_gate/weight:0", - ) - return param_names - - @property - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for modified layers.""" - return [] - - @property - def avoid_clipping_layer_names(self) -> List[str]: - """Returns the layer names which should be avoided for AWQ clipping.""" - return ["q_proj", "k_proj"] - - @property - def quantized_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the convert_info_list for quantized layers.""" - convert_info_list = super().quantized_convert_info_list - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}ff_gate.weight_scale"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_gate/awq/scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff_gate.zeros"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}mlp/c_gate/awq/zero:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff_gate.weight"], - data_type=self.quant_dtype, - converted_name=f"{converted_prefix}mlp/c_gate/awq/weight:0", - reshape_fn=quantized_linear_weight_reshape, - ), - ] - ) - return convert_info_list diff --git a/friendli/modules/quantizer/awq/models/mpt.py b/friendli/modules/quantizer/awq/models/mpt.py deleted file mode 100644 index 6c60ca58..00000000 --- a/friendli/modules/quantizer/awq/models/mpt.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli MPTForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type - -import torch - -from friendli.enums import ModelDataType -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.quantizer.awq.base import AWQHook -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.utils import scale_reshape - - -class AWQMPTHook(AWQHook): - """AWQ Hook for MPTForCausalLM.""" - - def add_pre_scaler(self, model: torch.nn.Module) -> torch.nn.Module: - """Adds scaler to MPTForCausalLM.""" - for tf_block in self.get_tf_blocks(model): - attn_fc_scaler = self._register_pre_scaler( - tf_block.attn.out_proj, - ) - tf_block.attn.add_module("scaler", attn_fc_scaler) - ff2_scaler = self._register_pre_scaler(tf_block.ffn.down_proj) - tf_block.ffn.add_module("scaler", ff2_scaler) - return model - - def get_inspect_module_types( - self, block: torch.nn.Module - ) -> Tuple[Type[torch.nn.Module], ...]: - """Returns the type of linear layer (etc. qkv, linear layer) in transformer block.""" - return (type(block.attn), type(block.ffn)) - - def iter_inspect_modules( - self, - block: torch.nn.Module, - ) -> Iterator[ - Tuple[ - List[torch.nn.Module], - List[Tuple[ModuleName, torch.nn.Linear]], - torch.nn.Module, - ModuleName, - ] - ]: - """Returns iterator of layers in modules.""" - # qkv proj - yield ( - [block.norm_1], - [("attn.Wqkv", block.attn.Wqkv)], - block.attn, - "attn", - ) - # attn out proj - yield ( - [block.attn.scaler], - [("attn.out_proj", block.attn.out_proj)], - block.attn.out_proj, - "attn.out_proj", - ) - # ff1 - yield ( - [block.norm_2], - [("ffn.up_proj", block.ffn.up_proj)], - block.ffn, - "ffn", - ) - # ff2 - yield ( - [block.ffn.scaler], - [("ffn.down_proj", block.ffn.down_proj)], - block.ffn.down_proj, - "ffn.down_proj", - ) - - def iter_tf_quant_inputs(self, model: torch.nn.Module) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of MPTForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.attn - q_outdim = ( - self.converter.decoder_num_attention_heads - * self.converter.decoder_head_size - ) - kv_outdim = ( - self.converter.decoder_num_kv_attention_heads - * self.converter.decoder_head_size - ) - qkv_outdim = self_attn.Wqkv.weight.size(0) - assert qkv_outdim == q_outdim + kv_outdim * 2 - fc1 = decoder_layer.ffn.up_proj # type: ignore - fc2 = decoder_layer.ffn.down_proj # type: ignore - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - self_attn.Wqkv.weight, # type: ignore - f"{self.quantized_layer_prefix}{index}.attn.Wqkv", - 0, - q_outdim, - ), - k=QuantInput( - self_attn.Wqkv.weight, # type: ignore - f"{self.quantized_layer_prefix}{index}.attn.Wqkv", - q_outdim, - q_outdim + kv_outdim, - ), - v=QuantInput( - self_attn.Wqkv.weight, # type: ignore - f"{self.quantized_layer_prefix}{index}.attn.Wqkv", - q_outdim + kv_outdim, - qkv_outdim, - ), - attn_fc=QuantInput( - self_attn.out_proj.weight, # type: ignore - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - None, - None, - ), - ff1=QuantInput( - fc1.weight, # type: ignore - f"{self.quantized_layer_prefix}{index}.ffn.up_proj", - None, - None, - ), - ff2=QuantInput( - fc2.weight, # type: ignore - 
f"{self.quantized_layer_prefix}{index}.ffn.down_proj", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in MPTForCausalLM.""" - return (torch.nn.Linear,) - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in MPTForCausalLM.""" - return model.transformer.blocks # type: ignore - - @property - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for modified layers.""" - convert_info_list = [] - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}attn.scaler.scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_proj/awq/pre_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ffn.scaler.scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_proj/awq/pre_scale:0", - reshape_fn=scale_reshape, - ), - ] - ) - return convert_info_list - - @property - def avoid_clipping_layer_names(self) -> List[str]: - """Returns the layer names which should be avoided for AWQ clipping.""" - return ["Wqkv"] diff --git a/friendli/modules/quantizer/awq/utils.py b/friendli/modules/quantizer/awq/utils.py deleted file mode 100644 index c6efdec4..00000000 --- a/friendli/modules/quantizer/awq/utils.py +++ /dev/null @@ -1,226 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -# Copyright (c) 2023 MIT HAN Lab -# MIT License - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -"""Friendli AWQ Quantizer Util.""" - -from __future__ import annotations - -import gc -from typing import Any, Dict, Iterable, List, Tuple - -import torch - - -def pseudo_quantize_tensor(w: torch.Tensor, q_bit: int = 8, q_group_size: int = -1): - """Pseudo quantize tensor.""" - org_w_shape = w.shape - w = w.reshape(-1, q_group_size) - max_val = w.amax(dim=1, keepdim=True) - min_val = w.amin(dim=1, keepdim=True) - max_int = 2**q_bit - 1 - min_int = 0 - scales = (max_val - min_val).clamp(min=1e-5) / max_int - zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int) - - assert torch.isnan(scales).sum() == 0 - assert torch.isnan(w).sum() == 0 - - w = ( - torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros - ) * scales - assert torch.isnan(w).sum() == 0 - - w = w.reshape(org_w_shape) - - return w - - -@torch.no_grad() -def get_weight_scale(weight: torch.Tensor, q_group_size=-1): - """Get weight scale for AWQ.""" - org_shape = weight.shape - if q_group_size > 0: - weight = weight.view(-1, q_group_size) - scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True) - scale = scale.view(org_shape) - scale = scale.mean(0) - return scale - - -@torch.no_grad() -def get_act_scale(x): - """Get activation scale for AWQ.""" - return x.abs().view(-1, x.shape[-1]).mean(0) - - -def search_module_scale( - module: torch.nn.Module, - module_args: Tuple[Any, ...], - module_kwargs: Dict[str, Any], - linears2scale: Iterable[torch.nn.Linear], - linear_inp: torch.Tensor, - q_group_size: int, - q_bit: int, -) -> torch.Tensor: - """Search the AWQ scale for a module.""" - # pylint: disable=too-many-locals - weight = torch.cat([_m.weight for _m in linears2scale], dim=0) # type: ignore - with torch.no_grad(): - org_out = module(*module_args, **module_kwargs) - if isinstance(org_out, tuple): - org_out = org_out[0] - - x_max = get_act_scale(linear_inp) - w_max = get_weight_scale(weight, q_group_size) - del weight - gc.collect() # type: ignore - torch.cuda.empty_cache() - - best_error = float("inf") - best_scales = torch.zeros(x_max.shape[0], device=x_max.device) - n_grid = 20 - history = [] - org_sd = {k: v.to("cpu", copy=True) for k, v in module.state_dict().items()} - for grid in range(n_grid): - ratio = grid * 1.0 / n_grid - scales = (x_max.pow(ratio) / w_max.pow(1 - ratio)).clamp(min=1e-4).view(-1) - scales = scales / (scales.max() * scales.min()).sqrt() - for fc in linears2scale: - fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) # type: ignore - fc.weight.data = pseudo_quantize_tensor( - w=fc.weight.data, # type: ignore - q_bit=q_bit, - q_group_size=q_group_size, - ) / (scales.view(1, -1)) - - out = module(*module_args, **module_kwargs) - if isinstance(out, tuple): - out = out[0] - - loss = (org_out - out).float().pow(2).mean().item() # float prevents overflow - history.append(loss) - is_best = loss < best_error - if is_best: - best_error = loss - best_scales = scales - module.load_state_dict(org_sd) - best_scales = best_scales.view(-1) - - assert torch.isnan(best_scales).sum() == 0, best_scales - return best_scales.detach() - - -def apply_module_scale( - prev_ops: List[torch.nn.Module], - linear_layers: Iterable[torch.nn.Linear], - scales: torch.Tensor, -) -> None: - """Apply AWQ Scale for Module, and return the scaled input for Clipping.""" - for prev_op in prev_ops: - for _, param in prev_op.named_parameters(recurse=False): - if isinstance(prev_op, torch.nn.Linear): - # TODO: handle bias - assert len(param.data.shape) == 2 - param.data.div_(scales.view(-1, 1)) - else: - 
assert param.data.shape == scales.shape - param.data.div_(scales) - - for layer in linear_layers: - layer.weight.data.mul_(scales.view(1, -1)) - - -def search_module_clip( - w: torch.Tensor, - inp: torch.Tensor, - q_group_size: int, - q_bit: int, - n_grid=20, - max_shrink=0.5, - n_sample_token=512, -) -> torch.Tensor: - """Search the best clip for a module.""" - # pylint: disable=too-many-locals - # w [co, ci] -> [co, 1, n_group, group size] - # inp [n_token, ci] -> [1, n_token, n_group, group size] - w = w.view(w.shape[0], 1, -1, q_group_size) - - inp = inp.view(-1, inp.shape[-1]) - inp = inp.reshape(1, inp.shape[0], -1, q_group_size) - inp = inp[:, 0 :: inp.shape[1] // n_sample_token] - - oc_batch_size = 256 if w.shape[0] % 256 == 0 else 64 # prevent OOM - assert w.shape[0] % oc_batch_size == 0 - w_all = w - best_max_val_all = [] - - for i_b in range(w.shape[0] // oc_batch_size): - w = w_all[i_b * oc_batch_size : (i_b + 1) * oc_batch_size] - - org_max_val = w.abs().amax(dim=-1, keepdim=True) # co, 1, n_group, 1 - - best_max_val = org_max_val.clone() - min_errs = torch.ones_like(org_max_val) * 1e9 - inp = inp.to(w.device) - org_out = (inp * w).sum(dim=-1) # co, n_token, n_group - - for i_s in range(int(max_shrink * n_grid)): - max_val = org_max_val * (1 - i_s / n_grid) - min_val = -max_val - cur_w = torch.clamp(w, min_val, max_val) - q_w = pseudo_quantize_tensor( - w=cur_w, - q_bit=q_bit, - q_group_size=q_group_size, - ) - cur_out = (inp * q_w).sum(dim=-1) - - # co, 1, n_group, 1 - err = (cur_out - org_out).pow(2).mean(dim=1).view(min_errs.shape) - del cur_w - del cur_out - cur_best_idx = err < min_errs - min_errs[cur_best_idx] = err[cur_best_idx] - best_max_val[cur_best_idx] = max_val[cur_best_idx] - best_max_val_all.append(best_max_val) - - best_max_val = torch.cat(best_max_val_all, dim=0) - - del inp - del org_out - gc.collect() - torch.cuda.empty_cache() - - return best_max_val.squeeze(1) - - -def apply_module_clip( - max_val: torch.Tensor, - layer: torch.nn.Linear, -): - """Apply AWQ Clip for Module.""" - max_val = max_val.to(layer.weight.device) # type: ignore - org_shape = layer.weight.shape - layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) # type: ignore - layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) - layer.weight.data = layer.weight.data.reshape(org_shape) # type: ignore diff --git a/friendli/modules/quantizer/base.py b/friendli/modules/quantizer/base.py deleted file mode 100644 index ea97e092..00000000 --- a/friendli/modules/quantizer/base.py +++ /dev/null @@ -1,507 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Quantizer Base.""" - -from __future__ import annotations - -import os -from abc import ABC, abstractmethod -from collections.abc import Generator -from contextlib import contextmanager -from typing import Any, Dict, Iterator, List, Tuple, Type, Union, cast - -import datasets # type: ignore[import] -import huggingface_hub # type: ignore[import] -import numpy as np -import torch -from torch.nn.modules import Module -from tqdm import tqdm - -from friendli.enums import ( - QuantDatasetFormat, # TODO: move this to friendli/modules/converter/enums.py -) -from friendli.enums import ModelDataType -from friendli.errors import NotSupportedQuantConfigError -from friendli.logging import logger -from friendli.modules.converter.base import DECODER_PREFIX, OneOfConverter -from friendli.modules.converter.interface import ModelConversionInterface -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import get_tokenizer, get_torch_data_type -from friendli.modules.quantizer.layers import WeightActQuantizedLinearLayer -from friendli.modules.quantizer.schema.config import OneOfQuantConfig -from friendli.modules.quantizer.schema.data import ( - HFTFQuantInputs, - ModuleName, - TFQuantInputs, - TFQuantResults, - WeightActQuantResult, -) -from friendli.modules.quantizer.utils import ( - collect_stats, - offload_module_sequence, - safe_load_datasets, - send_model_to_device, -) - - -class AbstractQuantHook(ABC): - """Quantization Hook for a specific model architecture.""" - - def __init__(self, quant_config: Dict[str, Any], converter: OneOfConverter): - """Initialize the Quantization Hook. - - Args: - quant_config: Quantization configuration. - converter (OneOfConverter): Converter for a specific model architecture. - """ - self.quant_config = quant_config - self.converter = converter - - @abstractmethod - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks.""" - - @abstractmethod - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the type of linear layer (etc. 
qkv, linear layer) in transformer block.""" - - @abstractmethod - def iter_tf_quant_inputs( - self, model: torch.nn.Module - ) -> Union[Iterator[TFQuantInputs], Iterator[HFTFQuantInputs]]: - """Returns the layers which should be quantized in transformer blocks.""" - - @abstractmethod - def get_quant_result( - self, - quant_inputs: TFQuantInputs, - **kwargs: Any, - ) -> TFQuantResults: - """Returns the quantization result of the layer.""" - - @property - @abstractmethod - def quantized_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for quantized layers.""" - - @property - @abstractmethod - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for modified layers.""" - - @property - def quantized_layer_prefix(self) -> str: - """Returns the prefix of the transformer block name.""" - return self.converter.decoder_layer_prefix - - @property - def quantized_param_names(self) -> List[str]: - """Return the parameter names of quantized layers.""" - param_names = [] - for i in range(self.converter.decoder_layer_num): - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - param_names.append(f"{converted_prefix}attn/c_attn/weight:0") - param_names.append(f"{converted_prefix}attn/c_proj/weight:0") - param_names.append(f"{converted_prefix}mlp/c_fc/weight:0") - param_names.append(f"{converted_prefix}mlp/c_proj/weight:0") - - return param_names - - -class AbstractQuantizer(ABC): - """Abstract Quantizer for a specific model architecture.""" - - def __init__( - self, - hook: AbstractQuantHook, - config: OneOfQuantConfig, - converter: OneOfConverter, - ): - """Initialize the Quantizer. - - Args: - hook (AbstractQuantHook): Quantization Hook for a specific model architecture - config (CommonQuantConfig): Quantization configuration. - converter (OneOfConverter): Converter for a specific model architecture. 
- - """ - self.hook = hook - self.quant_config = config - self.converter = converter - - @abstractmethod - def get_calib_dataset( - self, - ) -> datasets.Dataset: - """Get calibration dataset.""" - - @abstractmethod - def pre_quantize( - self, - model: torch.nn.Module, - ) -> None: - """Pre-procedure that should be called before quantize() is called.""" - - @abstractmethod - def quantize( - self, - model: torch.nn.Module, - ) -> torch.nn.Module: - """Setting Quantizer from config and Quantize model.""" - - -class CommonQuantizer(AbstractQuantizer, ModelConversionInterface): - """Common Quantizer.""" - - def check_config(self) -> None: - """Check if the quantization config is valid.""" - self.converter.check_config() - calibration_dataset_config = self.quant_config.calibration_dataset - data_path_or_name = calibration_dataset_config.path_or_name - percentile = self.quant_config.percentile - if percentile <= 0 or percentile > 100: - raise NotSupportedQuantConfigError( - invalid_option=str(percentile), - valid_options=["0 < percentile <= 100"], - ) - if not os.path.exists(data_path_or_name): - data_name = data_path_or_name.split(":")[0] - if data_name not in ( - data.id for data in huggingface_hub.list_datasets(search=data_name) - ): - raise NotSupportedQuantConfigError( - invalid_option=data_name, - valid_options=["datasets on the huggingface hub", "local path"], - ) - else: - if calibration_dataset_config.format not in QuantDatasetFormat: - raise NotSupportedQuantConfigError( - invalid_option=calibration_dataset_config.format, - valid_options=list(QuantDatasetFormat), - ) - try: - torch.device(self.quant_config.device) - except ValueError as err: - raise NotSupportedQuantConfigError( - invalid_option=self.quant_config.device, - valid_options=["cpu", "cuda"], - ) from err - - def get_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Get List of the convert informations for the model.""" - convert_info_list = self.converter.get_convert_info_list() - new_convert_info_list = [] - for convert_info in convert_info_list: - if convert_info.converted_name in self.hook.quantized_param_names: - continue - new_convert_info_list.append(convert_info) - - return ( - new_convert_info_list - + self.hook.quantized_convert_info_list - + self.hook.modified_layers_convert_info_list - ) - - def get_attributes(self) -> Dict[str, Any]: - """Return the attributes of the converted model.""" - return self.converter.get_attributes() - - @contextmanager - def _try_offload_model(self, model: torch.nn.Module): - if not self.quant_config.offload: - logger.info("Offloading not enabled. Skipping.") - model.to(self.quant_config.device) - yield - else: - logger.info("Offloading enabled.") - tf_blocks = self.hook.get_tf_blocks(model) - send_model_to_device(model, self.quant_config.device, exclude=tf_blocks) - with offload_module_sequence(tf_blocks, self.quant_config.device): - yield - - def convert( - self, - model: torch.nn.Module, - convert_info_list: List[ConvertInfo], - save_numpy_format: bool = True, - ) -> Generator[Tuple[str, Union[np.ndarray, torch.Tensor]], None, None]: - """Convert Huggingface Model to Friendli format(.h5). - - Args: - model (torch.nn.Module): Huggingface model. - state_dict (Dict[str, torch.Tensor]): - Dictionary of mapping of tensor name to tensor - convert_info_list (List[ConvertInfo]): - Dictionary of mapping converted params name to conversion functions. - save_numpy_format (bool, optional): Save the converted tensor in numpy format. - Defaults to True. 
- """ - self.pre_quantize(model) - model = self.quantize(model) - yield from self.converter.convert(model, convert_info_list, save_numpy_format) - - -class FP8QuantHook(AbstractQuantHook): - """Quantization Hook for FP8Quantizer.""" - - def pre_quantize(self, model: Module) -> torch.nn.Module: # type: ignore[] - """Pre-procedure that should be called before quantize() is called in FP8Quantizer.""" - return model - - def post_quantize(self, model: Module) -> torch.nn.Module: - """Post-procedure that should be called after quantize() is called in FP8Quantizer.""" - return model - - def get_quant_result( - self, quant_inputs: TFQuantInputs, **kwargs: Any - ) -> TFQuantResults: - """Returns the quantization result of the layer.""" - raise NotImplementedError - - def get_quantized_param_names(self, model: torch.nn.Module) -> List[str]: - """Return the parameter names of quantized layers.""" - quantized_param_names = [] - for tf_quant_input in self.iter_tf_quant_inputs(model): - assert isinstance(tf_quant_input, HFTFQuantInputs) - for quant_input in tf_quant_input.quant_inputs: - for target_name in quant_input.target_names: - quantized_param_names.append(f"{target_name}.weight") - return quantized_param_names - - def get_quantized_param_scale_names(self, model): - """Return the parameter scale names of quantized layers.""" - quantized_param_scale_names = [] - for tf_quant_input in self.iter_tf_quant_inputs(model): - assert isinstance(tf_quant_input, HFTFQuantInputs) - for quant_input in tf_quant_input.quant_inputs: - for target_name in quant_input.target_names: - quantized_param_scale_names.append(f"{target_name}.weight_scale") - quantized_param_scale_names.append(f"{target_name}.in_scale") - return quantized_param_scale_names - - @property - def quantized_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for quantized layers.""" - raise NotImplementedError - - @property - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for modified layers.""" - raise NotImplementedError - - -class FP8Quantizer(CommonQuantizer): - """FP8Quantizer for huggingface format. - - This quantizer supports per-tensor weight-activation quantization by - using calibration dataset. It adds quantization scale, and quantized - parameter to the checkpoint, while preserves parameter shape, and name - in huggingface checkpoint. 
- """ - - def get_calib_dataset(self) -> datasets.Dataset: - """Get calibration dataset.""" - data_cfg = self.quant_config.calibration_dataset - tokenizer = get_tokenizer(self.converter.config.name_or_path) - dataset = safe_load_datasets(data_cfg) - - dataset = ( - dataset.shuffle(self.quant_config.seed) - .select(range(data_cfg.num_samples)) - .select_columns([data_cfg.lookup_column_name]) - ) - - encoded_dataset = tokenizer( - dataset[data_cfg.lookup_column_name], - return_tensors="pt", - padding=True, - truncation=True, - max_length=data_cfg.max_length, - ) - return encoded_dataset["input_ids"] - - def get_convert_info_list(self) -> List[ConvertInfo]: - """Not used in FP8Quantizer.""" - return [] - - def pre_quantize(self, model: Module) -> None: - """Not used in FP8Quantizer.""" - return None - - def _get_weight_act_quantize_results( - self, - model: torch.nn.Module, - names: List[ModuleName], - max_input_stats: Dict[ModuleName, torch.Tensor], - ) -> List[WeightActQuantResult]: - """Get the quantization scales and quantized_weight for a specific layer.""" - assert ( - self.quant_config.quant_dtype == ModelDataType.FP8_E4M3 - ), "currently support fp8_e4m3" - max_val = 448.0 - min_val = -448.0 - input_max = None - for name in names: - input_max = max_input_stats.get(name) - if input_max is not None: - break - assert input_max is not None - target_weights = [model.get_submodule(name).weight for name in names] - target_weight = torch.concat(target_weights) - - act_scale = float(input_max.detach().abs().max().item()) / float(max_val) - weight_scale = float(target_weight.detach().abs().max().item()) / float(max_val) - - q_weights = [ - ( - (weight.detach().float() / weight_scale) - .clip(min_val, max_val) - .to(torch.float8_e4m3fn) - .view(torch.int8) - .to("cpu") - ) - for weight in target_weights - ] - return [ - WeightActQuantResult( - name, - quant_dtype=self.quant_config.quant_dtype, - act_scale=torch.tensor(act_scale, dtype=torch.float32), - weight_scale=torch.tensor(weight_scale, dtype=torch.float32), - q_weight=q_weight, - q_group_size=-1, - zero_point=torch.tensor(0.0), - ) - for name, q_weight in zip(names, q_weights) - ] - - @torch.no_grad() - def quantize( - self, - model: torch.nn.Module, - ) -> torch.nn.Module: - """Quantize model to lower data type. 
Currently supports FP8.""" - # pylint: disable=too-many-locals - dataset = self.get_calib_dataset() - model.eval() - with self._try_offload_model(model): - max_input_stats, _ = collect_stats( - model, - self.quant_config.device, - dataset, - cast(FP8QuantHook, self.hook).get_linear_layer_types(), - percentile=self.quant_config.percentile, - tqdm_desc="Collecting stats for Static Quantization.", - batch_size=32, - ) - for tf_quant_input in tqdm( - self.hook.iter_tf_quant_inputs(model), - total=len(self.hook.get_tf_blocks(model)), - desc="Quantize", - unit="layer", - ): - assert isinstance(tf_quant_input, HFTFQuantInputs) - for quant_input in tf_quant_input.quant_inputs: - parent_module, local_names, names = ( - quant_input.parent_module, - quant_input.local_names, - quant_input.target_names, - ) - - if isinstance(parent_module, torch.nn.ModuleList): - # For MoE models with seperate expert layers - parent_modules_w_local_name = [] - for p_module in parent_module: - for local_name in local_names: - parent_modules_w_local_name.append( - (p_module, local_name) - ) - - layers = [ - p_module.get_submodule(local_name) - for p_module, local_name in parent_modules_w_local_name - ] - - quant_results = self._get_weight_act_quantize_results( - model, - names, - max_input_stats, - ) - q_layers = [ - WeightActQuantizedLinearLayer.from_layer( - layer, quant_result - ) - for layer, quant_result in zip(layers, quant_results) - ] - for (p_module, local_name), q_layer in zip( - parent_modules_w_local_name, q_layers - ): - setattr(p_module, local_name, q_layer) - - else: - layers = [ - parent_module.get_submodule(local_name) - for local_name in local_names - ] - quant_results = self._get_weight_act_quantize_results( - model, - names, - max_input_stats, - ) - q_layers = [ - WeightActQuantizedLinearLayer.from_layer( - layer, quant_result - ) - for layer, quant_result in zip(layers, quant_results) - ] - for local_name, q_layer in zip(local_names, q_layers): - setattr(parent_module, local_name, q_layer) - - return model - - def convert( # type: ignore[override] - self, - model: torch.nn.Module, - convert_info_list: List[ConvertInfo], - save_numpy_format: bool = False, - ) -> Generator[Tuple[str, Union[torch.Tensor, np.ndarray]], None, None]: - """Convert Huggingface Model to Friendli format(.h5). - - Args: - model (torch.nn.Module): Huggingface model. - state_dict (Dict[str, torch.Tensor]): - Dictionary of mapping of tensor name to tensor - convert_info_list (List[ConvertInfo]): - Dictionary of mapping converted params name to conversion functions. - It will be depreciated. - save_numpy_format (bool, optional): Save the converted tensor in numpy format. - It will be depreciated. 
- """ - model = cast(FP8QuantHook, self.hook).pre_quantize(model) - model = self.quantize(model) - model = cast(FP8QuantHook, self.hook).post_quantize(model) - state_dict: Dict[str, torch.Tensor] = model.state_dict() - - quantized_param_names = cast(FP8QuantHook, self.hook).get_quantized_param_names( - model - ) - quantized_param_names.extend( - cast(FP8QuantHook, self.hook).get_quantized_param_scale_names(model) - ) - - with tqdm(total=len(state_dict), desc="Converting", unit="tensor") as pbar: - for param_name, param in state_dict.items(): - if param_name not in quantized_param_names: - param = param.to(get_torch_data_type(self.converter.data_type)) - yield param_name, param - pbar.update() diff --git a/friendli/modules/quantizer/layers.py b/friendli/modules/quantizer/layers.py deleted file mode 100644 index 31d104b1..00000000 --- a/friendli/modules/quantizer/layers.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Quantization Layers.""" - -from __future__ import annotations - -from typing import Optional, cast - -import torch - -from friendli.modules.quantizer.schema.data import ( - CommonQuantResult, - WeightActQuantResult, - WeightOnlyQuantResult, -) - - -class WeightOnlyQuantizedLinearLayer(torch.nn.Module): - """Linear Layer with weight only quantization.""" - - def __init__( - self, - in_features: int, - out_features: int, - q_weight: torch.Tensor, - weight_scale: torch.Tensor, - zeros: torch.Tensor, - bias: Optional[torch.nn.Parameter] = None, - ): - """Initialize the Weight Only Quantized Linear Layer.""" - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.weight_scale = torch.nn.Parameter(weight_scale) - self.zeros = torch.nn.Parameter(zeros, requires_grad=False) - self.weight = torch.nn.Parameter(q_weight, requires_grad=False) - self.register_parameter("bias", bias) - - @staticmethod - def from_layer( - layer: torch.nn.Module, quant_result: CommonQuantResult - ) -> torch.nn.Module: - """Returns the quantized layer from the original layer.""" - q_result = cast(WeightOnlyQuantResult, quant_result) - return WeightOnlyQuantizedLinearLayer( - cast(torch.nn.Linear, layer).in_features, - cast(torch.nn.Linear, layer).out_features, - q_result.q_weight, - q_result.weight_scale, - q_result.zero_point, - cast(torch.nn.Linear, layer).bias, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward pass with fake quantization. 
Not used in conversion.""" - raise NotImplementedError("Not used in conversion.") - - -class WeightActQuantizedLinearLayer(torch.nn.Module): - """Linear Layer with weight-act quantization.""" - - def __init__( # pylint: disable=too-many-arguments - self, - q_weight: torch.Tensor, - weight_scale: torch.Tensor, - act_scale: torch.Tensor, - bias: Optional[torch.nn.Parameter] = None, - ): - """Initialize the Weight Only Quantized Linear Layer.""" - super().__init__() - self.in_scale = torch.nn.Parameter(act_scale) - self.weight_scale = torch.nn.Parameter(weight_scale) - self.weight = torch.nn.Parameter(q_weight, requires_grad=False) - self.register_parameter("bias", bias) - - @staticmethod - def from_layer( - layer: torch.nn.Module, quant_result: CommonQuantResult - ) -> torch.nn.Module: - """Returns the quantized layer from the original layer.""" - q_result = cast(WeightActQuantResult, quant_result) - return WeightActQuantizedLinearLayer( - q_result.q_weight, - q_result.weight_scale, - q_result.act_scale, - cast(torch.nn.Linear, layer).bias if hasattr(layer, "bias") else None, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward pass with fake quantization. Not used in conversion.""" - raise NotImplementedError("Not used in conversion.") diff --git a/friendli/modules/quantizer/maps.py b/friendli/modules/quantizer/maps.py deleted file mode 100644 index 465d5c3e..00000000 --- a/friendli/modules/quantizer/maps.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Quantizer Maps.""" - -from __future__ import annotations - -from typing import Any, Dict, Type - -from friendli.enums import QuantMode -from friendli.errors import NotSupportedQuantModeError -from friendli.modules.converter.base import OneOfConverter -from friendli.modules.converter.utils import get_model_arch -from friendli.modules.quantizer.awq.base import AWQHook, AWQQuantizer -from friendli.modules.quantizer.awq.models.gpt_neox import AWQGPTNeoXHook -from friendli.modules.quantizer.awq.models.gptj import AWQGPTJHook -from friendli.modules.quantizer.awq.models.llama import AWQLlamaHook -from friendli.modules.quantizer.awq.models.mpt import AWQMPTHook -from friendli.modules.quantizer.base import CommonQuantizer, FP8QuantHook, FP8Quantizer -from friendli.modules.quantizer.models.arctic import ArcticHook -from friendli.modules.quantizer.models.dbrx import DbrxHook -from friendli.modules.quantizer.models.llama import LlamaHook -from friendli.modules.quantizer.models.mixtral import MixtralHook -from friendli.modules.quantizer.models.mpt import MPTHook -from friendli.modules.quantizer.models.phi3 import Phi3Hook -from friendli.modules.quantizer.schema.config import OneOfQuantConfig -from friendli.modules.quantizer.smoothquant.base import ( - SmoothQuantHook, - SmoothQuantQuantizer, -) -from friendli.modules.quantizer.smoothquant.models.bloom import SmoothQuantBloomHook -from friendli.modules.quantizer.smoothquant.models.codegen import SmoothQuantCodeGenHook -from friendli.modules.quantizer.smoothquant.models.falcon import SmoothQuantFalconHook -from friendli.modules.quantizer.smoothquant.models.gpt2 import SmoothQuantGPT2Hook -from friendli.modules.quantizer.smoothquant.models.gpt_neox import ( - SmoothQuantGPTNeoXHook, -) -from friendli.modules.quantizer.smoothquant.models.gptj import SmoothQuantGPTJHook -from friendli.modules.quantizer.smoothquant.models.llama import SmoothQuantLlamaHook -from friendli.modules.quantizer.smoothquant.models.mpt import 
SmoothQuantMPTHook -from friendli.modules.quantizer.smoothquant.models.opt import SmoothQuantOPTHook - -model_arch_smoothquant_hook_map: Dict[str, type[SmoothQuantHook]] = { - "OPTForCausalLM": SmoothQuantOPTHook, - "MPTForCausalLM": SmoothQuantMPTHook, - "BloomForCausalLM": SmoothQuantBloomHook, - "CodeGenForCausalLM": SmoothQuantCodeGenHook, - "GPTNeoXForCausalLM": SmoothQuantGPTNeoXHook, - "GPTJForCausalLM": SmoothQuantGPTJHook, - "GPT2LMHeadModel": SmoothQuantGPT2Hook, - "FalconForCausalLM": SmoothQuantFalconHook, - "LlamaForCausalLM": SmoothQuantLlamaHook, -} - -model_arch_awq_hook_map: Dict[str, type[AWQHook]] = { - "GPTJForCausalLM": AWQGPTJHook, - "GPTNeoXForCausalLM": AWQGPTNeoXHook, - "LlamaForCausalLM": AWQLlamaHook, - "MPTForCausalLM": AWQMPTHook, - "MistralForCausalLM": AWQLlamaHook, -} - -model_arch_fp8_hook_map: Dict[str, type[FP8QuantHook]] = { - "LlamaForCausalLM": LlamaHook, - "MistralForCausalLM": LlamaHook, - "MixtralForCausalLM": MixtralHook, - "MPTForCausalLM": MPTHook, - "CohereForCausalLM": LlamaHook, - "DbrxForCausalLM": DbrxHook, - "Phi3ForCausalLM": Phi3Hook, - "ArcticForCausalLM": ArcticHook, -} - - -def get_quanthook_map(quant_mode: QuantMode) -> Dict[str, Any]: - """Get quantizer map.""" - if quant_mode == QuantMode.SMOOTH_QUANT: - return model_arch_smoothquant_hook_map - if quant_mode == QuantMode.AWQ: - return model_arch_awq_hook_map - if quant_mode == QuantMode.FP8: - return model_arch_fp8_hook_map - raise NotSupportedQuantModeError( - invalid_option=quant_mode, - valid_options=[e.value for e in QuantMode], - ) - - -def get_quantizer_class(quant_mode: QuantMode) -> Type[CommonQuantizer]: - """Get quantizer class.""" - if quant_mode == QuantMode.SMOOTH_QUANT: - return SmoothQuantQuantizer - if quant_mode == QuantMode.AWQ: - return AWQQuantizer - if quant_mode == QuantMode.FP8: - return FP8Quantizer - raise NotSupportedQuantModeError( - invalid_option=quant_mode, - valid_options=[e.value for e in QuantMode], - ) - - -def get_quantized_converter( - quant_config: OneOfQuantConfig, - converter: OneOfConverter, -) -> CommonQuantizer: - """Get quantizer for specific model architecture with quant mode and args.""" - model_arch = get_model_arch(converter.config) - quant_mode = quant_config.mode - quantizer = get_quantizer_class(quant_mode) - quanthook_map = get_quanthook_map(quant_mode) - quanthook = quanthook_map[model_arch](quant_config, converter) - return quantizer(quanthook, quant_config, converter) diff --git a/friendli/modules/quantizer/models/arctic.py b/friendli/modules/quantizer/models/arctic.py deleted file mode 100644 index cc7d3fd9..00000000 --- a/friendli/modules/quantizer/models/arctic.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
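The lookup tables in the removed `maps.py` resolve a quantizer in two steps: the quant mode picks a quantizer class and a hook map, and the model architecture string picks the concrete hook. A minimal sketch of that wiring, assuming the pre-removal `friendli` package layout (it mirrors `get_quantized_converter` rather than replacing it):

```python
from friendli.modules.quantizer.maps import get_quanthook_map, get_quantizer_class


def build_quantizer(quant_config, converter, model_arch: str):
    """Resolve the quantizer for one architecture, e.g. "LlamaForCausalLM"."""
    quantizer_cls = get_quantizer_class(quant_config.mode)  # FP8Quantizer, AWQQuantizer, ...
    hook_map = get_quanthook_map(quant_config.mode)
    try:
        hook_cls = hook_map[model_arch]
    except KeyError as err:
        raise ValueError(f"{model_arch} is not supported for {quant_config.mode}") from err
    hook = hook_cls(quant_config, converter)
    return quantizer_cls(hook, quant_config, converter)
```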
- -"""Friendli ArcticForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type - -import torch - -from friendli.modules.quantizer.base import FP8QuantHook -from friendli.modules.quantizer.schema.data import ( - HFQuantInput, - HFTFQuantInputs, - TFQuantInputs, -) - - -class ArcticHook(FP8QuantHook): - """FP8QuantHook for ArcticForCausalLM.""" - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in ArcticForCausalLM.""" - return model.model.layers - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in ArcticForCausalLM.""" - return (torch.nn.Linear,) - - def iter_tf_quant_inputs( - self, model: torch.nn.Module - ) -> Iterator[TFQuantInputs] | Iterator[HFTFQuantInputs]: - """Returns the layers which should be quantized in transformer block of ArcticForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.self_attn - block_sparse_moe = decoder_layer.block_sparse_moe - mlp = decoder_layer.residual_mlp - moe_ff1_ff_gate_target_names = [] - for expert_idx in range(self.converter.num_experts): - moe_ff1_ff_gate_target_names.extend( - [ - f"{self.quantized_layer_prefix}{index}.block_sparse_moe.experts.{expert_idx}.w1", - f"{self.quantized_layer_prefix}{index}.block_sparse_moe.experts.{expert_idx}.w3", - ] - ) - - yield HFTFQuantInputs( - layer_index=index, - block=decoder_layer, - quant_inputs=[ - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", - f"{self.quantized_layer_prefix}{index}.self_attn.k_proj", - f"{self.quantized_layer_prefix}{index}.self_attn.v_proj", - ], - local_names=["q_proj", "k_proj", "v_proj"], - ), - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - ], - local_names=[ - "o_proj", - ], - ), - # router - HFQuantInput( - parent_module=block_sparse_moe, - target_names=[ - f"{self.quantized_layer_prefix}{index}.block_sparse_moe.gate", - ], - local_names=["gate"], - ), - # ff1, ff_gate in each moe - HFQuantInput( - parent_module=block_sparse_moe.experts, - target_names=moe_ff1_ff_gate_target_names, - local_names=["w1", "w3"], - ), - # ff2 in each moe - HFQuantInput( - parent_module=block_sparse_moe.experts, - target_names=[ - f"{self.quantized_layer_prefix}{index}.block_sparse_moe.experts.{expert_idx}.w2" - for expert_idx in range(self.converter.num_experts) - ], - local_names=["w2"], - ), - # ff1, ff_gate in parallel mlp - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.residual_mlp.w1", - f"{self.quantized_layer_prefix}{index}.residual_mlp.w3", - ], - local_names=["w1", "w3"], - ), - # ff2 in parallel mlp - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.residual_mlp.w2" - ], - local_names=["w2"], - ), - ], - ) diff --git a/friendli/modules/quantizer/models/dbrx.py b/friendli/modules/quantizer/models/dbrx.py deleted file mode 100644 index f4e3232a..00000000 --- a/friendli/modules/quantizer/models/dbrx.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli DbrxForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Dict, Iterator, List, Tuple, Type, cast - -import torch -from torch.nn.modules import Module -from tqdm import tqdm -from transformers.models.dbrx.modeling_dbrx import DbrxBlock, DbrxConfig, DbrxExpertGLU - -from friendli.modules.quantizer.base import FP8QuantHook -from friendli.modules.quantizer.schema.data import ( - HFQuantInput, - HFTFQuantInputs, - TFQuantInputs, -) - - -class DbrxLinearLayer(torch.nn.Module): - """Custom FF2Proj layer for DbrxForCausalLM.""" - - def __init__(self, weight: torch.nn.Parameter): - """Initialize the DbrxLinearLayer.""" - super().__init__() - self.weight = weight - - def forward(self, x: torch.Tensor, chunked_weight: torch.Tensor) -> torch.Tensor: - """Forward pass for the DbrxLinearLayer.""" - return x.matmul(chunked_weight) - - -class CustomDbrxExpertGLU(DbrxExpertGLU): - """Custom DbrxExpertGLU layer for DbrxForCausalLM. - - This layer is used to replace the DbrxExpertGLU layer in DbrxForCausalLM. - For collecting input of the ff2 layer in each experts, we need to override the forward method. - """ - - def __init__(self, layer: DbrxExpertGLU, ffn_act_fn: Dict): - """Initialize the CustomDbrxExpertGLU.""" - super().__init__( - layer.hidden_size, layer.ffn_hidden_size, layer.moe_num_experts, ffn_act_fn - ) - - self.v1_linear = DbrxLinearLayer(layer.v1.detach()) - self.w1_linear = DbrxLinearLayer(layer.w1.detach()) - self.w2_linear = DbrxLinearLayer(layer.w2.detach()) - - def forward( - self, - x: torch.Tensor, - expert_w1: torch.Tensor, - expert_v1: torch.Tensor, - expert_w2: torch.Tensor, - ) -> torch.Tensor: - """Forward pass for the CustomDbrxExpertGLU.""" - gate_proj = self.w1_linear(x, expert_w1.t()) - up_proj = self.v1_linear(x, expert_v1.t()) - gate_proj = self.activation_fn(gate_proj) - intermediate_states = gate_proj * up_proj - down_proj = self.w2_linear(intermediate_states, expert_w2) - return down_proj - - @staticmethod - def from_layer(layer: DbrxExpertGLU, config: DbrxConfig) -> CustomDbrxExpertGLU: - """Creates a CustomDbrxExpertGLU layer from a DbrxExpertGLU layer.""" - custom_layer = CustomDbrxExpertGLU(layer, config.ffn_config.ffn_act_fn) - custom_layer.v1 = layer.v1 - custom_layer.w1 = layer.w1 - custom_layer.w2 = layer.w2 - return custom_layer - - -class DbrxHook(FP8QuantHook): - """FP8QuantHook for DbrxForCausalLM.""" - - def get_quantized_param_names(self, model: torch.nn.Module) -> List[str]: - """Return the parameter names of quantized layers.""" - quantized_param_names = [] - for index in range( - len(self.get_tf_blocks(model)) # type: ignore[union-attr, arg-type] - ): - quantized_param_names.extend( - [ - f"{self.quantized_layer_prefix}{index}.norm_attn_norm.attn.Wqkv.weight", - f"{self.quantized_layer_prefix}{index}.norm_attn_norm.attn.out_proj.weight", - f"{self.quantized_layer_prefix}{index}.ffn.router.layer.weight", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.v1", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.w1", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.w2", - ] - ) - return quantized_param_names - - def get_quantized_param_scale_names(self, model: torch.nn.Module) -> List[str]: - """Return the parameter scale names of quantized layers.""" - quantized_param_scale_names = [] - for index in range( - len(self.get_tf_blocks(model)) # type: ignore[union-attr, arg-type] - ): - quantized_param_scale_names.extend( - [ - 
f"{self.quantized_layer_prefix}{index}.norm_attn_norm.attn.Wqkv.weight_scale", - f"{self.quantized_layer_prefix}{index}.norm_attn_norm.attn.out_proj.weight_scale", - f"{self.quantized_layer_prefix}{index}.ffn.router.layer.weight_scale", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.v1_weight_scale", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.w1_weight_scale", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.w2_weight_scale", - ] - ) - quantized_param_scale_names.extend( - [ - f"{self.quantized_layer_prefix}{index}.norm_attn_norm.attn.Wqkv.in_scale", - f"{self.quantized_layer_prefix}{index}.norm_attn_norm.attn.out_proj.in_scale", - f"{self.quantized_layer_prefix}{index}.ffn.router.layer.in_scale", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.v1_in_scale", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.w1_in_scale", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.w2_in_scale", - ] - ) - return quantized_param_scale_names - - def pre_quantize(self, model: Module) -> torch.nn.Module: - """Pre-quantization hook for DbrxForCausalLM.""" - for decoder_layer in tqdm( - self.get_tf_blocks(model), - desc="Pre-quantizing DbrxForCausalLM", - unit="layer", - ): - cast( - DbrxBlock, decoder_layer - ).ffn.experts.mlp = CustomDbrxExpertGLU.from_layer( - cast(DbrxBlock, decoder_layer).ffn.experts.mlp, self.converter.config - ) - return model - - def post_quantize(self, model: Module) -> torch.nn.Module: - """Post-quantization hook for DbrxForCausalLM.""" - for decoder_layer in tqdm( - self.get_tf_blocks(model), - desc="Post-quantizing DbrxForCausalLM", - unit="layer", - ): - mlp = cast(DbrxBlock, decoder_layer).ffn.experts.mlp - - # ff1 - setattr(mlp, "v1_in_scale", mlp.v1_linear.in_scale) - setattr(mlp, "v1_weight_scale", mlp.v1_linear.weight_scale) - mlp.v1 = mlp.v1_linear.weight - del mlp.v1_linear - - # ff_gate - setattr(mlp, "w1_in_scale", mlp.w1_linear.in_scale) - setattr(mlp, "w1_weight_scale", mlp.w1_linear.weight_scale) - mlp.w1 = mlp.w1_linear.weight - del mlp.w1_linear - - # ff2 - setattr(mlp, "w2_in_scale", mlp.w2_linear.in_scale) - setattr(mlp, "w2_weight_scale", mlp.w2_linear.weight_scale) - mlp.w2 = mlp.w2_linear.weight - del mlp.w2_linear - return model - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in DbrxForCausalLM.""" - return model.transformer.blocks - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in DbrxForCausalLM.""" - return ( - torch.nn.Linear, - DbrxLinearLayer, - ) - - def iter_tf_quant_inputs( - self, model: torch.nn.Module - ) -> Iterator[TFQuantInputs] | Iterator[HFTFQuantInputs]: - """Returns the layers which should be quantized in transformer block of DbrxForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = cast(DbrxBlock, decoder_layer).norm_attn_norm.attn - mlp = cast(DbrxBlock, decoder_layer).ffn.experts.mlp - - yield HFTFQuantInputs( - layer_index=index, - block=decoder_layer, - quant_inputs=[ - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.norm_attn_norm.attn.Wqkv", - ], - local_names=["Wqkv"], - ), - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.norm_attn_norm.attn.out_proj", - ], - local_names=[ - "out_proj", - ], - ), - HFQuantInput( - parent_module=cast(DbrxBlock, decoder_layer).ffn.router, 
- target_names=[ - f"{self.quantized_layer_prefix}{index}.ffn.router.layer", - ], - local_names=["layer"], - ), - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.w1_linear", - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.v1_linear", - ], - local_names=["w1_linear", "v1_linear"], - ), - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.ffn.experts.mlp.w2_linear" - ], - local_names=["w2_linear"], - ), - ], - ) diff --git a/friendli/modules/quantizer/models/llama.py b/friendli/modules/quantizer/models/llama.py deleted file mode 100644 index d4002955..00000000 --- a/friendli/modules/quantizer/models/llama.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli LlamaForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Iterator, List, Tuple, Type - -import torch - -from friendli.modules.quantizer.base import FP8QuantHook -from friendli.modules.quantizer.schema.data import ( - HFQuantInput, - HFTFQuantInputs, - TFQuantInputs, -) - - -class LlamaHook(FP8QuantHook): - """FP8QuantHook for LlamaForCausalLM.""" - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in LlamaForCausalLM.""" - return model.model.layers - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in LlamaForCausalLM.""" - return (torch.nn.Linear,) - - def iter_tf_quant_inputs( - self, model: torch.nn.Module - ) -> Iterator[TFQuantInputs] | Iterator[HFTFQuantInputs]: - """Returns the layers which should be quantized in transformer block of LlamaForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.self_attn - mlp = decoder_layer.mlp - - yield HFTFQuantInputs( - layer_index=index, - block=decoder_layer, - quant_inputs=[ - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", - f"{self.quantized_layer_prefix}{index}.self_attn.k_proj", - f"{self.quantized_layer_prefix}{index}.self_attn.v_proj", - ], - local_names=["q_proj", "k_proj", "v_proj"], - ), - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - ], - local_names=[ - "o_proj", - ], - ), - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.up_proj", - f"{self.quantized_layer_prefix}{index}.mlp.gate_proj", - ], - local_names=["up_proj", "gate_proj"], - ), - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.down_proj" - ], - local_names=["down_proj"], - ), - ], - ) diff --git a/friendli/modules/quantizer/models/mixtral.py b/friendli/modules/quantizer/models/mixtral.py deleted file mode 100644 index 70abc34b..00000000 --- a/friendli/modules/quantizer/models/mixtral.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
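Each group yielded by hooks such as `LlamaHook` is then quantized with the per-tensor FP8 scaling from `_get_weight_act_quantize_results`. A standalone sketch of that arithmetic, assuming a PyTorch build with `torch.float8_e4m3fn` (2.1 or later); 448 is the E4M3 dynamic range:

```python
import torch

FP8_E4M3_MAX = 448.0


def fp8_scales(weight: torch.Tensor, input_abs_max: torch.Tensor):
    """Per-tensor scales and FP8 weight, mirroring the logic described above."""
    weight_scale = weight.detach().abs().max().float() / FP8_E4M3_MAX
    act_scale = input_abs_max.detach().abs().max().float() / FP8_E4M3_MAX
    q_weight = (
        (weight.detach().float() / weight_scale)
        .clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX)
        .to(torch.float8_e4m3fn)
        .view(torch.int8)  # stored viewed as int8, as in the original code
    )
    return q_weight, weight_scale, act_scale


w = torch.randn(32, 16)
x_abs_max = torch.rand(16) * 10  # calibration statistic: per-channel |input| max
qw, w_scale, a_scale = fp8_scales(w, x_abs_max)
```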
- -"""Friendli MixtralForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List - -import torch - -from friendli.modules.quantizer.models.llama import LlamaHook -from friendli.modules.quantizer.schema.data import ( - HFQuantInput, - HFTFQuantInputs, - TFQuantInputs, -) - - -class MixtralHook(LlamaHook): - """FP8QuantHook for MixtralForCausalLM.""" - - def iter_tf_quant_inputs( - self, model: torch.nn.Module - ) -> Iterator[TFQuantInputs] | Iterator[HFTFQuantInputs]: - """Returns the layers which should be quantized in transformer block of MixtralForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.self_attn - block_sparse_moe = decoder_layer.block_sparse_moe - moe_ff1_ff_gate_target_names = [] - for expert_idx in range(self.converter.num_experts): - moe_ff1_ff_gate_target_names.extend( - [ - f"{self.quantized_layer_prefix}{index}.block_sparse_moe.experts.{expert_idx}.w1", - f"{self.quantized_layer_prefix}{index}.block_sparse_moe.experts.{expert_idx}.w3", - ] - ) - - yield HFTFQuantInputs( - layer_index=index, - block=decoder_layer, - quant_inputs=[ - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", - f"{self.quantized_layer_prefix}{index}.self_attn.k_proj", - f"{self.quantized_layer_prefix}{index}.self_attn.v_proj", - ], - local_names=["q_proj", "k_proj", "v_proj"], - ), - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - ], - local_names=[ - "o_proj", - ], - ), - # router - HFQuantInput( - parent_module=block_sparse_moe, - target_names=[ - f"{self.quantized_layer_prefix}{index}.block_sparse_moe.gate", - ], - local_names=["gate"], - ), - # ff1, ff_gate in each moe - HFQuantInput( - parent_module=block_sparse_moe.experts, - target_names=moe_ff1_ff_gate_target_names, - local_names=["w1", "w3"], - ), - # ff2 in each moe - HFQuantInput( - parent_module=block_sparse_moe.experts, - target_names=[ - f"{self.quantized_layer_prefix}{index}.block_sparse_moe.experts.{expert_idx}.w2" - for expert_idx in range(self.converter.num_experts) - ], - local_names=["w2"], - ), - ], - ) diff --git a/friendli/modules/quantizer/models/mpt.py b/friendli/modules/quantizer/models/mpt.py deleted file mode 100644 index 39a17ff1..00000000 --- a/friendli/modules/quantizer/models/mpt.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli MPTForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type - -import torch - -from friendli.modules.quantizer.base import FP8QuantHook -from friendli.modules.quantizer.schema.data import ( - HFQuantInput, - HFTFQuantInputs, - TFQuantInputs, -) - - -class MPTHook(FP8QuantHook): - """FP8QuantHook for MPTForCausalLM.""" - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in MPTForCausalLM.""" - return model.transformer.blocks - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in MPTForCausalLM.""" - return (torch.nn.Linear,) - - def iter_tf_quant_inputs( - self, model: torch.nn.Module - ) -> Iterator[TFQuantInputs] | Iterator[HFTFQuantInputs]: - """Returns the layers which should be quantized in transformer block of MPTForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.attn - mlp = decoder_layer.ffn - - yield HFTFQuantInputs( - layer_index=index, - block=decoder_layer, - quant_inputs=[ - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.attn.Wqkv", - ], - local_names=["Wqkv"], - ), - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - ], - local_names=[ - "out_proj", - ], - ), - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.ffn.up_proj", - ], - local_names=["up_proj"], - ), - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.ffn.down_proj" - ], - local_names=["down_proj"], - ), - ], - ) diff --git a/friendli/modules/quantizer/models/phi3.py b/friendli/modules/quantizer/models/phi3.py deleted file mode 100644 index 4d4d15cb..00000000 --- a/friendli/modules/quantizer/models/phi3.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Phi3ForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type - -import torch - -from friendli.modules.quantizer.base import FP8QuantHook -from friendli.modules.quantizer.schema.data import ( - HFQuantInput, - HFTFQuantInputs, - TFQuantInputs, -) - - -class Phi3Hook(FP8QuantHook): - """FP8QuantHook for Phi3ForCausalLM.""" - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the transformer blocks in Phi3ForCausalLM.""" - return model.model.layers - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in Phi3ForCausalLM.""" - return (torch.nn.Linear,) - - def iter_tf_quant_inputs( - self, model: torch.nn.Module - ) -> Iterator[TFQuantInputs] | Iterator[HFTFQuantInputs]: - """Returns the layers which should be quantized in transformer block of Phi3ForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.self_attn - mlp = decoder_layer.mlp - - yield HFTFQuantInputs( - layer_index=index, - block=decoder_layer, - quant_inputs=[ - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.qkv_proj", - ], - local_names=["qkv_proj"], - ), - HFQuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - ], - local_names=[ - "o_proj", - ], - ), - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.gate_up_proj", - ], - local_names=["gate_up_proj"], - ), - HFQuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.down_proj" - ], - local_names=["down_proj"], - ), - ], - ) diff --git a/friendli/modules/quantizer/schema/__init__.py b/friendli/modules/quantizer/schema/__init__.py deleted file mode 100644 index f5d8dd04..00000000 --- a/friendli/modules/quantizer/schema/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Quantizer Schema.""" diff --git a/friendli/modules/quantizer/schema/config.py b/friendli/modules/quantizer/schema/config.py deleted file mode 100644 index 2ca36f7b..00000000 --- a/friendli/modules/quantizer/schema/config.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Quantizer Config Schema.""" - -from __future__ import annotations - -from typing import Literal, Union - -from pydantic import BaseModel, Field -from typing_extensions import Annotated - -from friendli.enums import ModelDataType, QuantDatasetFormat, QuantMode - - -class CalibrationDatasetConfig(BaseModel): - """Calibration dataset config.""" - - path_or_name: str = "cnn_dailymail:3.0.0" - format: QuantDatasetFormat = QuantDatasetFormat.JSON - split: str = "validation" - lookup_column_name: str = "article" - num_samples: int = 512 - max_length: int = 512 - - -class AbstractQuantConfig(BaseModel): - """Abstract quantization config.""" - - mode: QuantMode - device: str = "cuda:0" - offload: bool = True - seed: int = 42 - percentile: float = 100.0 - quant_dtype: ModelDataType = ModelDataType.INT8 - calibration_dataset: CalibrationDatasetConfig = Field( - default_factory=CalibrationDatasetConfig - ) - - -class FP8QuantConfig(AbstractQuantConfig): - """FP8 quantization config. 
- - The data type of parameters are converted to the one specified at `quant_dtype` - by using calibration dataset. The quantization scale for weight and activation is - added to converted checkpoint. - - """ - - mode: Literal[QuantMode.FP8] = QuantMode.FP8 - - -class SmoothQuantArgs(BaseModel): - """SmoothQuant args.""" - - migration_strength: float = 0.5 - attn_fc_smoothing: bool = False - ff2_smoothing: bool = False - - -class SmoothQuantConfig(AbstractQuantConfig): - """SmoothQuant config.""" - - mode: Literal[QuantMode.SMOOTH_QUANT] = QuantMode.SMOOTH_QUANT - smoothquant_args: SmoothQuantArgs = Field(default_factory=SmoothQuantArgs) - - -class AWQArgs(BaseModel): - """AWQ args.""" - - quant_dtype: ModelDataType = ModelDataType.INT4 - quant_bit: int = 4 - quant_group_size: int = 64 - - -class AWQConfig(AbstractQuantConfig): - """AWQ config.""" - - mode: Literal[QuantMode.AWQ] = QuantMode.AWQ - awq_args: AWQArgs = Field(default_factory=AWQArgs) - - -OneOfQuantConfig = Annotated[ - Union[SmoothQuantConfig, AWQConfig, FP8QuantConfig], Field(discriminator="mode") -] - - -class QuantConfig(BaseModel): - """Quantization config.""" - - config: OneOfQuantConfig diff --git a/friendli/modules/quantizer/schema/data.py b/friendli/modules/quantizer/schema/data.py deleted file mode 100644 index ae472126..00000000 --- a/friendli/modules/quantizer/schema/data.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Quantizer Data Schema.""" -from __future__ import annotations - -from dataclasses import dataclass -from typing import Callable, List, Optional - -import torch - -from friendli.enums import ModelDataType - -ModuleName = str - - -@dataclass -class CommonQuantResult: - """Dataclass for quantization result per layer.""" - - module_name: str - quant_dtype: ModelDataType - q_group_size: int - zero_point: torch.Tensor - - -@dataclass -class WeightOnlyQuantResult(CommonQuantResult): - """Dataclass for weight-only quantization result per layer.""" - - weight_scale: torch.Tensor - q_weight: torch.Tensor - - -@dataclass -class WeightActQuantResult(WeightOnlyQuantResult): - """Dataclass for weight-activation quantization result per layer.""" - - act_scale: torch.Tensor - zero_point: torch.Tensor - q_group_size: int - - -@dataclass -class QuantInput: - """Dataclass for int8 quantization input of each layer in transformer block.""" - - weight: torch.Tensor # [OutDim, InDim] - name: ModuleName - start_offset: Optional[int] # start offset of the weight tensor along the out_dim - end_offset: Optional[int] # end offset of the weight tensor along the out_dim - sort_fn: Optional[ - Callable[[torch.Tensor], torch.Tensor] - ] = None # sort function for max_output_stats - - -@dataclass -class HFQuantInput: - """Dataclass for quantization input of each layer in transformer block. - - Attributes: - parent_module: module contains target layers. - target_names: list of target module's full name - (ex. model.model.layers.0.self_attn.q_proj, ) - local_names: list of target module's name using when access from parent_module - (ex. 
q_proj, k_proj, v_proj ) - """ - - parent_module: torch.nn.Module - target_names: List[ModuleName] - local_names: str - - -@dataclass -class HFTFQuantInputs: - """Dataclass for quantization input per transformer block.""" - - layer_index: int - block: torch.nn.Module - quant_inputs: List[HFQuantInput] - - -@dataclass -class TFQuantInputs: # pylint: disable=too-many-instance-attributes - """Dataclass for int8 quantization input per transformer block.""" - - layer_index: int - block: torch.nn.Module - q: QuantInput - k: QuantInput - v: QuantInput - attn_fc: QuantInput - ff1: QuantInput - ff2: QuantInput - - -@dataclass -class TFQuantResults: # pylint: disable=too-many-instance-attributes - """Dataclass for int8 quantization result per a transformer block.""" - - layer_prefix_with_index: str - block: torch.nn.Module - q: CommonQuantResult - k: CommonQuantResult - v: CommonQuantResult - attn_fc: CommonQuantResult - ff1: CommonQuantResult - ff2: CommonQuantResult diff --git a/friendli/modules/quantizer/smoothquant/__init__.py b/friendli/modules/quantizer/smoothquant/__init__.py deleted file mode 100644 index 5205fe18..00000000 --- a/friendli/modules/quantizer/smoothquant/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model SmoothQuant Quantizer.""" diff --git a/friendli/modules/quantizer/smoothquant/base.py b/friendli/modules/quantizer/smoothquant/base.py deleted file mode 100644 index 8ee4e1a7..00000000 --- a/friendli/modules/quantizer/smoothquant/base.py +++ /dev/null @@ -1,567 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli SmoothQuant Quantizer Base.""" - -from __future__ import annotations - -from abc import abstractmethod -from dataclasses import fields -from typing import Any, Dict, Iterator, List, Tuple, cast - -import datasets # type: ignore[import] -import torch - -from friendli.enums import ModelDataType -from friendli.errors import NotSupportedQuantConfigError -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.interface import ModelConversionInterface -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.converter.utils import get_tokenizer -from friendli.modules.quantizer.base import AbstractQuantHook, CommonQuantizer -from friendli.modules.quantizer.layers import WeightActQuantizedLinearLayer -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ( - ModuleName, - QuantInput, - TFQuantInputs, - TFQuantResults, - WeightActQuantResult, -) -from friendli.modules.quantizer.utils import ( - collect_stats, - get_weight_act_quant_scales, - quantized_linear_weight_reshape, - quantized_qkv_weight_reshape, - safe_load_datasets, - scale_reshape, -) - - -class PreSmoother(torch.nn.Module): - """Module for containing smoothing scale. - - This module is used to contain the smoothing scale for the quantization. - If the matmul layer have previous layer, the smoothing scale can be migrated - to the previous layer. But, if the matmul layer is the first layer, the scale - need to be stored in this module. Especially, When MLP ff2 layer with previous activation - layer that prevent migrating the scale to the previous layer needs SmoothQuant, then, - this module is used to store the smoothing scale. [SmoothQunat Issue #15] - (https://github.com/mit-han-lab/smoothquant/issues/15#issuecomment-1353390283). 
- - Args: - in_dim (float): input dimension of the matmul layer's weight dimension. - """ - - def __init__(self, in_dim: int): - """Initialize PreSmoother.""" - super().__init__() - self.scale = torch.nn.Parameter(torch.ones(in_dim, dtype=torch.float32)) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function of PreSmoother.""" - return (x * self.scale).to(x.dtype) - - -class SmoothQuantHook(AbstractQuantHook): - """Quantization Hook for SmoothQuant.""" - - @abstractmethod - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the attention fc layer in the decoder block.""" - - @abstractmethod - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the second feed-forward layer in the decoder block.""" - - @abstractmethod - def iter_smooth_norm_weights( - self, model: torch.nn.Module - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm and linear layer's weight per transformer block.""" - - def _register_pre_smoother(self, linear: torch.nn.Linear) -> PreSmoother: - """Register pre_smoother storing smoothing scale of linear layer.""" - pre_smoother = PreSmoother(linear.in_features).to(device=linear.weight.device) - - def pre_smoother_hook(_, x: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]: - return (pre_smoother.forward(x[0]),) - - linear.register_forward_pre_hook(pre_smoother_hook) - return pre_smoother - - def pre_smooth( - self, - model: torch.nn.Module, - ) -> torch.nn.Module: - """Pre-procedure for SmoothQuant before Smoothing.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - for decoder_layer in self.get_tf_blocks(model): - if quant_args.attn_fc_smoothing: - attn_fc_pre_smoother = self._register_pre_smoother( - self.get_attn_fc_layer(decoder_layer) - ) - decoder_layer.add_module("attn_fc_pre_smoother", attn_fc_pre_smoother) - if quant_args.ff2_smoothing: - ff2_pre_smoother = self._register_pre_smoother( - self.get_ff2_layer(decoder_layer) - ) - decoder_layer.add_module("ff2_pre_smoother", ff2_pre_smoother) - return model - - def sort_qkv_output_stats(self, max_output_stat: torch.Tensor) -> torch.Tensor: - """Sort max_output_stas for seperating qkv_layer's output_stats.""" - return max_output_stat - - def copy_norms(self, model: torch.nn.Module) -> torch.nn.Module: - """Copy and Register norms in transformer block for seperated scaling. - - In some models(e.g. llama, gptj, codegen), matmul layers share activations - from the same norms. Therefore, we need to copy and register the norms for - seperated smoothing scale. For example, in llama, normalization layer is - shared with gate linear layer and attention linear layer. Thus, we need to - copy and register the norms for each linear layer and use them for smoothing. - """ - return model - - def get_quant_result( - self, - quant_inputs: TFQuantInputs, - **kwargs: Any, - ) -> TFQuantResults: - """Returns the quantization result of the quantized layer. - - If the model has another quantized layer, it should be implemented in the subclass. 
- - """ - max_input_stats: Dict[ModuleName, torch.Tensor] = kwargs["max_input_stats"] - max_output_stats: Dict[ModuleName, torch.Tensor] = kwargs["max_output_stats"] - - def get_scale( - quant_input: QuantInput, - ) -> WeightActQuantResult: - weight, name, start, end, sort_fn = ( - quant_input.weight, - quant_input.name, - quant_input.start_offset, - quant_input.end_offset, - quant_input.sort_fn, - ) - - return get_weight_act_quant_scales( - name, - max_input_stats[name], - weight[start:end], - weight[start:end], - sort_fn(max_output_stats[name])[start:end] - if sort_fn - else max_output_stats[name][start:end], - ) - - return TFQuantResults( - layer_prefix_with_index=f"{self.quantized_layer_prefix}{quant_inputs.layer_index}.", - block=quant_inputs.block, - q=get_scale(quant_inputs.q), - k=get_scale(quant_inputs.k), - v=get_scale(quant_inputs.v), - attn_fc=get_scale(quant_inputs.attn_fc), - ff1=get_scale(quant_inputs.ff1), - ff2=get_scale(quant_inputs.ff2), - ) - - @property - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for modified modules. - - This convert_info_list is used for modules that are modified for quantization. - Especially, for attention fc layer and MLP ff2 layer, we need to migrate - smooth scale to the previous layer. Thus, we add the smoothing scaler, and - modify the convert_info_list for the modified modules. - - In some models, matmul layers share activations from the same norms. Therefore, - we use `copy_norms()` to copy and register the norms for seperated smoothing scale. - Thus, we modify the convert_info_list for the modified modules. - """ - sq_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - new_layer_convert_info_list = [] - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - - if sq_args.attn_fc_smoothing: - new_layer_convert_info_list.append( - ConvertInfo( - param_names=[f"{layer_prefix}attn_fc_pre_smoother.scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_proj/smoothquant/smoothing_vector:0", # pylint: disable=line-too-long - reshape_fn=scale_reshape, - ) - ) - if sq_args.ff2_smoothing: - new_layer_convert_info_list.append( - ConvertInfo( - param_names=[f"{layer_prefix}ff2_pre_smoother.scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_proj/smoothquant/smoothing_vector:0", # pylint: disable=line-too-long - reshape_fn=scale_reshape, - ) - ) - - return new_layer_convert_info_list - - @property - def quantized_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Return the list of conversion informations for quantized layers.""" - convert_info_list = [] - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}q.weight_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_attn/smoothquant/q_weight_scale:0", # pylint: disable=line-too-long - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}k.weight_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_attn/smoothquant/k_weight_scale:0", # pylint: disable=line-too-long - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}v.weight_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_attn/smoothquant/v_weight_scale:0", # pylint: disable=line-too-long - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}q.out_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_attn/smoothquant/q_out_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}k.out_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_attn/smoothquant/k_out_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}v.out_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_attn/smoothquant/v_out_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}q.in_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_attn/smoothquant/in_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn_fc.weight_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_proj/smoothquant/weight_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn_fc.out_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_proj/smoothquant/out_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn_fc.in_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}attn/c_proj/smoothquant/in_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff1.weight_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_fc/smoothquant/weight_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff1.out_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_fc/smoothquant/out_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff1.in_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_fc/smoothquant/in_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff2.weight_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_proj/smoothquant/weight_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff2.out_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_proj/smoothquant/out_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff2.in_scale"], - data_type=ModelDataType.FP32, - converted_name=f"{converted_prefix}mlp/c_proj/smoothquant/in_scale:0", - reshape_fn=scale_reshape, - ), - ConvertInfo( - param_names=[ - f"{layer_prefix}q.weight", - f"{layer_prefix}k.weight", - 
f"{layer_prefix}v.weight", - ], - data_type=ModelDataType.INT8, - converted_name=f"{converted_prefix}attn/c_attn/smoothquant/weight:0", - reshape_fn=quantized_qkv_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}attn_fc.weight"], - data_type=ModelDataType.INT8, - converted_name=f"{converted_prefix}attn/c_proj/smoothquant/weight:0", - reshape_fn=quantized_linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff1.weight"], - data_type=ModelDataType.INT8, - converted_name=f"{converted_prefix}mlp/c_fc/smoothquant/weight:0", - reshape_fn=quantized_linear_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ff2.weight"], - data_type=ModelDataType.INT8, - converted_name=f"{converted_prefix}mlp/c_proj/smoothquant/weight:0", - reshape_fn=quantized_linear_weight_reshape, - ), - ] - ) - return convert_info_list - - -class SmoothQuantQuantizer(CommonQuantizer, ModelConversionInterface): - """Quantizer for SmoothQuant.""" - - def check_config(self) -> None: - """Check if the SmoothQuant quantization config is valid.""" - quant_config = cast(SmoothQuantConfig, self.quant_config) - smoothquant_args = quant_config.smoothquant_args - super().check_config() - if 0 > smoothquant_args.migration_strength > 1: - raise NotSupportedQuantConfigError( - invalid_option=str(smoothquant_args.migration_strength), - valid_options=["between 0 and 1."], - ) - - def get_calib_dataset(self) -> datasets.Dataset: - """Get calibration dataset for SmoothQuant.""" - data_cfg = self.quant_config.calibration_dataset - tokenizer = get_tokenizer(self.converter.config.name_or_path) - dataset = safe_load_datasets(data_cfg) - - def preprocess(example) -> Dict[str, torch.Tensor]: - truncate_length = data_cfg.max_length * 4 - while True: - input_ids = tokenizer( - example[data_cfg.lookup_column_name][:truncate_length], - return_tensors="pt", - max_length=data_cfg.max_length * 2, - truncation=True, - padding=False, - ).input_ids - - if input_ids.size( - 1 - ) >= data_cfg.max_length * 2 or truncate_length >= len( - example[data_cfg.lookup_column_name] - ): - input_ids = input_ids[:, : data_cfg.max_length] - break - - truncate_length *= 2 - return {"input_ids": input_ids} - - dataset = ( - dataset.shuffle(self.quant_config.seed) - .select(range(data_cfg.num_samples)) - .select_columns([data_cfg.lookup_column_name]) - .map(function=preprocess) - ) - - return dataset - - @torch.no_grad() - def _perform_smoothing( - self, - activation_norms: List[torch.Tensor], - fc_weights: List[torch.Tensor], - activation_max: torch.Tensor, - *, - migration_strength: float = 0.5, - epsilon: float = 1e-5, - inplace: bool = False, - ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - """Perform activation-weight smoothing in SmoothQuant. - - Performs the activation-weight smoothing scheme described in SmoothQuant - (Xiao et al., 2023), which migrates the amplitude of outliers from activations - to weights of matmul layers. The function takes in the following parameters: - - Args: - activation_norms: torch.Tensors representing affine parameters - (i.e., beta and gamma) of a normalization layer before each matmul layer. - fc_weights: torch.Tensors representing the weight matrices of the matmul layer. - activation_max: The maximum activation value of inputs of the matmul layer. - migration_strength: the strength of the activation migration. Default is 0.5. - epsilon: The epsilon used for numerical stability when calculating the scales. - Default is 1e-5. 
- - Returns: - A tuple of three torch.Tensors: (smoothed_activation_norms, smoothed_fc_weights) - - The function calculates "scales" as `pow(|Activation|, migration_strength) / - pow(|Weight|, 1-migration_strength)` and applies the smoothing effect into - a normalization layer that exists before every matmul layer. This is done because - it is more efficient than introducing a new smoothing layer before every matmul layer. - Fusing the smoothing effect into the normalization layer results in a faster and - more efficient implementation of the smoothing scheme. - - The function returns the smoothed normalization coefficients and the smoothed weight - matrices after the smoothing process. - """ - # shape of activation norms: [InChannels] - # shape of fc weights: [OutChannels, InChannels] - # shape of activation_max: [InChannels] - - # pylint: disable=too-many-locals - assert activation_norms - assert fc_weights - - assert activation_norms[0].ndim == 1 - in_channels = activation_norms[0].size(0) - device = activation_norms[0].device - dtype = activation_norms[0].dtype - - for norm in activation_norms: - assert tuple(norm.size()) == (in_channels,) - assert norm.device == device - assert norm.dtype == dtype - - for weight in fc_weights: - assert weight.ndim == 2 - assert weight.size(1) == in_channels - assert weight.device == device - assert weight.dtype == dtype - - activation_max = activation_max.to(device=device) - weight_max = fc_weights[0].abs().max(dim=0).values - for weight in fc_weights[1:]: - weight_max = torch.maximum(weight_max, weight.abs().max(dim=0).values) - - assert tuple(activation_max.size()) == (in_channels,) - assert tuple(weight_max.size()) == (in_channels,) - alpha = migration_strength - scales = ( - ( - activation_max.to(dtype=torch.float32).pow(alpha) - / weight_max.to(dtype=torch.float32).pow(1 - alpha) - ) - .clamp(min=epsilon) - .to(dtype=dtype) - ) - - scaled_activation_norms = [act_norm / scales for act_norm in activation_norms] - scaled_weights = [w * scales.view(1, -1) for w in fc_weights] - - if inplace: - for dst, src in zip(activation_norms, scaled_activation_norms): - dst.copy_(src) - for dst, src in zip(fc_weights, scaled_weights): - dst.copy_(src) - - return scaled_activation_norms, scaled_weights - - def _smooth( - self, - model: torch.nn.Module, - ) -> None: - """Smooths the models before Quantization.""" - model.to(device=torch.device(self.quant_config.device)) - model.eval() - model = cast(SmoothQuantHook, self.hook).pre_smooth(model) - - # collect stats for SmoothQuant scale. 
- dataset = self.get_calib_dataset() - quant_config = cast(SmoothQuantConfig, self.quant_config) - max_input_stats, _ = collect_stats( - model, - quant_config.device, - dataset, - cast(SmoothQuantHook, self.hook).get_linear_layer_types(), - tqdm_desc="Collecting stats for Smoothing.", - percentile=100.0, - ) - - # TODO change name to pre_act_params, post_act_params - # (attn_fc, ff2 are not scaled with norms) - for norms, weights, name in cast( - SmoothQuantHook, self.hook - ).iter_smooth_norm_weights(model): - self._perform_smoothing( - norms, - weights, - max_input_stats[name], - migration_strength=quant_config.smoothquant_args.migration_strength, - inplace=True, - ) - - def pre_quantize( - self, - model: torch.nn.Module, - ) -> None: - """Pre-procedure that should be called before quantize() is called.""" - self._smooth(model) - - def quantize( - self, - model: torch.nn.Module, - ) -> torch.nn.Module: - """Quantize model with SmoothQuant.""" - dataset = self.get_calib_dataset() - max_input_stats, max_output_stats = collect_stats( - model, - self.quant_config.device, - dataset, - cast(SmoothQuantHook, self.hook).get_linear_layer_types(), - percentile=self.quant_config.percentile, - tqdm_desc="Collecting stats for Static Quantization.", - ) - for quant_input in self.hook.iter_tf_quant_inputs(model): - assert isinstance(quant_input, TFQuantInputs) - quant_result = cast(SmoothQuantHook, self.hook).get_quant_result( - quant_input, - max_input_stats=max_input_stats, - max_output_stats=max_output_stats, - ) - - for field in fields(quant_result): - layer_quant_result = getattr(quant_result, field.name) - if isinstance(layer_quant_result, WeightActQuantResult): - layer = model.get_submodule(layer_quant_result.module_name) - q_layer = WeightActQuantizedLinearLayer.from_layer( - layer, layer_quant_result - ) - quant_result.block.add_module(field.name, q_layer) - - return model diff --git a/friendli/modules/quantizer/smoothquant/models/bloom.py b/friendli/modules/quantizer/smoothquant/models/bloom.py deleted file mode 100644 index 86fc39de..00000000 --- a/friendli/modules/quantizer/smoothquant/models/bloom.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
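The smoothing step above folds a per-channel scale into the normalization layer that precedes each matmul and into the matmul weight itself. Below is a minimal, self-contained sketch of that arithmetic; shapes and values are illustrative and not taken from the removed code (the real `_perform_smoothing` also rescales the norm's bias and handles several weights per norm):

```python
import torch

# Toy shapes; the real code smooths LayerNorm affine params and the weights of
# the matmul that consumes them, per transformer block.
in_ch, out_ch, alpha, eps = 8, 16, 0.5, 1e-5

gamma = torch.randn(in_ch)               # norm weight feeding the matmul
weight = torch.randn(out_ch, in_ch)      # matmul weight, [OutDim, InDim]
act_max = torch.rand(in_ch) * 10 + 0.1   # calibrated per-channel |activation| max

# SmoothQuant migration: s_j = max|X_j|^alpha / max|W_:,j|^(1-alpha)
w_max = weight.abs().max(dim=0).values
scales = (act_max.pow(alpha) / w_max.pow(1 - alpha)).clamp(min=eps)

gamma_smoothed = gamma / scales                 # divide the norm's affine weight
weight_smoothed = weight * scales.view(1, -1)   # scale the matmul's input channels

# The transformation is numerically equivalent for any input that passes
# through the (smoothed) norm before the (smoothed) matmul.
x = torch.randn(in_ch)
y_ref = (gamma * x) @ weight.T
y_smooth = (gamma_smoothed * x) @ weight_smoothed.T
assert torch.allclose(y_ref, y_smooth, atol=1e-4)
```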
- -"""Friendli BloomForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Any, Dict, Iterator, List, Tuple, Type, cast - -import torch -from transformers.models.bloom import ( # type: ignore[import] - BloomConfig, - BloomForCausalLM, -) - -from friendli.modules.converter.base import OneOfConverter -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook - - -class SmoothQuantBloomHook(SmoothQuantHook): - """SmoothQuant Hook for BloomForCausalLM.""" - - def __init__(self, quant_config: Dict[str, Any], converter: OneOfConverter): - """Initialize SmoothQuantBloomHook.""" - super().__init__(quant_config, converter) - self.num_heads = cast(BloomConfig, converter.config).num_attention_heads - self.hidden_size = cast(BloomConfig, converter.config).hidden_size - self.head_size = self.hidden_size // self.num_heads - - def iter_smooth_norm_weights( - self, - model: BloomForCausalLM, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight pr transformer block in BloomForCausalLM.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - for index, decoder_layer in enumerate(model.transformer.h): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [ - decoder_layer.input_layernorm.weight.data, - decoder_layer.input_layernorm.bias.data, - ], - [ - decoder_layer.self_attention.query_key_value.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", # the input tensors fed into Q, K, V matrices are identical. 
- ) - # [LayerNorm 2] - [ MLP FF 1 ] gets smoothed - yield ( - [ - decoder_layer.post_attention_layernorm.weight.data, - decoder_layer.post_attention_layernorm.bias.data, - ], - [decoder_layer.mlp.dense_h_to_4h.weight.data], # [OutDim, InDim] - f"{self.quantized_layer_prefix}{index}.mlp.dense_h_to_4h", - ) - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data], - [decoder_layer.self_attention.dense.weight.data], - f"{self.quantized_layer_prefix}{index}.self_attention.dense", - ) - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data], - [decoder_layer.mlp.dense_4h_to_h.weight.data], - f"{self.quantized_layer_prefix}{index}.mlp.dense_4h_to_h", - ) - - def reshape_qkv_weight( - self, attn_layer: torch.nn.Module - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Reshapes the qkv weight in BloomForCausalLM for Quantization.""" - qkv_layer = cast(torch.nn.Linear, attn_layer.query_key_value) - split_qkv_weight_list = torch.split(qkv_layer.weight, self.head_size, dim=0) - num_heads = cast(BloomConfig, self.converter.config).num_attention_heads - - [q_weight, k_weight, v_weight] = [ - torch.cat( - [split_qkv_weight_list[j * 3 + i] for j in range(num_heads)], - dim=0, - ).reshape(-1, self.hidden_size) - for i in range(3) - ] - return q_weight, k_weight, v_weight - - def sort_qkv_output_stats(self, max_output_stat: torch.Tensor) -> torch.Tensor: - """Sort max_output_stas for seperating qkv_layer's output_stats.""" - split_qkv_output_stat = torch.split(max_output_stat, self.head_size) - qkv_output_stat_list = [ - torch.cat( - [split_qkv_output_stat[j * 3 + i] for j in range(self.num_heads)], - ) - for i in range(3) - ] - qkv_output_stat = torch.cat(qkv_output_stat_list) - return qkv_output_stat - - def iter_tf_quant_inputs(self, model: BloomForCausalLM) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of BloomForCausalLM.""" - for index, decoder_layer in enumerate(model.transformer.h): - self_attn = decoder_layer.self_attention - q_weight, k_weight, v_weight = self.reshape_qkv_weight(self_attn) - qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) - qkv_weight_out_dim = qkv_weight.size(0) - fc1 = decoder_layer.mlp.dense_h_to_4h - fc2 = decoder_layer.mlp.dense_4h_to_h - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", - 0, - qkv_weight_out_dim // 3, - self.sort_qkv_output_stats, - ), - k=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", - qkv_weight_out_dim // 3, - qkv_weight_out_dim // 3 * 2, - self.sort_qkv_output_stats, - ), - v=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", - qkv_weight_out_dim // 3 * 2, - qkv_weight_out_dim, - self.sort_qkv_output_stats, - ), - attn_fc=QuantInput( - self_attn.dense.weight, - f"{self.quantized_layer_prefix}{index}.self_attention.dense", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.mlp.dense_h_to_4h", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.mlp.dense_4h_to_h", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in BloomForCausalLM.""" - return (torch.nn.Linear,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> 
torch.nn.Linear: - """Returns the linear layer after attention in the decoder layer.""" - return decoder_layer.self_attention.dense - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.dense_4h_to_h - - def get_tf_blocks(self, model: BloomForCausalLM) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.transformer.h) diff --git a/friendli/modules/quantizer/smoothquant/models/codegen.py b/friendli/modules/quantizer/smoothquant/models/codegen.py deleted file mode 100644 index 00455186..00000000 --- a/friendli/modules/quantizer/smoothquant/models/codegen.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli CodeGenForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -import copy -from typing import Iterator, List, Tuple, Type, cast - -import torch -from transformers.models.codegen import CodeGenForCausalLM # type: ignore[import] - -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook - - -class SmoothQuantCodeGenHook(SmoothQuantHook): - """SmoothQuant Hook for CodeGenForCausalLM.""" - - def pre_smooth(self, model: torch.nn.Module) -> torch.nn.Module: - """Pre-procedure for SmoothQuant in CodeGenForCausalLM that should be called before smooth() is called.""" - super().pre_smooth(model) - for decoder_layer in cast(CodeGenForCausalLM, model).transformer.h: - decoder_layer.add_module("ln_2", copy.deepcopy(decoder_layer.ln_1)) - return model - - def iter_smooth_norm_weights( - self, - model: CodeGenForCausalLM, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight per transformer block in CodeGenForCausalLM.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - - for index, decoder_layer in enumerate(model.transformer.h): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection, MLP FF 1 ] gets smoothed - yield ( - [ - decoder_layer.ln_1.weight.data, - decoder_layer.ln_1.bias.data, - ], - [ - decoder_layer.attn.qkv_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.attn.qkv_proj", - ) - yield ( - [ - decoder_layer.ln_2.weight.data, - decoder_layer.ln_2.bias.data, - ], - [ - decoder_layer.mlp.fc_in.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.mlp.fc_in", - ) - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data], - [decoder_layer.attn.out_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - ) - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data], - [decoder_layer.mlp.fc_out.weight.data], - f"{self.quantized_layer_prefix}{index}.mlp.fc_out", - ) - - def reshape_qkv_weight( - self, attn_layer: torch.nn.Module - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Reshapes the qkv weight in CodeGenForCausalLM for Quantization.""" - qkv_layer = cast(torch.nn.Linear, attn_layer.qkv_proj) - original_qkv_weight = qkv_layer.weight - reshaped_qkv_weight = 
original_qkv_weight.reshape( - (4, original_qkv_weight.size(0) // 4, original_qkv_weight.size(1)) - ) - q_weight, v_weight, k_weight = torch.split( - reshaped_qkv_weight, reshaped_qkv_weight.size(1) // 3, dim=1 - ) - q_weight = q_weight.reshape((-1, q_weight.size(2))) - k_weight = k_weight.reshape((-1, k_weight.size(2))) - v_weight = v_weight.reshape((-1, v_weight.size(2))) - - return q_weight, k_weight, v_weight - - def sort_qkv_output_stats(self, max_output_stat: torch.Tensor) -> torch.Tensor: - """Sorts the max output stats of qkv_proj in CodeGenForCausalLM.""" - reshpaed_max_output_stat = max_output_stat.reshape( - (4, max_output_stat.size(0) // 4) - ) - q_max_output_stat, v_max_output_stat, k_max_output_stat = torch.split( - reshpaed_max_output_stat, reshpaed_max_output_stat.size(1) // 3, dim=1 - ) - q_max_output_stat = q_max_output_stat.reshape((-1,)) - k_max_output_stat = k_max_output_stat.reshape((-1,)) - v_max_output_stat = v_max_output_stat.reshape((-1,)) - return torch.cat( - (q_max_output_stat, k_max_output_stat, v_max_output_stat), dim=0 - ) - - def iter_tf_quant_inputs( - self, model: CodeGenForCausalLM - ) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of CodeGenForCausalLM.""" - for index, decoder_layer in enumerate(model.transformer.h): - self_attn = decoder_layer.attn - q_weight, k_weight, v_weight = self.reshape_qkv_weight(self_attn) - qkv_weight = torch.cat((q_weight, k_weight, v_weight), dim=0) - attn_weight_outdim = qkv_weight.size(0) # OutDim - fc1 = decoder_layer.mlp.fc_in - fc2 = decoder_layer.mlp.fc_out - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.attn.qkv_proj", - 0, - attn_weight_outdim // 3, - self.sort_qkv_output_stats, - ), - k=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.attn.qkv_proj", - attn_weight_outdim // 3, - attn_weight_outdim // 3 * 2, - self.sort_qkv_output_stats, - ), - v=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.attn.qkv_proj", - attn_weight_outdim // 3 * 2, - attn_weight_outdim, - self.sort_qkv_output_stats, - ), - attn_fc=QuantInput( - self_attn.out_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.mlp.fc_in", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.mlp.fc_out", - None, - None, - ), - ) - - @property - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Returns the list of conversion informations for modified layers in CodeGenForCausalLM.""" - convert_info_list = super().modified_layers_convert_info_list - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}ln_2.weight"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.converter.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_2.bias"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.converter.ln_bias_reshape, - ), - ] - ) - - return convert_info_list - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in CodeGenForCausalLM.""" - return (torch.nn.Linear,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after attention in the decoder layer.""" - return decoder_layer.attn.out_proj - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.fc_out - - def get_tf_blocks(self, model: CodeGenForCausalLM) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.transformer.h) diff --git a/friendli/modules/quantizer/smoothquant/models/falcon.py b/friendli/modules/quantizer/smoothquant/models/falcon.py deleted file mode 100644 index 7722f9ba..00000000 --- a/friendli/modules/quantizer/smoothquant/models/falcon.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli FalconForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Any, Dict, Iterator, List, Tuple, Type, cast - -import torch -from transformers.models.falcon import ( # type: ignore[import] - FalconConfig, - FalconForCausalLM, -) - -from friendli.modules.converter.base import OneOfConverter -from friendli.modules.converter.utils import convert_to_gpt_j_params -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook - - -class SmoothQuantFalconHook(SmoothQuantHook): - """SmoothQuant Hook for FalconForCausalLM.""" - - def __init__(self, quant_config: Dict[str, Any], converter: OneOfConverter): - """Initialize SmoothQuantFalconHook.""" - super().__init__(quant_config, converter) - config = cast(FalconConfig, converter.config) - self.num_attention_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.num_attention_heads - self.rotary_dim = self.head_size - self.num_kv_attention_heads = self.get_num_kv_attention_heads(config) - - def get_num_kv_attention_heads(self, config: FalconConfig) -> int: - """Returns the number of key-value attention heads in FalconForCausalLM.""" - if config.new_decoder_architecture: - if config.num_kv_heads is not None: - return config.num_kv_heads - return config.num_attention_heads - - if config.multi_query: - return 1 - - if config.num_kv_heads is not None: - return config.num_kv_heads - return config.num_attention_heads - - def iter_smooth_norm_weights( - self, - model: FalconForCausalLM, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight per transformer block in FalconForCausalLM.""" - quant_args = 
cast(SmoothQuantConfig, self.quant_config).smoothquant_args - for index, decoder_layer in enumerate(model.transformer.h): # type: ignore[union-attr] - if cast(FalconConfig, self.converter.config).new_decoder_architecture: - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [ - decoder_layer.ln_attn.weight.data, - decoder_layer.ln_attn.bias.data, - ], - [ - decoder_layer.self_attention.query_key_value.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", # the input tensors fed into Q, K, V matrices are identical. - ) - # [LayerNorm 2] - [ MLP FF 1 ] gets smoothed - yield ( - [ - decoder_layer.ln_mlp.weight.data, - decoder_layer.ln_mlp.bias.data, - ], - [decoder_layer.mlp.dense_h_to_4h.weight.data], # [OutDim, InDim] - f"{self.quantized_layer_prefix}{index}.mlp.dense_h_to_4h", - ) - else: - # [LayerNorm 1] - [ QKV projection ] gets smoothed ( MLP FF1 is not smoothed. No LayerNorm 2. ) - yield ( - [ - decoder_layer.input_layernorm.weight.data, - ], - [ - decoder_layer.self_attention.query_key_value.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", # the input tensors fed into Q, K, V matrices are identical. - ) - - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data], - [decoder_layer.self_attention.dense.weight.data], - f"{self.quantized_layer_prefix}{index}.self_attention.dense", - ) - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data], - [decoder_layer.mlp.dense_4h_to_h.weight.data], - f"{self.quantized_layer_prefix}{index}.mlp.dense_4h_to_h", - ) - - def reshape_qkv_weight( - self, attn_layer: torch.nn.Module - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Reshapes the qkv weight in FalconForCausalLM for Quantization.""" - qkv_weight = cast(torch.nn.Linear, attn_layer.query_key_value).weight - num_queries_per_kv = self.num_attention_heads // self.num_kv_attention_heads - - qkv_weight = qkv_weight.reshape( - self.num_kv_attention_heads, - num_queries_per_kv + 2, - self.head_size, - self.hidden_size, - ) - - q_weight = qkv_weight[:, :num_queries_per_kv].reshape( - self.num_kv_attention_heads * num_queries_per_kv, - self.head_size, - self.hidden_size, - ) - k_weight = qkv_weight[:, [-2]].reshape( - self.num_kv_attention_heads, - self.head_size, - self.hidden_size, - ) - v_weight = qkv_weight[:, [-1]].reshape( - self.num_kv_attention_heads * self.head_size, - self.hidden_size, - ) - - q_weight = convert_to_gpt_j_params(q_weight, self.rotary_dim) - k_weight = convert_to_gpt_j_params(k_weight, self.rotary_dim) - - q_weight = q_weight.reshape( - self.num_kv_attention_heads * num_queries_per_kv * self.head_size, - self.hidden_size, - ) - k_weight = k_weight.reshape( - self.num_kv_attention_heads * self.head_size, - self.hidden_size, - ) - - return q_weight, k_weight, v_weight - - def sort_qkv_output_stats(self, max_output_stat: torch.Tensor) -> torch.Tensor: - """Sort max output stats of qkv_layer in FalconForCausalLM.""" - num_queries_per_kv = self.num_attention_heads // self.num_kv_attention_heads - qkv_output_stat = max_output_stat.reshape( - self.num_kv_attention_heads, - num_queries_per_kv + 2, - self.head_size, - ) - q_out_stats = qkv_output_stat[:, :num_queries_per_kv].reshape( - self.num_kv_attention_heads * num_queries_per_kv, - self.head_size, - ) - k_out_stats = qkv_output_stat[:, [-2]].reshape( - self.num_kv_attention_heads, - self.head_size, - ) - v_out_stats = qkv_output_stat[:, [-1]].reshape( - 
self.num_kv_attention_heads * self.head_size, - ) - q_out_stats = convert_to_gpt_j_params(q_out_stats, self.rotary_dim) - k_out_stats = convert_to_gpt_j_params(k_out_stats, self.rotary_dim) - q_out_stats = q_out_stats.reshape( - self.num_kv_attention_heads * num_queries_per_kv * self.head_size, - ) - k_out_stats = k_out_stats.reshape( - self.num_kv_attention_heads * self.head_size, - ) - - return torch.cat((q_out_stats, k_out_stats, v_out_stats), dim=0) - - def iter_tf_quant_inputs(self, model: FalconForCausalLM) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of FalconForCausalLM.""" - for index, decoder_layer in enumerate(model.transformer.h): - self_attn = decoder_layer.self_attention - q_weight, k_weight, v_weight = self.reshape_qkv_weight(self_attn) - qkv_weight = torch.cat((q_weight, k_weight, v_weight), dim=0) - fc1 = decoder_layer.mlp.dense_h_to_4h - fc2 = decoder_layer.mlp.dense_4h_to_h - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", - 0, - q_weight.size(0), - self.sort_qkv_output_stats, - ), - k=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", - q_weight.size(0), - q_weight.size(0) + k_weight.size(0), - self.sort_qkv_output_stats, - ), - v=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.self_attention.query_key_value", - q_weight.size(0) + k_weight.size(0), - qkv_weight.size(0), - self.sort_qkv_output_stats, - ), - attn_fc=QuantInput( - self_attn.dense.weight, - f"{self.quantized_layer_prefix}{index}.self_attention.dense", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.mlp.dense_h_to_4h", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.mlp.dense_4h_to_h", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in FalconForCausalLM.""" - return (torch.nn.Linear,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after attention in the decoder layer.""" - return decoder_layer.self_attention.dense - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.dense_4h_to_h - - def get_tf_blocks(self, model: FalconForCausalLM) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.transformer.h) diff --git a/friendli/modules/quantizer/smoothquant/models/gpt2.py b/friendli/modules/quantizer/smoothquant/models/gpt2.py deleted file mode 100644 index 50a20695..00000000 --- a/friendli/modules/quantizer/smoothquant/models/gpt2.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
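The Falcon hook above has to undo the grouped-query fusion before per-matrix calibration: the fused `query_key_value` weight stores, for every KV head, its query heads followed by one K head and one V head. The sketch below shows that regrouping under toy dimensions; the real hook additionally applies `convert_to_gpt_j_params` to reorder the rotary dimensions, which is omitted here:

```python
import torch

# Toy GQA dimensions: 4 query heads sharing 2 KV heads.
num_kv_heads, queries_per_kv, head_size, hidden = 2, 2, 4, 16
num_heads = num_kv_heads * queries_per_kv

# Fused QKV weight rows: for each KV group, `queries_per_kv` query heads,
# then one K head, then one V head.
qkv = torch.randn((num_heads + 2 * num_kv_heads) * head_size, hidden)

grouped = qkv.reshape(num_kv_heads, queries_per_kv + 2, head_size, hidden)
q = grouped[:, :queries_per_kv].reshape(num_heads * head_size, hidden)
k = grouped[:, [-2]].reshape(num_kv_heads * head_size, hidden)
v = grouped[:, [-1]].reshape(num_kv_heads * head_size, hidden)

# Re-concatenated in Q, K, V order so start/end offsets in QuantInput can
# address each projection as a contiguous slice.
qkv_regrouped = torch.cat((q, k, v), dim=0)
print(q.shape, k.shape, v.shape, qkv_regrouped.shape)
```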
- -"""Friendli GPT2LMHeadModel QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type, cast - -import torch -from transformers.models.gpt2 import GPT2LMHeadModel # type: ignore[import] -from transformers.pytorch_utils import Conv1D # type: ignore[import] - -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook - - -class SmoothQuantGPT2Hook(SmoothQuantHook): - """SmoothQuant Hook for GPT2LMHeadModel.""" - - def iter_smooth_norm_weights( - self, model: GPT2LMHeadModel - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight per transformer block in GPT2LMHeadModel.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - for index, decoder_layer in enumerate(model.transformer.h): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [ - decoder_layer.ln_1.weight.data, - decoder_layer.ln_1.bias.data, - ], - [ - decoder_layer.attn.c_attn.weight.data.transpose( - 0, 1 - ), # [OutDim, InDim] - ], - f"{self.quantized_layer_prefix}{index}.attn.c_attn", # the input tensors fed into Q, K, V matrices are identical. - ) - # [LayerNorm 2] - [ MLP FF 1 ] gets smoothed - yield ( - [ - decoder_layer.ln_2.weight.data, - decoder_layer.ln_2.bias.data, - ], - [decoder_layer.mlp.c_fc.weight.data.transpose(0, 1)], # [OutDim, InDim] - f"{self.quantized_layer_prefix}{index}.mlp.c_fc", - ) - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data.transpose(0, 1)], - [decoder_layer.attn.c_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.attn.c_proj", - ) - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data.transpose(0, 1)], - [decoder_layer.mlp.c_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.mlp.c_proj", - ) - - def iter_tf_quant_inputs(self, model: GPT2LMHeadModel) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of GPT2LMHeadModel.""" - for index, decoder_layer in enumerate(model.transformer.h): - attn = decoder_layer.attn - attn_weight_outdim = attn.c_attn.nf # OutDim - fc1 = decoder_layer.mlp.c_fc - fc2 = decoder_layer.mlp.c_proj - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - attn.c_attn.weight.transpose(0, 1), - f"{self.quantized_layer_prefix}{index}.attn.c_attn", - 0, - attn_weight_outdim // 3, - ), - k=QuantInput( - attn.c_attn.weight.transpose(0, 1), - f"{self.quantized_layer_prefix}{index}.attn.c_attn", - attn_weight_outdim // 3, - attn_weight_outdim // 3 * 2, - ), - v=QuantInput( - attn.c_attn.weight.transpose(0, 1), - f"{self.quantized_layer_prefix}{index}.attn.c_attn", - attn_weight_outdim // 3 * 2, - attn_weight_outdim, - ), - attn_fc=QuantInput( - attn.c_proj.weight.transpose(0, 1), - f"{self.quantized_layer_prefix}{index}.attn.c_proj", - None, - None, - ), - ff1=QuantInput( - fc1.weight.transpose(0, 1), - f"{self.quantized_layer_prefix}{index}.mlp.c_fc", - None, - None, - ), - ff2=QuantInput( - fc2.weight.transpose(0, 1), - f"{self.quantized_layer_prefix}{index}.mlp.c_proj", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in 
GPT2LMHeadModel.""" - return (Conv1D,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after attention in the decoder layer.""" - return decoder_layer.attn.c_proj - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.c_proj - - def get_tf_blocks(self, model: GPT2LMHeadModel) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.transformer.h) diff --git a/friendli/modules/quantizer/smoothquant/models/gpt_neox.py b/friendli/modules/quantizer/smoothquant/models/gpt_neox.py deleted file mode 100644 index d2df5090..00000000 --- a/friendli/modules/quantizer/smoothquant/models/gpt_neox.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli GPTNeoXForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Any, Dict, Iterator, List, Tuple, Type, cast - -import torch -from transformers.models.gpt_neox import ( # type: ignore[import] - GPTNeoXConfig, - GPTNeoXForCausalLM, -) - -from friendli.modules.converter.base import OneOfConverter -from friendli.modules.converter.utils import convert_to_gpt_j_params -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook - - -class SmoothQuantGPTNeoXHook(SmoothQuantHook): - """SmoothQuant Hook for GPTNeoXForCausalLM.""" - - def __init__(self, quant_config: Dict[str, Any], converter: OneOfConverter): - """Initialize SmoothQuantGPTNeoXHook.""" - super().__init__(quant_config, converter) - config = cast(GPTNeoXConfig, converter.config) - self.num_attention_heads = config.num_attention_heads - self.num_kv_attention_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.num_attention_heads - self.rotary_dim = int(self.head_size * config.rotary_pct) - - def iter_smooth_norm_weights( - self, - model: GPTNeoXForCausalLM, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight per transformer block in GPTNeoXForCausalLM.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - for index, decoder_layer in enumerate(model.gpt_neox.layers): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [ - decoder_layer.input_layernorm.weight.data, - decoder_layer.input_layernorm.bias.data, - ], - [ - decoder_layer.attention.query_key_value.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.attention.query_key_value", # the input tensors fed into Q, K, V matrices are identical. 
- ) - # [LayerNorm 2] - [ MLP FF 1 ] gets smoothed - yield ( - [ - decoder_layer.post_attention_layernorm.weight.data, - decoder_layer.post_attention_layernorm.bias.data, - ], - [decoder_layer.mlp.dense_h_to_4h.weight.data], # [OutDim, InDim] - f"{self.quantized_layer_prefix}{index}.mlp.dense_h_to_4h", - ) - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data], - [decoder_layer.attention.dense.weight.data], - f"{self.quantized_layer_prefix}{index}.attention.dense", - ) - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data], - [decoder_layer.mlp.dense_4h_to_h.weight.data], - f"{self.quantized_layer_prefix}{index}.mlp.dense_4h_to_h", - ) - - def reshape_qkv_weight( - self, attn_layer: torch.nn.Module - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Reshape GPTNeoXForCausalLM's qkv weight for int8 quantization.""" - qkv_weight = cast(torch.nn.Linear, attn_layer).weight - qkv_weight = qkv_weight.reshape( - self.num_attention_heads, - 3, - self.head_size, - self.hidden_size, - ) - - q_weight = qkv_weight[:, 0].reshape( - self.num_attention_heads, - self.head_size, - self.hidden_size, - ) - k_weight = qkv_weight[:, 1].reshape( - self.num_attention_heads, - self.head_size, - self.hidden_size, - ) - v_weight = qkv_weight[:, 2].reshape( - self.num_attention_heads * self.head_size, - self.hidden_size, - ) - - q_weight = convert_to_gpt_j_params(param=q_weight, rotary_dim=self.rotary_dim) - k_weight = convert_to_gpt_j_params(param=k_weight, rotary_dim=self.rotary_dim) - q_weight = q_weight.reshape( - self.num_attention_heads * self.head_size, - self.hidden_size, - ) - k_weight = k_weight.reshape( - self.num_attention_heads * self.head_size, - self.hidden_size, - ) - return q_weight, k_weight, v_weight - - def sort_qkv_output_stats(self, max_output_stat: torch.Tensor) -> torch.Tensor: - """Sort max output stats of qkv_layer in GPTNeoXForCausalLM.""" - max_output_stat = max_output_stat.reshape( - self.num_attention_heads, - 3, - self.head_size, - ) - q_output_stat = max_output_stat[:, 0].reshape( - self.num_attention_heads, - self.head_size, - ) - k_output_stat = max_output_stat[:, 1].reshape( - self.num_attention_heads, - self.head_size, - ) - v_output_stat = max_output_stat[:, 2].reshape( - self.num_attention_heads * self.head_size, - ) - q_output_stat = convert_to_gpt_j_params(q_output_stat, self.rotary_dim) - k_output_stat = convert_to_gpt_j_params(k_output_stat, self.rotary_dim) - q_output_stat = q_output_stat.reshape( - self.num_attention_heads * self.head_size, - ) - k_output_stat = k_output_stat.reshape( - self.num_attention_heads * self.head_size, - ) - return torch.cat((q_output_stat, k_output_stat, v_output_stat), dim=0) - - def iter_tf_quant_inputs( - self, model: GPTNeoXForCausalLM - ) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of GPTNeoXForCausalLM.""" - for index, decoder_layer in enumerate(model.gpt_neox.layers): - attention = decoder_layer.attention - attention_weight_outdim = attention.query_key_value.weight.size(0) # OutDim - q_weight, k_weight, v_weight = self.reshape_qkv_weight( - attention.query_key_value - ) - qkv_weight = torch.cat((q_weight, k_weight, v_weight), dim=0) - fc1 = decoder_layer.mlp.dense_h_to_4h - fc2 = decoder_layer.mlp.dense_4h_to_h - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.attention.query_key_value", - 0, - 
attention_weight_outdim // 3, - self.sort_qkv_output_stats, - ), - k=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.attention.query_key_value", - attention_weight_outdim // 3, - attention_weight_outdim // 3 * 2, - self.sort_qkv_output_stats, - ), - v=QuantInput( - qkv_weight, - f"{self.quantized_layer_prefix}{index}.attention.query_key_value", - attention_weight_outdim // 3 * 2, - attention_weight_outdim, - self.sort_qkv_output_stats, - ), - attn_fc=QuantInput( - attention.dense.weight, - f"{self.quantized_layer_prefix}{index}.attention.dense", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.mlp.dense_h_to_4h", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.mlp.dense_4h_to_h", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in GPTNeoXForCausalLM.""" - return (torch.nn.Linear,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after attention in the decoder layer.""" - return decoder_layer.attention.dense - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.dense_4h_to_h - - def get_tf_blocks(self, model: GPTNeoXForCausalLM) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.gpt_neox.layers) diff --git a/friendli/modules/quantizer/smoothquant/models/gptj.py b/friendli/modules/quantizer/smoothquant/models/gptj.py deleted file mode 100644 index 77e15732..00000000 --- a/friendli/modules/quantizer/smoothquant/models/gptj.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
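Both the Bloom and GPT-NeoX hooks deal with a head-interleaved fused QKV layout, so the calibrated per-channel output statistics must be permuted the same way as the weight rows. A toy sketch of that regrouping follows; the real `sort_qkv_output_stats` also applies the rotary reordering via `convert_to_gpt_j_params`, which is skipped here:

```python
import torch

# NeoX/Bloom-style fused layout: each head stores its Q, K, V rows contiguously.
num_heads, head_size = 4, 3
out_dim = num_heads * 3 * head_size
max_output_stat = torch.arange(out_dim, dtype=torch.float32)

# Regroup to all-Q, all-K, all-V so the stats line up with
# qkv_weight = cat([q_weight, k_weight, v_weight], dim=0).
per_head = max_output_stat.reshape(num_heads, 3, head_size)
sorted_stat = torch.cat(
    [per_head[:, i].reshape(num_heads * head_size) for i in range(3)]
)
print(sorted_stat.reshape(3, num_heads, head_size))
```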
- -"""Friendli GPTJForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -import copy -from typing import Iterator, List, Tuple, Type, cast - -import torch -from transformers.models.gptj import GPTJForCausalLM # type: ignore[import] - -from friendli.modules.converter.base import DECODER_PREFIX -from friendli.modules.converter.schema import ConvertInfo -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook - - -class SmoothQuantGPTJHook(SmoothQuantHook): - """SmoothQuant Hook for GPTJForCausalLM.""" - - def pre_smooth(self, model: torch.nn.Module) -> torch.nn.Module: - """Pre-procedure for SmoothQuant in GPTJForCausalLM that should be called before smooth() is called.""" - super().pre_smooth(model) - for decoder_layer in cast(GPTJForCausalLM, model).transformer.h: - decoder_layer.add_module("ln_2", copy.deepcopy(decoder_layer.ln_1)) - return model - - def iter_smooth_norm_weights( - self, - model: GPTJForCausalLM, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight per transformer block in GPTJForCausalLM.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - for index, decoder_layer in enumerate(model.transformer.h): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection] gets smoothed - yield ( - [ - decoder_layer.ln_1.weight.data, - decoder_layer.ln_1.bias.data, - ], - [ - decoder_layer.attn.q_proj.weight.data, # [OutDim, InDim] - decoder_layer.attn.k_proj.weight.data, # [OutDim, InDim] - decoder_layer.attn.v_proj.weight.data, # [OutDim, InDim] - ], - f"{self.quantized_layer_prefix}{index}.attn.q_proj", # the input tensors fed into Q, K, V matrices are identical. 
- ) - # [LayerNorm 1] - [ MLP FF1 ] gets smoothed - yield ( - [ - decoder_layer.ln_2.weight.data, - decoder_layer.ln_2.bias.data, - ], - [ - decoder_layer.mlp.fc_in.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.mlp.fc_in", - ) - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data], - [decoder_layer.attn.out_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - ) - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data], - [decoder_layer.mlp.fc_out.weight.data], - f"{self.quantized_layer_prefix}{index}.mlp.fc_out", - ) - - def iter_tf_quant_inputs(self, model: GPTJForCausalLM) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of GPTJForCausalLM.""" - for index, decoder_layer in enumerate(model.transformer.h): - attn = decoder_layer.attn - fc1 = decoder_layer.mlp.fc_in - fc2 = decoder_layer.mlp.fc_out - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - attn.q_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.q_proj", - None, - None, - ), - k=QuantInput( - attn.k_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.k_proj", - None, - None, - ), - v=QuantInput( - attn.v_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.v_proj", - None, - None, - ), - attn_fc=QuantInput( - attn.out_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.mlp.fc_in", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.mlp.fc_out", - None, - None, - ), - ) - - @property - def modified_layers_convert_info_list( - self, - ) -> List[ConvertInfo]: - """Returns the modified layers' convert dict in GPTJForCausalLM.""" - convert_info_list = super().modified_layers_convert_info_list - - for i in range(self.converter.decoder_layer_num): - layer_prefix = f"{self.quantized_layer_prefix}{i}." 
- converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - convert_info_list.extend( - [ - ConvertInfo( - param_names=[f"{layer_prefix}ln_2.weight"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}ln_2/gamma:0", - reshape_fn=self.converter.ln_weight_reshape, - ), - ConvertInfo( - param_names=[f"{layer_prefix}ln_2.bias"], - data_type=self.converter.data_type, - converted_name=f"{converted_prefix}ln_2/beta:0", - reshape_fn=self.converter.ln_bias_reshape, - ), - ] - ) - - return convert_info_list - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in GPTJForCausalLM.""" - return (torch.nn.Linear,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after attention in the decoder layer.""" - return decoder_layer.attn.out_proj - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.fc_out - - def get_tf_blocks(self, model: GPTJForCausalLM) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.transformer.h) diff --git a/friendli/modules/quantizer/smoothquant/models/llama.py b/friendli/modules/quantizer/smoothquant/models/llama.py deleted file mode 100644 index 5256401a..00000000 --- a/friendli/modules/quantizer/smoothquant/models/llama.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli LlamaForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -import copy -from dataclasses import dataclass -from typing import Any, Dict, Iterator, List, Tuple, Type, cast - -import torch -from transformers.models.llama import ( # type: ignore[import] - LlamaConfig, - LlamaForCausalLM, -) - -from friendli.modules.converter.base import DECODER_PREFIX, OneOfConverter -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ( - ModuleName, - QuantInput, - TFQuantInputs, - TFQuantResults, - WeightActQuantResult, -) -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook -from friendli.modules.quantizer.utils import get_weight_act_quant_scales - - -@dataclass -class LlamaTFQuantInput(TFQuantInputs): - """Dataclass for int8 quantization input per layer in LlamaForCausalLM.""" "" - - ff_gate: QuantInput - - -@dataclass -class LlamaTFQuantResults(TFQuantResults): - """Dataclass for int8 quantization result per a transformer block in LlamaForCausalLM.""" "" - - ff_gate: WeightActQuantResult - - -class SmoothQuantLlamaHook(SmoothQuantHook): - """SmoothQuant Hook for LlamaForCausalLM.""" - - def __init__(self, quant_config: SmoothQuantConfig, converter: OneOfConverter): - """Initialize SmoothQuantLlamaHook.""" - super().__init__(quant_config, converter) - config = cast(LlamaConfig, converter.config) - self.num_attention_heads = config.num_attention_heads - if config.num_key_value_heads is None: - self.num_kv_attention_heads = self.num_attention_heads - else: - self.num_kv_attention_heads = config.num_key_value_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.num_attention_heads - self.rotary_dim = self.head_size - - def pre_smooth(self, model: torch.nn.Module) -> torch.nn.Module: - """Pre-procedure for SmoothQuant in LlamaForCausalLM that should be called before smooth() is called.""" - 
super().pre_smooth(model) - for decoder_layer in cast(LlamaForCausalLM, model).model.layers: - decoder_layer.add_module( - "post_attention_layernorm_2", - copy.deepcopy(decoder_layer.post_attention_layernorm), - ) - return model - - def iter_smooth_norm_weights( - self, - model: LlamaForCausalLM, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight per transformer block in LlamaForCausalLM.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - - for index, decoder_layer in enumerate(model.model.layers): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [ - decoder_layer.input_layernorm.weight.data, - ], - [ - decoder_layer.self_attn.q_proj.weight.data, - decoder_layer.self_attn.k_proj.weight.data, - decoder_layer.self_attn.v_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", # the input tensors fed into Q, K, V matrices are identical. - ) - # [LayerNorm 2] - [ MLP FF 1 ] gets smoothed - yield ( - [ - decoder_layer.post_attention_layernorm.weight.data, - ], - [ - decoder_layer.mlp.up_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.mlp.up_proj", - ) - # [LayerNomr 2] = [ MLP GATED FF ] gets smoothed - yield ( - [ - decoder_layer.post_attention_layernorm_2.weight.data, - ], - [ - decoder_layer.mlp.gate_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.mlp.gate_proj", - ) - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data], - [decoder_layer.self_attn.o_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - ) - - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data], - [decoder_layer.mlp.down_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.mlp.down_proj", - ) - - def iter_tf_quant_inputs(self, model: LlamaForCausalLM) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of LlamaForCausalLM.""" - for index, decoder_layer in enumerate(model.model.layers): - self_attn = decoder_layer.self_attn - fc1 = decoder_layer.mlp.up_proj - ff_gate = decoder_layer.mlp.gate_proj - fc2 = decoder_layer.mlp.down_proj - - yield LlamaTFQuantInput( - layer_index=index, - block=decoder_layer, - q=QuantInput( - self_attn.q_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", - None, - None, - ), - k=QuantInput( - self_attn.k_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.k_proj", - None, - None, - ), - v=QuantInput( - self_attn.v_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.v_proj", - None, - None, - ), - attn_fc=QuantInput( - self_attn.o_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.mlp.up_proj", - None, - None, - ), - ff_gate=QuantInput( - ff_gate.weight, - f"{self.quantized_layer_prefix}{index}.mlp.gate_proj", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.mlp.down_proj", - None, - None, - ), - ) - - def get_quant_result( - self, - quant_input: TFQuantInputs, - **kwargs: Any, - ) -> TFQuantResults: - """Returns the quantization result for a specific layer in LlamaForCausalLM.""" - max_input_stats: Dict[ModuleName, torch.Tensor] = kwargs["max_input_stats"] - max_output_stats: Dict[ModuleName, torch.Tensor] = 
kwargs["max_output_stats"] - - def get_scale(quant_input: QuantInput) -> WeightActQuantResult: - weight, name, start, end = ( - quant_input.weight, - quant_input.name, - quant_input.start_offset, - quant_input.end_offset, - ) - return get_weight_act_quant_scales( - name, - max_input_stats[name], - weight[start:end], - max_output_stats[name][start:end], - ) - - quant_input = cast(LlamaTFQuantInput, quant_input) - return LlamaTFQuantResults( - layer_prefix_with_index=f"{self.quantized_layer_prefix}{quant_input.layer_index}.", - q=get_scale(quant_input.q), - k=get_scale(quant_input.k), - v=get_scale(quant_input.v), - attn_fc=get_scale(quant_input.attn_fc), - ff1=get_scale(quant_input.ff1), - ff_gate=get_scale(quant_input.ff_gate), - ff2=get_scale(quant_input.ff2), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in LlamaForCausalLM.""" - return (torch.nn.Linear,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after attention in the decoder layer.""" - return decoder_layer.self_attn.o_proj - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.down_proj - - def get_tf_blocks(self, model: LlamaForCausalLM) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.model.layers) - - @property - def quantized_param_names(self) -> List[str]: - """Returns the parameter names in LlamaForCausalLM.""" - param_names = super().quantized_param_names - for i in range(self.converter.decoder_layer_num): - converted_prefix = f"{DECODER_PREFIX}/h_._{i}/" - param_names.append(f"{converted_prefix}mlp/c_gate/weight:0") - return param_names diff --git a/friendli/modules/quantizer/smoothquant/models/mpt.py b/friendli/modules/quantizer/smoothquant/models/mpt.py deleted file mode 100644 index a72561fd..00000000 --- a/friendli/modules/quantizer/smoothquant/models/mpt.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
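`get_quant_result` above slices the fused weight with each `QuantInput`'s start/end offsets and asks `get_weight_act_quant_scales` for int8 scales derived from the calibrated input/output maxima. The helper's actual signature lives in `friendli/modules/quantizer/utils.py`; the sketch below only illustrates the general slicing-plus-scaling idea with made-up per-tensor scales:

```python
import torch

# Hypothetical per-tensor W8A8 scale computation; the repo's
# get_weight_act_quant_scales() may differ (zero points, per-channel scales),
# so treat this purely as an illustration.
def int8_scales(weight_slice, in_max, out_max):
    act_in_scale = in_max.abs().max() / 127.0     # activation (input) scale
    act_out_scale = out_max.abs().max() / 127.0   # activation (output) scale
    weight_scale = weight_slice.abs().max() / 127.0
    return act_in_scale, act_out_scale, weight_scale

# A fused QKV weight addressed through (start, end) offsets, as QuantInput does.
qkv_weight = torch.randn(3 * 32, 16)
max_input_stats = torch.rand(16) * 4           # calibrated per-channel input max
max_output_stats = torch.rand(3 * 32) * 4      # calibrated per-channel output max

start, end = 0, 32                             # the "q" slice
scales = int8_scales(
    qkv_weight[start:end], max_input_stats, max_output_stats[start:end]
)

q_int8 = torch.clamp((qkv_weight[start:end] / scales[2]).round(), -128, 127).to(torch.int8)
print([s.item() for s in scales], q_int8.dtype, q_int8.shape)
```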
- -"""Friendli MPTForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type, cast - -import torch - -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook - - -class SmoothQuantMPTHook(SmoothQuantHook): - """SmoothQuant Hook for MPTForCausalLM.""" - - def iter_smooth_norm_weights( - self, - model: torch.nn.Module, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight per transformer block in MPTForCausalLM.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - - for index, decoder_layer in enumerate( - model.transformer.blocks # type: ignore[union-attr, arg-type] - ): - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [decoder_layer.norm_1.weight.data], - [decoder_layer.attn.Wqkv.weight.data], - f"{self.quantized_layer_prefix}{index}.attn.Wqkv", - ) - # [LayerNorm 2] - [ MLP FF 1 ] gets smoothed - yield ( - [decoder_layer.norm_2.weight.data], - [decoder_layer.ffn.up_proj.weight.data], # [OutDim, InDim] - f"{self.quantized_layer_prefix}{index}.ffn.up_proj", - ) - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data], - [decoder_layer.attn.out_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - ) - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data], - [decoder_layer.ffn.down_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.ffn.down_proj", - ) - - def iter_tf_quant_inputs(self, model: torch.nn.Module) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of MPTForCausalLM.""" - for index, decoder_layer in enumerate( - model.transformer.blocks # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.attn - q_outdim = ( - self.converter.decoder_num_attention_heads - * self.converter.decoder_head_size - ) - kv_outdim = ( - self.converter.decoder_num_kv_attention_heads - * self.converter.decoder_head_size - ) - qkv_outdim = self_attn.Wqkv.weight.size(0) - assert qkv_outdim == q_outdim + kv_outdim * 2 - fc1 = decoder_layer.ffn.up_proj - fc2 = decoder_layer.ffn.down_proj - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - self_attn.Wqkv.weight, - f"{self.quantized_layer_prefix}{index}.attn.Wqkv", - 0, - q_outdim, - ), - k=QuantInput( - self_attn.Wqkv.weight, - f"{self.quantized_layer_prefix}{index}.attn.Wqkv", - q_outdim, - q_outdim + kv_outdim, - ), - v=QuantInput( - self_attn.Wqkv.weight, - f"{self.quantized_layer_prefix}{index}.attn.Wqkv", - q_outdim + kv_outdim, - qkv_outdim, - ), - attn_fc=QuantInput( - self_attn.out_proj.weight, - f"{self.quantized_layer_prefix}{index}.attn.out_proj", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.ffn.up_proj", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.ffn.down_proj", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in MPTForCausalLM.""" - return (torch.nn.Linear,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after attention 
in the decoder layer.""" - return decoder_layer.attn.out_proj - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.ffn.down_proj - - def get_tf_blocks(self, model: torch.nn.Module) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.transformer.blocks) diff --git a/friendli/modules/quantizer/smoothquant/models/opt.py b/friendli/modules/quantizer/smoothquant/models/opt.py deleted file mode 100644 index ed6d8292..00000000 --- a/friendli/modules/quantizer/smoothquant/models/opt.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli OPTForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type, cast - -import torch -from transformers.models.opt import OPTForCausalLM # type: ignore[import] - -from friendli.modules.quantizer.schema.config import SmoothQuantConfig -from friendli.modules.quantizer.schema.data import ModuleName, QuantInput, TFQuantInputs -from friendli.modules.quantizer.smoothquant.base import SmoothQuantHook - - -class SmoothQuantOPTHook(SmoothQuantHook): - """SmoothQuant Hook for OPTForCausalLM.""" - - def iter_smooth_norm_weights( - self, model: OPTForCausalLM - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of layernorm's weight and linear layer's weight per transformer block in OPTForCausalLM.""" - quant_args = cast(SmoothQuantConfig, self.quant_config).smoothquant_args - for index, decoder_layer in enumerate(model.model.decoder.layers): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [ - decoder_layer.self_attn_layer_norm.weight.data, - decoder_layer.self_attn_layer_norm.bias.data, - ], - [ - decoder_layer.self_attn.q_proj.weight.data, - decoder_layer.self_attn.k_proj.weight.data, - decoder_layer.self_attn.v_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", # the input tensors fed into Q, K, V matrices are identical. 
- ) - # [LayerNorm 2] - [ MLP FF 1 ] gets smoothed - yield ( - [ - decoder_layer.final_layer_norm.weight.data, - decoder_layer.final_layer_norm.bias.data, - ], - [decoder_layer.fc1.weight.data], - f"{self.quantized_layer_prefix}{index}.fc1", - ) - if quant_args.attn_fc_smoothing: - yield ( - [decoder_layer.attn_fc_pre_smoother.scale.data], - [decoder_layer.self_attn.out_proj.weight.data], - f"{self.quantized_layer_prefix}{index}.self_attn.out_proj", - ) - if quant_args.ff2_smoothing: - yield ( - [decoder_layer.ff2_pre_smoother.scale.data], - [decoder_layer.fc2.weight.data], - f"{self.quantized_layer_prefix}{index}.fc2", - ) - - def iter_tf_quant_inputs(self, model: OPTForCausalLM) -> Iterator[TFQuantInputs]: - """Returns the layers which should be quantized in transformer block of OPTForCausalLM.""" - for index, decoder_layer in enumerate(model.model.decoder.layers): - self_attn = decoder_layer.self_attn - fc1 = decoder_layer.fc1 - fc2 = decoder_layer.fc2 - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - q=QuantInput( - self_attn.q_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", - None, - None, - ), - k=QuantInput( - self_attn.k_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.k_proj", - None, - None, - ), - v=QuantInput( - self_attn.v_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.v_proj", - None, - None, - ), - attn_fc=QuantInput( - self_attn.out_proj.weight, - f"{self.quantized_layer_prefix}{index}.self_attn.out_proj", - None, - None, - ), - ff1=QuantInput( - fc1.weight, - f"{self.quantized_layer_prefix}{index}.fc1", - None, - None, - ), - ff2=QuantInput( - fc2.weight, - f"{self.quantized_layer_prefix}{index}.fc2", - None, - None, - ), - ) - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Returns the linear layer types in OPTForCausalLM.""" - return (torch.nn.Linear,) - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after attention in the decoder layer.""" - return decoder_layer.self_attn.out_proj - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the linear layer after FF1 in the decoder layer.""" - return decoder_layer.fc2 - - def get_tf_blocks(self, model: OPTForCausalLM) -> List[torch.nn.Module]: - """Returns the decoder layers(transformer blocks) in the model.""" - return list(model.model.decoder.layers) diff --git a/friendli/modules/quantizer/utils.py b/friendli/modules/quantizer/utils.py deleted file mode 100644 index 1e47030b..00000000 --- a/friendli/modules/quantizer/utils.py +++ /dev/null @@ -1,514 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
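All of the model-specific hooks removed above share one contract: `iter_smooth_norm_weights` yields (norm affine tensors, following linear-layer weights, stat-lookup name) triples, and the base quantizer smooths each triple in place. A toy model is enough to show the shape of that contract; the class and module names below are made up for illustration and are not part of the deleted API:

```python
import torch

class ToyBlock(torch.nn.Module):
    """A stand-in transformer block: a norm feeding a linear layer."""

    def __init__(self, hidden: int = 8):
        super().__init__()
        self.ln = torch.nn.LayerNorm(hidden)
        self.fc = torch.nn.Linear(hidden, 4 * hidden)

def iter_smooth_norm_weights(blocks):
    """Yield (norm params, linear weights, stat key) per block, hook-style."""
    for i, block in enumerate(blocks):
        yield (
            [block.ln.weight.data, block.ln.bias.data],  # smoothed in place
            [block.fc.weight.data],                      # [OutDim, InDim]
            f"blocks.{i}.fc",                            # calibration-stat key
        )

blocks = [ToyBlock() for _ in range(2)]
for norms, weights, name in iter_smooth_norm_weights(blocks):
    print(name, [t.shape for t in norms], [w.shape for w in weights])
```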
- -"""Friendli Quantizer Utils.""" - -from __future__ import annotations - -import os -from contextlib import contextmanager -from itertools import islice -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - List, - Protocol, - Sequence, - Tuple, - Type, - TypeVar, - Union, -) - -import datasets # type: ignore[import] -import torch -from accelerate import cpu_offload_with_hook # type: ignore -from torch.utils.data import DataLoader -from tqdm import tqdm - -from friendli.enums import ModelDataType -from friendli.errors import InvalidConfigError, QuantizationError -from friendli.logging import logger -from friendli.modules.quantizer.schema.config import CalibrationDatasetConfig -from friendli.modules.quantizer.schema.data import ( - ModuleName, - WeightActQuantResult, - WeightOnlyQuantResult, -) - - -def scale_reshape( - params: List[torch.Tensor], -) -> torch.Tensor: - """Reshape scale/zero of quantized layers.""" - if len(params) == 1: - t = params[0] - else: - t = torch.cat(params, dim=1) - return t - - -def quantized_qkv_weight_reshape( - params: List[torch.Tensor], -) -> torch.Tensor: - """Reshape weight of quantized qkv layers.""" - assert len(params) == 3 - qkv_weight = torch.concat( - params, - dim=0, - ) # [OutDim, InDim] - - return qkv_weight.to(torch.uint8) - - -def quantized_linear_weight_reshape( - params: List[torch.Tensor], -) -> torch.Tensor: - """Reshape weight of quantized linear layers.""" - assert len(params) == 1 - - return params[0].to(torch.uint8) - - -def safe_load_datasets(data_cfg: CalibrationDatasetConfig) -> datasets.Dataset: - """Load dataset from calibration dataset config.""" - data_path = data_cfg.path_or_name - data_split = data_cfg.split - - try: - if os.path.exists(data_path): - dataset = datasets.load_dataset( - data_cfg.format, - data_files=data_path, - split=data_split, - ) - else: - data_name_parts = data_path.split(":") - if len(data_name_parts) == 1: - dataset = datasets.load_dataset(data_path, split=data_split) - elif len(data_name_parts) == 2: - data_name, subset_name = data_name_parts - dataset = datasets.load_dataset( - data_name, subset_name, split=data_split - ) - else: - raise InvalidConfigError( - "Dataset name is in invalid format. " - "(valid format: '' or ':')" - ) - except ValueError as err: - raise QuantizationError(f"datasets.load_dataset failed. {str(err)}") from err - - if not isinstance(dataset, datasets.Dataset): - raise InvalidConfigError( - "This dataset format is not supported for the calibration." - ) - - return dataset - - -T = TypeVar("T") - - -def batched(it: Iterator[T], n: int) -> Iterator[List[T]]: - """Batch an iterator into lists of size n.""" - # batched('ABCDEFG', 3) --> ABC DEF G - while True: - batch = list(islice(it, n)) - if not batch: - return - yield batch - - -def build_percentile_statistics( - scale_percentile: float, - symmetric: bool = True, -) -> Tuple[Callable, Callable, Callable]: - """Builds the hooks for getting the max input and output activations of a model.""" - logger.info( - "Building percentile statistics hooks. 
scale_percentile: (%s)", - scale_percentile, - ) - - max_input_M1: Dict[str, torch.Tensor] = {} - max_input_M2: Dict[str, torch.Tensor] = {} - max_input_num: Dict[str, torch.Tensor] = {} - max_output_M1: Dict[str, torch.Tensor] = {} - max_output_M2: Dict[str, torch.Tensor] = {} - max_output_num: Dict[str, torch.Tensor] = {} - - def create_hook(name: ModuleName): - def update_stats( - max_M1: Dict[str, torch.Tensor], - max_M2: Dict[str, torch.Tensor], - max_num: Dict[str, int], - new_t: torch.Tensor, - ) -> None: - # Chan's method for computing mean and variance incrementally - new_t = new_t.detach().reshape(-1, new_t.size(-1)) - new_numel = new_t.size(0) - new_t_M1 = new_t.to(torch.float64).mean(dim=0) - if symmetric: - # it is assumed samples are always centered on zero - # in the symmetric quantization scheme - new_t_M1.zero_() - new_t_M2 = ((new_t.to(torch.float64) - new_t_M1) ** 2).sum(dim=0) - try: - pre_numel = max_num[name] - max_num[name] += new_numel - delta = new_t_M1 - max_M1[name] - max_M1[name] += delta * (new_numel / max_num[name]) - max_M2[name] += new_t_M2 + torch.pow(delta, 2) * ( - pre_numel * new_numel / max_num[name] - ) - except KeyError: - max_num[name] = new_numel - max_M1[name] = new_t_M1 - max_M2[name] = new_t_M2 - - def hook(module, in_t_tup, out_t): # pylint: disable=unused-argument - with torch.no_grad(): - in_t = in_t_tup[0] - update_stats(max_input_M1, max_input_M2, max_input_num, in_t) - update_stats(max_output_M1, max_output_M2, max_output_num, out_t) - - return hook - - def finish_input_stats(): - return { - name: torch.distributions.Normal( - loc=max_input_M1[name], - scale=torch.sqrt(max_input_M2[name] / max_input_num[name]).clip( - min=1e-7 - ), - ).icdf( - torch.Tensor([(scale_percentile / 100.0) * 0.5 + 0.5]).to( - max_input_M1[name].device - ) - ) - for name in list(max_input_M1.keys()) - } - - def finish_output_stats(): - return { - name: torch.distributions.Normal( - loc=max_output_M1[name], - scale=torch.sqrt(max_output_M2[name] / max_output_num[name]).clip( - min=1e-7 - ), - ).icdf( - torch.Tensor([(scale_percentile / 100.0) * 0.5 + 0.5]).to( - max_output_M1[name].device - ) - ) - for name in list(max_output_M1.keys()) - } - - return finish_input_stats, finish_output_stats, create_hook - - -def build_max_statistics() -> Tuple[Callable, Callable, Callable]: - """Builds the hooks for getting the max input and output activations of a model.""" - logger.info("Building max statistics hooks") - max_input_stats: Dict[str, torch.Tensor] = {} - max_output_stats: Dict[str, torch.Tensor] = {} - - def create_hook(name: ModuleName): - def hook(modules, in_t_tup, out_t): # pylint: disable=unused-argument - in_t = in_t_tup[0] - in_t = ( - in_t.detach().abs().reshape(-1, in_t.size(-1)).max(dim=0).values - ) # reduce-max only leaving the hidden dim (supposing the last dim is the hidden dim) - out_t = out_t.detach().reshape(-1, out_t.size(-1)) - out_t = out_t.abs().max(dim=0).values - try: - max_input_stats[name] = torch.maximum(max_input_stats[name], in_t) - except KeyError: - max_input_stats[name] = in_t - try: - max_output_stats[name] = torch.maximum(max_output_stats[name], out_t) - except KeyError: - max_output_stats[name] = out_t - - return hook - - def finish_input_stats(): - return max_input_stats - - def finish_output_stats(): - return max_output_stats - - return finish_input_stats, finish_output_stats, create_hook - - -@torch.no_grad() -def collect_stats( - model: torch.nn.Module, - device: str, - dataset: datasets.Dataset, - target_classes: 
Tuple[Type[torch.nn.Module], ...], - tqdm_desc: str, - percentile: float, - batch_size: int = 1, -) -> Tuple[Dict[ModuleName, torch.Tensor], Dict[ModuleName, torch.Tensor]]: - """Collects the maximum values of input and output activations of a specific model. - - Args: - model (torch.nn.Module): The model for which we want to collect the max statistics. - dataset (Dataset): Dataset that contains input tensors. - target_classes (Tuple[Type[torch.nn.Module], ...]): A tuple of the target classes. - - Returns: - A tuple of two dictionaries: (max_input_stats, max_output_stats), where: - max_input_stats: The maximum input activation values for each module of the model. - max_output_stats: The maximum output activation values for each module of the model. - - This function uses a forward hook to capture the maximum input and output activation values - of the specified target_classes. The max_batch_size parameter controls the size of the input - batches that are passed through the model. - - The function returns two dictionaries containing the maximum input and output activation - values for each module of the model, respectively. These dictionaries can be used to calculate - scaling factors for weight quantization and activation smoothing. - - """ - # pylint: disable=too-many-locals - max_input_stats, max_output_stats, create_hook = ( - build_percentile_statistics(percentile) - if percentile < 100.0 - else build_max_statistics() - ) - name_mods = [ - (name, module) - for name, module in model.named_modules() - if isinstance(module, target_classes) - ] - - calib_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False) - removables = [] - for name, module in name_mods: - removables.append(module.register_forward_hook(create_hook(name))) - try: - for inputs in tqdm(calib_dataloader, desc=tqdm_desc): - model(inputs.to(device)) - finally: - for removable in removables: - removable.remove() - return max_input_stats(), max_output_stats() - - -def build_inps_hook(): - """Builds the hooks for getting the input and output activations of a module.""" - args_dict = {} - kwargs_dict = {} - - def create_hook(name: ModuleName): - def hook(m, args, kwargs, y): # pylint: disable=unused-argument - assert name not in args_dict - assert name not in kwargs_dict - # assumption: all positional arguments are torch.Tensor - args_dict[name] = [t.detach() for t in args] - kwargs_dict[name] = { - k: (v.detach() if isinstance(v, torch.Tensor) else v) - for k, v in kwargs.items() - } - - return hook - - return args_dict, kwargs_dict, create_hook - - -def collect_inps( - module: torch.nn.Module, - module_args: Tuple[Any, ...], - module_kwargs: Dict[str, Any], - device: str, - target_classes: Tuple[Type[torch.nn.Module], ...], -) -> Tuple[Dict[ModuleName, Tuple[Any]], Dict[ModuleName, Dict[str, Any]]]: - """Collects concated input and output activations of a specific module.""" - args_dict, kwargs_dict, create_hook = build_inps_hook() - name_mods = [ - (name, m) for name, m in module.named_modules() if isinstance(m, target_classes) - ] - - removables = [] - for name, m in name_mods: - removables.append(m.register_forward_hook(create_hook(name), with_kwargs=True)) - - module( - *((t.to(device) if isinstance(t, torch.Tensor) else t) for t in module_args), - **{ - k: (v.to(device) if isinstance(v, torch.Tensor) else v) - for k, v in module_kwargs.items() - }, - ) - - for removable in removables: - removable.remove() - - return args_dict, kwargs_dict - - -def get_torch_quant_dtype(q_bit: int = 8): - """Get torch quant 
data type from quant bit.""" - if q_bit == 8: - return torch.int8 - if q_bit == 4: - return torch.int32 # In AWQ, we use int32 to represent int4 - raise ValueError(f"Invalid quant bit: {q_bit}") - - -@torch.no_grad() -def get_weight_act_quant_scales( - layer_name: str, - input_max: torch.Tensor, - target_weight: torch.Tensor, - weight: torch.Tensor, - output_max: torch.Tensor, - device: str = "cpu", - quant_dtype: ModelDataType = ModelDataType.INT8, -) -> WeightActQuantResult: - """Get the quantization scales and int8 weight for a specific layer.""" - # shape of input_max: [InChannels] - # shape of output_max: [OutChannels] - # shape of target_weight: [OutChannels, InChannels] - assert input_max.ndim == 1 - assert output_max.ndim == 1 - - assert quant_dtype == ModelDataType.INT8 - - in_channels = input_max.size(0) - out_channels = output_max.size(0) - assert tuple(weight.size()) == (out_channels, in_channels) - - max_val = 2 ** (8 - 1) - 1 - min_val = -(2 ** (8 - 1)) - - act_scale = float(input_max.detach().abs().max().item()) / float(max_val) - weight_scale = float(target_weight.detach().abs().max().item()) / float(max_val) - - q_weight = ( - (weight.detach().float() / weight_scale) - .round() - .clip(min_val, max_val) - .to(get_torch_quant_dtype(8)) - .to(device) - ) - - return WeightActQuantResult( - layer_name, - quant_dtype=quant_dtype, - zero_point=torch.tensor(0.0), - act_scale=torch.tensor(act_scale), - weight_scale=torch.tensor(weight_scale), - q_weight=q_weight, - q_group_size=-1, - ) - - -def get_weight_only_quant_scales( - w: torch.Tensor, - q_bit: int, - q_group_size: int, - layer_name: str = "", - device: Union[str, torch.device] = "cpu", -) -> WeightOnlyQuantResult: - """Return the quantization scales of weight for a specific layer.""" - assert q_bit in [4, 8] - org_w_shape = w.shape # [OutDim, InDim] - - w = w.reshape(-1, q_group_size) # [OutDim x num_groups, group_size] - max_val = w.amax(dim=1, keepdim=True) - min_val = w.amin(dim=1, keepdim=True) - - max_int = 2**q_bit - 1 - min_int = 0 - - scales = (max_val - min_val).clamp(min=1e-5) / max_int - zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int) - - assert torch.isnan(scales).sum() == 0 - - q_weight = torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - q_weight = q_weight.reshape(org_w_shape).detach().to(device) - scales = ( - scales.view(org_w_shape[0], -1).transpose(0, 1).detach().to(device) - ) # [OutDim, num_groups] - zeros = ( - zeros.view(org_w_shape[0], -1).transpose(0, 1).detach().to(device) - ) # [OutDim, num_groups] - - assert torch.isnan(q_weight).sum() == 0 - - return WeightOnlyQuantResult( - layer_name, - quant_dtype=ModelDataType.INT4 if q_bit == 4 else ModelDataType.INT8, - zero_point=zeros, - q_group_size=q_group_size, - weight_scale=scales, - q_weight=q_weight, - ) - - -def send_model_to_device( - model: torch.nn.Module, - device: Union[str, torch.device], - *, - exclude: Iterable[torch.nn.Module] = (), -): - """Send the model and its submodules onto device except for modules designated by `exclude`.""" - exclude_set = set(exclude) - - @torch.no_grad() - def recurse(m: torch.nn.Module): - if m in exclude_set: - return - for name, p in list(m.named_parameters(recurse=False)): - m.register_parameter(name, torch.nn.Parameter(p.to(device))) - for name, b in list(m.named_buffers(recurse=False)): - m.register_buffer(name, b.to(device)) - - for child in m.children(): - recurse(child) - - recurse(model) - - -class RemovableOffloaderHook(Protocol): - """Hook protocol for cpu 
offloader.""" - - def offload(self) -> None: - """Offload the associated block onto CPU.""" - - def remove(self) -> None: - """Remove this hook.""" - - -@contextmanager -def offload_module_sequence( - blocks: Sequence[torch.nn.Module], device: Union[str, torch.device] -): - """Offload a sequence of torch modules automatically. - - In the beginning, all blocks are supposed to reside on CPU. - When i-th block is called, it is loaded onto `device` on the fly. - And at the same time, it offloads (i-1)-th block back to CPU. - """ - module_hooks: List[RemovableOffloaderHook] = [] - if blocks: - prev_module_hook = None - for tf_block in blocks: - _, module_hook = cpu_offload_with_hook( - tf_block, device, prev_module_hook=prev_module_hook - ) - prev_module_hook = module_hook - module_hooks.append(module_hook) - try: - yield - finally: - for hook in module_hooks: - hook.offload() - for hook in module_hooks: - hook.remove() diff --git a/friendli/modules/quantizer_v2/__init__.py b/friendli/modules/quantizer_v2/__init__.py deleted file mode 100644 index 9ee5a33d..00000000 --- a/friendli/modules/quantizer_v2/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Quantizer V2.""" diff --git a/friendli/modules/quantizer_v2/base.py b/friendli/modules/quantizer_v2/base.py deleted file mode 100644 index 08c48f2d..00000000 --- a/friendli/modules/quantizer_v2/base.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Quantization Interface.""" - -from __future__ import annotations - -import os -from abc import ABC, abstractmethod -from contextlib import contextmanager -from typing import Any, Dict, Iterator, List, Tuple, Type - -import huggingface_hub # type: ignore -import torch -from torch.utils.data import DataLoader -from tqdm import tqdm -from transformers import PretrainedConfig, PreTrainedModel # type: ignore - -from friendli.errors import NotSupportedQuantConfigError -from friendli.logging import logger -from friendli.modules.quantizer_v2.enums import QuantDatasetFormat -from friendli.modules.quantizer_v2.layers import ( - WeightActQuantizedLinearLayer, - WeightOnlyQuantizedLinearLayer, -) -from friendli.modules.quantizer_v2.schema.config import OneOfQuantConfig -from friendli.modules.quantizer_v2.schema.data import TFQuantInputs -from friendli.modules.quantizer_v2.utils import ( - collect_stats, - get_weight_act_quant_scales, - get_weight_only_quant_scales, - offload_module_sequence, - send_model_to_device, -) - - -class AbstractQuantHookV2(ABC): - """Abstract Quantization Hook for a specific model.""" - - def __init__(self, quant_config: OneOfQuantConfig, model_config: PretrainedConfig): - """Initialize the Quantization Hook. - - Args: - quant_config (OneOfQuantConfig): Quantization configuration. - model_config (PretrainedConfig): Model configuration. 
- """ - self.quant_config = quant_config - self.model_config = model_config - - @abstractmethod - def check_model_config(self) -> None: - """Check if the model is quantizable.""" - - @abstractmethod - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module], ...]: - """Get linear layer types in the model.""" - - @abstractmethod - def get_tf_blocks(self, model: PreTrainedModel) -> List[torch.nn.Module]: - """Get tensor fusion blocks in the model.""" - - @abstractmethod - def iter_tf_quant_inputs(self, model: PreTrainedModel) -> Iterator[TFQuantInputs]: - """Iterate over TFQuantInputs.""" - - @property - @abstractmethod - def quantized_layer_prefix(self) -> str: - """Returns the prefix of the transformer block name.""" - - -class AbstractQuantizerV2(ABC): - """Abstract class for quantizer.""" - - def __init__(self, hook: AbstractQuantHookV2, config: OneOfQuantConfig): - """Initialize AbstractQuantizer.""" - self.config = config - self.hook = hook - - def check_config(self) -> None: - """Check if the model is quantizable.""" - self.hook.check_model_config() - calibration_dataset_config = self.config.calibration_dataset - data_path_or_name = calibration_dataset_config.path_or_name - percentile = self.config.percentile - if percentile <= 0 or percentile > 100: - raise NotSupportedQuantConfigError( - invalid_option=str(percentile), - valid_options=["0 < percentile <= 100"], - ) - if not os.path.exists(data_path_or_name): - data_name = data_path_or_name.split(":")[0] - if data_name not in ( - data.id for data in huggingface_hub.list_datasets(search=data_name) - ): - raise NotSupportedQuantConfigError( - invalid_option=data_name, - valid_options=["datasets on the huggingface hub", "local path"], - ) - else: - if calibration_dataset_config.format not in QuantDatasetFormat: - raise NotSupportedQuantConfigError( - invalid_option=calibration_dataset_config.format, - valid_options=list(QuantDatasetFormat), - ) - try: - torch.device(self.config.device) - except ValueError as err: - raise NotSupportedQuantConfigError( - invalid_option=self.config.device, - valid_options=["cpu", "cuda"], - ) from err - - @contextmanager - def _try_offload_model(self, model: PreTrainedModel): - if not self.config.offload: - logger.info("Offloading not enabled. 
Skipping.") - model.to(self.config.device) - yield - else: - logger.info("Offloading enabled.") - tf_blocks = self.hook.get_tf_blocks(model) - send_model_to_device(model, self.config.device, exclude=tf_blocks) - with offload_module_sequence(tf_blocks, self.config.device): - yield - - @abstractmethod - def quantize(self, model: PreTrainedModel) -> PreTrainedModel: - """Quantize model.""" - - def pre_quantize(self, model: PreTrainedModel) -> PreTrainedModel: - """Preprocess model before quantization.""" - - def post_quantize(self, model: PreTrainedModel) -> PreTrainedModel: - """Postprocess model after quantization.""" - - @abstractmethod - def get_quant_config(self) -> Dict[str, Any]: - """Get quantizer config.""" - - -class AbstractWeightOnlyQuantizer(AbstractQuantizerV2): - """Abstract class for weight only quantizer.""" - - def quantize(self, model: PreTrainedModel) -> PreTrainedModel: - """Return quantized model.""" - with self._try_offload_model(model): - for tf_quant_inputs in tqdm( - self.hook.iter_tf_quant_inputs(model), - total=len(self.hook.get_tf_blocks(model)), - desc="Quantize model..", - ): - for quant_input in tf_quant_inputs.quant_inputs: - parent_module, local_names, names = ( - quant_input.parent_module, - quant_input.local_names, - quant_input.target_names, - ) - parent_modules_w_local_name = [] - if isinstance(parent_module, torch.nn.ModuleList): - # For MoE models with seperate expert layers - for p_module in parent_module: - for local_name in local_names: - parent_modules_w_local_name.append( - (p_module, local_name) - ) - else: - assert isinstance(parent_module, torch.nn.Module) - for local_name in local_names: - parent_modules_w_local_name.append( - (parent_module, local_name) - ) - layers = [ - p_module.get_submodule(local_name) - for p_module, local_name in parent_modules_w_local_name - ] - assert self.config.quant_scale_dtype - quant_results = get_weight_only_quant_scales( - model, - names, - quant_dtype=self.config.quant_dtype, - quant_scale_dtype=self.config.quant_scale_dtype, - q_group_size=self.config.quant_group_size, - use_symmetric=self.config.use_symmetric, - ) - q_layers = [ - WeightOnlyQuantizedLinearLayer.from_layer(layer, quant_result) - for layer, quant_result in zip(layers, quant_results) - ] - for (p_module, local_name), q_layer in zip( - parent_modules_w_local_name, q_layers - ): - setattr(p_module, local_name, q_layer) - return model - - -class AbstractWeightActQuantizer(AbstractQuantizerV2): - """Abstract class for weight and activation quantizer.""" - - @abstractmethod - def get_calib_dataloader(self) -> DataLoader: - """Get encoded calibration dataset.""" - - def quantize(self, model: PreTrainedModel) -> PreTrainedModel: - """Return quantized model.""" - with self._try_offload_model(model): - max_input_stats, _ = collect_stats( - model, - self.config.device, - self.get_calib_dataloader(), - self.hook.get_linear_layer_types(), - percentile=self.config.percentile, - tqdm_desc="Collecting stats for Static Quantization.", - ) - for tf_quant_inputs in tqdm( - self.hook.iter_tf_quant_inputs(model), - total=len(self.hook.get_tf_blocks(model)), - desc="Quantize model..", - ): - for quant_input in tf_quant_inputs.quant_inputs: - parent_module, local_names, names = ( - quant_input.parent_module, - quant_input.local_names, - quant_input.target_names, - ) - parent_modules_w_local_name = [] - if isinstance(parent_module, torch.nn.ModuleList): - # For MoE models with seperate expert layers - for p_module in parent_module: - for local_name in local_names: - 
parent_modules_w_local_name.append( - (p_module, local_name) - ) - else: - assert isinstance(parent_module, torch.nn.Module) - for local_name in local_names: - parent_modules_w_local_name.append((p_module, local_name)) - layers = [ - p_module.get_submodule(local_name) - for p_module, local_name in parent_modules_w_local_name - ] - assert self.config.quant_scale_dtype - quant_results = get_weight_act_quant_scales( - model, - names, - max_input_stats, - quant_scale_dtype=self.config.quant_scale_dtype, - quant_dtype=self.config.quant_dtype, - ) - q_layers = [ - WeightActQuantizedLinearLayer.from_layer(layer, quant_result) - for layer, quant_result in zip(layers, quant_results) - ] - for (p_module, local_name), q_layer in zip( - parent_modules_w_local_name, q_layers - ): - setattr(p_module, local_name, q_layer) - return model diff --git a/friendli/modules/quantizer_v2/enums.py b/friendli/modules/quantizer_v2/enums.py deleted file mode 100644 index 18bc60c7..00000000 --- a/friendli/modules/quantizer_v2/enums.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Quantizer Enums.""" - - -from __future__ import annotations - -from enum import Enum - - -class QuantMode(str, Enum): - """Supported quantization modes.""" - - INT8 = "int8" - DUMMY = "dummy" - - -class QuantDatasetFormat(str, Enum): - """Supported file format for calibration datasets for quantization.""" - - JSON = "json" - CSV = "csv" - PARQUET = "parquet" - TXT = "txt" - - -class Int8QuantType(str, Enum): - """Int8Quant modes.""" - - DYNAMIC = "dynamic" - - -class ModelDataType(str, Enum): - """Model dtype enums.""" - - BF16 = "bf16" - FP16 = "fp16" - FP32 = "fp32" - FP8_E4M3 = "fp8_e4m3" - INT8 = "int8" - INT4 = "int4" diff --git a/friendli/modules/quantizer_v2/int8/__init__.py b/friendli/modules/quantizer_v2/int8/__init__.py deleted file mode 100644 index 9f651b15..00000000 --- a/friendli/modules/quantizer_v2/int8/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli Int8 Quantizer.""" diff --git a/friendli/modules/quantizer_v2/int8/base.py b/friendli/modules/quantizer_v2/int8/base.py deleted file mode 100644 index 66e200a8..00000000 --- a/friendli/modules/quantizer_v2/int8/base.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Int8 Quantizer Base.""" - -from __future__ import annotations - -from abc import abstractmethod -from typing import Any, Dict, Iterator, List, Tuple, cast - -import torch -from torch.utils.data import DataLoader -from transformers import PreTrainedModel # type: ignore - -from friendli.modules.converter.utils import get_tokenizer -from friendli.modules.quantizer_v2.base import ( - AbstractQuantHookV2, - AbstractQuantizerV2, - AbstractWeightActQuantizer, - AbstractWeightOnlyQuantizer, -) -from friendli.modules.quantizer_v2.int8.utils import perform_smoothing -from friendli.modules.quantizer_v2.schema.config import Int8QuantConfig -from friendli.modules.quantizer_v2.schema.data import ModuleName -from friendli.modules.quantizer_v2.utils import collect_stats, safe_load_datasets - - -class Int8QuantHook(AbstractQuantHookV2): - """Int8 Quant Hook Base.""" - - @abstractmethod - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the attention fc layer in the decoder block.""" - - @abstractmethod - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Returns the second feed-forward layer in the decoder block.""" - - @abstractmethod - def iter_pre_act_post_act_params( - self, model: PreTrainedModel - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Returns iterator of pre_act_params and post_act_params per transformer block.""" - - -class Int8Quantizer(AbstractQuantizerV2): - """Int8 Quantizer Base.""" - - def get_smoothing_calib_dataloader(self) -> DataLoader: - """Get calibration dataset for Int8.""" - data_cfg = self.config.calibration_dataset - dataset = safe_load_datasets(data_cfg) - tokenizer = get_tokenizer(self.hook.model_config.name_or_path) - dataset = ( - dataset.shuffle(self.config.seed) - .select(range(data_cfg.num_samples)) - .select_columns([data_cfg.lookup_column_name]) - ) - encoded_dataset = tokenizer( - dataset[data_cfg.lookup_column_name], - return_tensors="pt", - truncation=True, - padding=True, - max_length=data_cfg.max_length, - ) - return DataLoader(encoded_dataset["input_ids"], batch_size=data_cfg.batch_size) - - def _smooth( - self, - model: PreTrainedModel, - ) -> None: - """Smooths the models before Quantization.""" - model.eval() - # collect stats for Int8 quantization scale. 
- with self._try_offload_model(model): - calib_dataloader = self.get_smoothing_calib_dataloader() - quant_config = cast(Int8QuantConfig, self.config) - max_input_stats, _ = collect_stats( - model, - quant_config.device, - calib_dataloader, - self.hook.get_linear_layer_types(), - tqdm_desc="Collecting stats for Smoothing.", - percentile=100.0, - ) - - for pre_act_params, post_act_params, name in cast( - Int8QuantHook, self.hook - ).iter_pre_act_post_act_params(model): - perform_smoothing( - pre_act_params, - post_act_params, - max_input_stats[name], - migration_strength=quant_config.int8_args.migration_strength, - inplace=True, - ) - - def pre_quantize( - self, - model: PreTrainedModel, - ) -> None: - """Pre-procedure that should be called before quantize() is called.""" - self._smooth(model) - - def quantize(self, model: PreTrainedModel) -> torch.nn.Module: - """Quantize the model.""" - self.pre_quantize(model) - return super().quantize(model) - - def get_quant_config(self) -> Dict[str, Any]: - """Get the quantization configuration.""" - return { - "bits": 8, - "mode": cast(Int8QuantConfig, self.config).int8_args.quant_type.value, - "zero_point": False, - "quant_method": "int8", - "quant_group_size": self.config.quant_group_size, - } - - -class Int8StaticQuantizer(Int8Quantizer, AbstractWeightActQuantizer): - """Int8 Static Quantizer Base.""" - - -class Int8DynamicQuantizer(Int8Quantizer, AbstractWeightOnlyQuantizer): - """Int8 Dynamic Quantizer Base.""" diff --git a/friendli/modules/quantizer_v2/int8/utils.py b/friendli/modules/quantizer_v2/int8/utils.py deleted file mode 100644 index c482f87d..00000000 --- a/friendli/modules/quantizer_v2/int8/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli Int8 Quantizer Base.""" - -from __future__ import annotations - -from typing import List, Tuple - -import torch - - -@torch.no_grad() -def perform_smoothing( - pre_act_params: List[torch.Tensor], - post_act_params: List[torch.Tensor], - activation_max: torch.Tensor, - *, - migration_strength: float = 0.5, - epsilon: float = 1e-5, - inplace: bool = False, -) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - """Perform activation-weight smoothing in SmoothQuant. - - Performs the activation-weight smoothing scheme described in SmoothQuant - (Xiao et al., 2023), which migrates the amplitude of outliers from activations - to weights of matmul layers. The function takes in the following parameters: - - Args: - pre_act_params: torch.Tensors representing affine parameters - before each matmul layer. - post_act_params: torch.Tensors representing the weight matrices of the matmul layer. - activation_max: The maximum activation value of inputs of the matmul layer. - migration_strength: the strength of the activation migration. Default is 0.5. - epsilon: The epsilon used for numerical stability when calculating the scales. - Default is 1e-5. - - Returns: - A tuple of two lists of torch.Tensors: (smoothed_pre_act_params, smoothed_post_act_params) - - The function calculates "scales" as `pow(|Activation|, migration_strength) / - pow(|Weight|, 1-migration_strength)` and applies the smoothing effect into - a normalization layer that exists before every matmul layer. This is done because - it is more efficient than introducing a new smoothing layer before every matmul layer. - Fusing the smoothing effect into the normalization layer results in a faster and - more efficient implementation of the smoothing scheme. 
- - The function returns the smoothed normalization coefficients and the smoothed weight - matrices after the smoothing process. - """ - # shape of activation norms: [InChannels] - # shape of fc weights: [OutChannels, InChannels] - # shape of activation_max: [InChannels] - - # pylint: disable=too-many-locals - assert pre_act_params - assert post_act_params - - in_channels = pre_act_params[0].size(0) - device = pre_act_params[0].device - dtype = pre_act_params[0].dtype - - for pre_act_param in pre_act_params: - assert pre_act_param.device == device - assert pre_act_param.dtype == dtype - - for weight in post_act_params: - assert weight.ndim == 2 - assert weight.size(1) == in_channels, (weight.size(), in_channels) - assert weight.device == device - - activation_max = activation_max.to(device=device) - weight_max = post_act_params[0].abs().max(dim=0).values - for weight in post_act_params[1:]: - weight_max = torch.maximum(weight_max, weight.abs().max(dim=0).values) - - assert tuple(activation_max.size()) == (in_channels,) - assert tuple(weight_max.size()) == (in_channels,) - alpha = migration_strength - scales = ( - ( - activation_max.to(dtype=torch.float32).pow(alpha) - / weight_max.to(dtype=torch.float32).pow(1 - alpha) - ) - .clamp(min=epsilon) - .to(dtype=dtype) - ) - - scaled_pre_act_params = [act_norm / scales for act_norm in pre_act_params] - scaled_weights = [w * scales.view(1, -1) for w in post_act_params] - - if inplace: - for dst, src in zip(pre_act_params, scaled_pre_act_params): - dst.copy_(src) - for dst, src in zip(post_act_params, scaled_weights): - dst.copy_(src) - - return scaled_pre_act_params, scaled_weights diff --git a/friendli/modules/quantizer_v2/layers.py b/friendli/modules/quantizer_v2/layers.py deleted file mode 100644 index 3a203210..00000000 --- a/friendli/modules/quantizer_v2/layers.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Quantization Layers.""" - -from __future__ import annotations - -from typing import Optional, cast - -import torch - -from friendli.modules.quantizer_v2.schema.data import ( - WeightActQuantResult, - WeightOnlyQuantResult, -) - - -class WeightOnlyQuantizedLinearLayer(torch.nn.Module): - """Linear Layer with weight only quantization.""" - - def __init__( - self, - in_features: int, - out_features: int, - q_weight: torch.Tensor, - weight_scale: torch.Tensor, - zeros: Optional[torch.nn.Parameter] = None, - bias: Optional[torch.nn.Parameter] = None, - ): - """Initialize the Weight Only Quantized Linear Layer.""" - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.weight_scale = torch.nn.Parameter(weight_scale) - self.weight = torch.nn.Parameter(q_weight, requires_grad=False) - self.register_parameter("zeros", zeros) - self.register_parameter("bias", bias) - - @staticmethod - def from_layer( - layer: torch.nn.Module, quant_result: WeightOnlyQuantResult - ) -> torch.nn.Module: - """Returns the quantized layer from the original layer.""" - zeros = ( - torch.nn.Parameter(quant_result.zero_point) - if quant_result.zero_point is not None - else None - ) - return WeightOnlyQuantizedLinearLayer( - cast(torch.nn.Linear, layer).in_features, - cast(torch.nn.Linear, layer).out_features, - quant_result.q_weight, - quant_result.weight_scale, - zeros, - cast(torch.nn.Linear, layer).bias, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward pass with fake quantization. 
Not used in conversion.""" - raise NotImplementedError("Not used in conversion.") - - -class WeightActQuantizedLinearLayer(torch.nn.Module): - """Linear Layer with weight-act quantization.""" - - def __init__( # pylint: disable=too-many-arguments - self, - q_weight: torch.Tensor, - weight_scale: torch.Tensor, - act_scale: torch.Tensor, - bias: Optional[torch.nn.Parameter] = None, - ): - """Initialize the Weight Only Quantized Linear Layer.""" - super().__init__() - self.in_scale = torch.nn.Parameter(act_scale) - self.weight_scale = torch.nn.Parameter(weight_scale) - self.weight = torch.nn.Parameter(q_weight, requires_grad=False) - self.register_parameter("bias", bias) - - @staticmethod - def from_layer( - layer: torch.nn.Module, quant_result: WeightActQuantResult - ) -> torch.nn.Module: - """Returns the quantized layer from the original layer.""" - q_result = cast(WeightActQuantResult, quant_result) - return WeightActQuantizedLinearLayer( - q_result.q_weight, - q_result.weight_scale, - q_result.act_scale, - cast(torch.nn.Linear, layer).bias if hasattr(layer, "bias") else None, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward pass with fake quantization. Not used in conversion.""" - raise NotImplementedError("Not used in conversion.") diff --git a/friendli/modules/quantizer_v2/maps.py b/friendli/modules/quantizer_v2/maps.py deleted file mode 100644 index 48e972eb..00000000 --- a/friendli/modules/quantizer_v2/maps.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli Quantizer V2 Maps.""" - -from __future__ import annotations - -from typing import Any, Dict, List, Tuple, Type, cast - -import transformers # type: ignore -from transformers import ( # type: ignore - LlamaForCausalLM, - MistralForCausalLM, - Phi3ForCausalLM, - PretrainedConfig, - PreTrainedModel, -) - -from friendli.errors import NotSupportedQuantModeError, QuantizationError -from friendli.modules.quantizer_v2.base import AbstractQuantizerV2 -from friendli.modules.quantizer_v2.enums import Int8QuantType, QuantMode -from friendli.modules.quantizer_v2.int8.base import Int8DynamicQuantizer, Int8QuantHook -from friendli.modules.quantizer_v2.models.llama import LlamaInt8QuantHook -from friendli.modules.quantizer_v2.models.phi3 import Phi3Int8QuantHook -from friendli.modules.quantizer_v2.schema.config import ( - Int8QuantConfig, - OneOfQuantConfig, -) - -model_arch_int8_hook_map: Dict[PreTrainedModel, type[Int8QuantHook]] = { - LlamaForCausalLM: LlamaInt8QuantHook, - MistralForCausalLM: LlamaInt8QuantHook, - Phi3ForCausalLM: Phi3Int8QuantHook, -} - - -def get_quanthook_map(quant_mode: QuantMode) -> Dict[Type[PreTrainedModel], Any]: - """Get quantizer map.""" - if quant_mode == QuantMode.INT8: - return model_arch_int8_hook_map - raise NotSupportedQuantModeError( - invalid_option=quant_mode, - valid_options=[e.value for e in QuantMode], - ) - - -def get_model_class(config: PretrainedConfig) -> PreTrainedModel: - """Get HuggingFace model architecture from config.""" - model_arch_list = cast(List[str], cast(PretrainedConfig, config).architectures) - if len(model_arch_list) == 0: - raise QuantizationError("Model architecture not found in config.") - model_arch = model_arch_list[0] - try: - cls_type = getattr(transformers, model_arch, None) - except AttributeError as exc: - raise QuantizationError(str(exc)) from exc - return cls_type - - -def get_quantizer_class(quant_config: OneOfQuantConfig) -> Type[AbstractQuantizerV2]: - """Get quantizer class.""" - 
quant_mode = quant_config.mode - if quant_mode == QuantMode.INT8: - if ( - cast(Int8QuantConfig, quant_config).int8_args.quant_type - == Int8QuantType.DYNAMIC - ): - return Int8DynamicQuantizer - raise QuantizationError( - "Only Dynamic quantization is supported for int8 quantization." - ) - raise NotSupportedQuantModeError( - invalid_option=quant_mode, - valid_options=[e.value for e in QuantMode], - ) - - -def get_hf_quantizer_factory( - model_config: PretrainedConfig, - quant_config: OneOfQuantConfig, -) -> Tuple[PreTrainedModel, AbstractQuantizerV2]: - """Get quantizer for specific model architecture with quant mode and args.""" - hf_model_cls = get_model_class(model_config) - quantizer = get_quantizer_class(quant_config) - quanthook_map = get_quanthook_map(quant_config.mode) - quanthook = quanthook_map[hf_model_cls](quant_config, model_config) - return hf_model_cls, quantizer(quanthook, quant_config) diff --git a/friendli/modules/quantizer_v2/models/llama.py b/friendli/modules/quantizer_v2/models/llama.py deleted file mode 100644 index 649d8471..00000000 --- a/friendli/modules/quantizer_v2/models/llama.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli LlamaForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type, cast - -import torch -from transformers import LlamaConfig, LlamaForCausalLM, PreTrainedModel - -from friendli.errors import NotSupportedCheckpointError, QuantizationError -from friendli.modules.quantizer_v2.base import AbstractQuantHookV2 -from friendli.modules.quantizer_v2.int8.base import Int8QuantHook -from friendli.modules.quantizer_v2.schema.config import Int8QuantConfig -from friendli.modules.quantizer_v2.schema.data import ( - ModuleName, - QuantInput, - TFQuantInputs, -) - - -class LlamaQuantHook(AbstractQuantHookV2): - """BaseQuantHook for LlamaForCausalLM.""" - - def check_model_config(self) -> None: - """Check if LLaMA architectures' config can be converted to Friendli format.""" - try: - if cast(LlamaConfig, self.model_config).hidden_act not in ["silu"]: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={cast(LlamaConfig, self.model_config).hidden_act}'", - valid_options=["silu"], - ) - if cast(LlamaConfig, self.model_config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if cast(LlamaConfig, self.model_config).rms_norm_eps not in (1e-5, 1e-6): - raise NotSupportedCheckpointError( - invalid_option=f"'rms_norm_eps={cast(LlamaConfig, self.model_config).rms_norm_eps}'", - valid_options=[1e-5, 1e-6], - ) - except AttributeError as exc: - raise QuantizationError(str(exc)) from exc - - def get_tf_blocks(self, model: PreTrainedModel) -> List[torch.nn.Module]: - """Return the transformer blocks in LlamaForCausalLM.""" - return model.model.layers - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Return the linear layer types in LlamaForCausalLM.""" - return (torch.nn.Linear,) - - @property - def quantized_layer_prefix(self) -> str: - """The layer name prefix used before LLaMA's transformer block number.""" - return "model.layers." 
- - -class LlamaInt8QuantHook(LlamaQuantHook, Int8QuantHook): - """Int8QuantHook for LlamaForCausalLM.""" - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Return the linear layer after attention in the decoder layer.""" - return decoder_layer.self_attn.o_proj - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Return the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.down_proj - - def iter_pre_act_post_act_params( - self, - model: LlamaForCausalLM, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Return iterator of layernorm's weight and linear layer's weight per transformer block in LlamaForCausalLM.""" - - for index, decoder_layer in enumerate(model.model.layers): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [ - decoder_layer.input_layernorm.weight.data, - ], - [ - decoder_layer.self_attn.q_proj.weight.data, - decoder_layer.self_attn.k_proj.weight.data, - decoder_layer.self_attn.v_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", # the input tensors fed into Q, K, V matrices are identical. - ) - # [LayerNorm 2] - [ MLP FF 1, MLP FF GATE ] gets smoothed - yield ( - [ - decoder_layer.post_attention_layernorm.weight.data, - ], - [ - decoder_layer.mlp.up_proj.weight.data, - decoder_layer.mlp.gate_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.mlp.up_proj", - ) - - def iter_tf_quant_inputs(self, model: PreTrainedModel) -> Iterator[TFQuantInputs]: - """Return the layers which should be quantized in transformer block of LlamaForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.self_attn - mlp = decoder_layer.mlp - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - quant_inputs=[ - QuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.q_proj", - ], - local_names=["q_proj"], - ), - QuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.k_proj", - ], - local_names=["k_proj"], - ), - QuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.v_proj", - ], - local_names=["v_proj"], - ), - QuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - ], - local_names=[ - "o_proj", - ], - ), - QuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.up_proj", - ], - local_names=["up_proj"], - ), - QuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.gate_proj", - ], - local_names=["gate_proj"], - ), - QuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.down_proj" - ], - local_names=["down_proj"], - ), - ], - ) diff --git a/friendli/modules/quantizer_v2/models/phi3.py b/friendli/modules/quantizer_v2/models/phi3.py deleted file mode 100644 index 0fdc095f..00000000 --- a/friendli/modules/quantizer_v2/models/phi3.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Phi3ForCausalLM QuantizerHook.""" - -# mypy: ignore-errors - -from __future__ import annotations - -from typing import Iterator, List, Tuple, Type, cast - -import torch -from transformers import Phi3Config, Phi3ForCausalLM, PreTrainedModel - -from friendli.errors import NotSupportedCheckpointError, QuantizationError -from friendli.modules.quantizer_v2.base import AbstractQuantHookV2 -from friendli.modules.quantizer_v2.int8.base import Int8QuantHook -from friendli.modules.quantizer_v2.schema.data import ( - ModuleName, - QuantInput, - TFQuantInputs, -) - - -class Phi3QuantHook(AbstractQuantHookV2): - """BaseQuantHook for Phi3ForCausalLM.""" - - def check_model_config(self) -> None: - """Check if Phi3 architectures' config can be converted to Friendli format.""" - try: - if cast(Phi3Config, self.model_config).hidden_act not in ["silu"]: - raise NotSupportedCheckpointError( - invalid_option=f"'hidden_act={cast(Phi3Config, self.model_config).hidden_act}'", - valid_options=["silu"], - ) - if cast(Phi3Config, self.model_config).tie_word_embeddings: - raise NotSupportedCheckpointError( - invalid_option="'tie_word_embeddings=True'", - valid_options=[False], - ) - if cast(Phi3Config, self.model_config).rms_norm_eps not in (1e-5, 1e-6): - raise NotSupportedCheckpointError( - invalid_option=f"'rms_norm_eps={cast(Phi3Config, self.model_config).rms_norm_eps}'", - valid_options=[1e-5, 1e-6], - ) - except AttributeError as exc: - raise QuantizationError(str(exc)) from exc - - def get_tf_blocks(self, model: PreTrainedModel) -> List[torch.nn.Module]: - """Return the transformer blocks in Phi3ForCausalLM.""" - return model.model.layers - - def get_linear_layer_types(self) -> Tuple[Type[torch.nn.Module]]: - """Return the linear layer types in Phi3ForCausalLM.""" - return (torch.nn.Linear,) - - @property - def quantized_layer_prefix(self) -> str: - """The layer name prefix used before Phi3's transformer block number.""" - return "model.layers." 
- - -class Phi3Int8QuantHook(Phi3QuantHook, Int8QuantHook): - """Int8QuantHook for Phi3ForCausalLM.""" - - def get_attn_fc_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Return the linear layer after attention in the decoder layer.""" - return decoder_layer.self_attn.o_proj - - def get_ff2_layer(self, decoder_layer: torch.nn.Module) -> torch.nn.Linear: - """Return the linear layer after FF1 in the decoder layer.""" - return decoder_layer.mlp.down_proj - - def iter_pre_act_post_act_params( - self, - model: Phi3ForCausalLM, - ) -> Iterator[Tuple[List[torch.Tensor], List[torch.Tensor], ModuleName]]: - """Return iterator of layernorm's weight and linear layer's weight per transformer block in Phi3ForCausalLM.""" - - for index, decoder_layer in enumerate(model.model.layers): # type: ignore[union-attr] - # [LayerNorm 1] - [ QKV projection ] gets smoothed - yield ( - [ - decoder_layer.input_layernorm.weight.data, - ], - [ - decoder_layer.self_attn.qkv_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.self_attn.qkv_proj", - ) - # [LayerNorm 2] - [ MLP FF 1, MLP FF GATE ] gets smoothed - yield ( - [ - decoder_layer.post_attention_layernorm.weight.data, - ], - [ - decoder_layer.mlp.gate_up_proj.weight.data, - ], - f"{self.quantized_layer_prefix}{index}.mlp.gate_up_proj", - ) - - def iter_tf_quant_inputs(self, model: PreTrainedModel) -> Iterator[TFQuantInputs]: - """Return the layers which should be quantized in transformer block of Phi3ForCausalLM.""" - for index, decoder_layer in enumerate( - self.get_tf_blocks(model) # type: ignore[union-attr, arg-type] - ): - self_attn = decoder_layer.self_attn - mlp = decoder_layer.mlp - - yield TFQuantInputs( - layer_index=index, - block=decoder_layer, - quant_inputs=[ - QuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.qkv_proj", - ], - local_names=["qkv_proj"], - ), - QuantInput( - parent_module=self_attn, - target_names=[ - f"{self.quantized_layer_prefix}{index}.self_attn.o_proj", - ], - local_names=[ - "o_proj", - ], - ), - QuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.gate_up_proj", - ], - local_names=["gate_up_proj"], - ), - QuantInput( - parent_module=mlp, - target_names=[ - f"{self.quantized_layer_prefix}{index}.mlp.down_proj" - ], - local_names=["down_proj"], - ), - ], - ) diff --git a/friendli/modules/quantizer_v2/quantize.py b/friendli/modules/quantizer_v2/quantize.py deleted file mode 100644 index 8187db5f..00000000 --- a/friendli/modules/quantizer_v2/quantize.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Converter.""" - -from __future__ import annotations - -from typing import Optional - -from friendli.errors import TokenizerNotFoundError -from friendli.logging import logger -from friendli.modules.quantizer_v2.maps import get_hf_quantizer_factory -from friendli.modules.quantizer_v2.schema.config import OneOfQuantConfig -from friendli.modules.quantizer_v2.utils import ( - get_model_dtype, - get_model_pretrained_config, - save_tokenizer, -) - - -def quantize_checkpoint( - model_name_or_path: str, - output_dir: str, - quant_config: OneOfQuantConfig, - *, - cache_dir: Optional[str] = None, - dry_run: bool = False, -) -> None: - """Quantize HuggingFace model checkpoint to Friendli format. - - Args: - model_name_or_path (str): Hugging Face model name or local path to the checkpoint. 
- output_dir (str): Directory path to save the converted checkpoint and the attribute YAML, - and tokenizer configuration file. - quant_config (OneOfQuantConfig): Quantization configuration. - cache_dir (Optional[str], optional): Path for downloading checkpoint. Defaults to None. - dry_run (bool, optional): Check only if the checkpoint is convertible. Defaults to False. - - Raises: - InvalidConfigError: Raised when data_type is not supported. - NotFoundError: Raised when `model_name_or_path` or `tokenizer_output_dir` is not found. - NotSupportedCheckpointError: Raised when the model architecture is not supported for quantization. - """ - model_config = get_model_pretrained_config( - model_name_or_path, output_dir, cache_dir - ) - if quant_config.quant_scale_dtype is None: - model_dtype = get_model_dtype(model_config.torch_dtype) - quant_config.quant_scale_dtype = model_dtype - logger.warn( - "quant_scale_dtype is not set. Set to %s, same as hf model dtype.", - model_dtype, - ) - hf_factory, quantizer = get_hf_quantizer_factory(model_config, quant_config) - dtype = model_config.torch_dtype - quantizer.check_config() - - if not dry_run: - logger.info( - "Start loading Hugging Face checkpoint (%s) for conversion...", - model_name_or_path, - ) - model = hf_factory.from_pretrained( - model_name_or_path, - torch_dtype=dtype, - cache_dir=cache_dir, - trust_remote_code=True, - low_cpu_mem_usage=True, - # `low_cpu_mem_usage` makes model loading faster and uses only ~1x the model size in CPU memory. - # https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained.example - ) - logger.info( - "Hugging Face checkpoint (%s) is successfully loaded!", - model_name_or_path, - ) - model = quantizer.quantize(model) - model.config.update({"quantization_config": quantizer.get_quant_config()}) - model.save_pretrained(output_dir) - try: - save_tokenizer( - model_name_or_path=model_name_or_path, - cache_dir=cache_dir, - save_dir=output_dir, - ) - except TokenizerNotFoundError as exc: - logger.warn(str(exc)) - logger.info( - "Hugging Face checkpoint (%s) is successfully quantized to Friendli format!", - model_name_or_path, - ) diff --git a/friendli/modules/quantizer_v2/schema/__init__.py b/friendli/modules/quantizer_v2/schema/__init__.py deleted file mode 100644 index f5d8dd04..00000000 --- a/friendli/modules/quantizer_v2/schema/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Quantizer Schema.""" diff --git a/friendli/modules/quantizer_v2/schema/config.py b/friendli/modules/quantizer_v2/schema/config.py deleted file mode 100644 index 37b481c2..00000000 --- a/friendli/modules/quantizer_v2/schema/config.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Friendli Model Quantizer Config Schema.""" - -from __future__ import annotations - -from typing import Literal, Optional, Union - -from pydantic import BaseModel, Field -from typing_extensions import Annotated - -from friendli.modules.quantizer_v2.enums import ( - Int8QuantType, - ModelDataType, - QuantDatasetFormat, - QuantMode, -) - - -class CalibrationDatasetConfig(BaseModel): - """Calibration dataset config.""" - - path_or_name: str = "cnn_dailymail:3.0.0" - format: QuantDatasetFormat = QuantDatasetFormat.JSON - split: str = "validation" - lookup_column_name: str = "article" - num_samples: int = 512 - max_length: int = 512 - batch_size: int = 1 - - -class AbstractQuantConfig(BaseModel): - """Abstract quantization config.""" - - mode: QuantMode - device: str = "cuda:0" - offload: bool = True - seed: int = 42 - percentile: float = 100.0 - quant_dtype: ModelDataType = ModelDataType.INT8 - quant_scale_dtype: Optional[ModelDataType] = None - use_symmetric: bool = True - quant_group_size: int = -1 # no grouping - calibration_dataset: CalibrationDatasetConfig = Field( - default_factory=CalibrationDatasetConfig - ) - - -class Int8QuantArtgs(BaseModel): - """Int8Quant args.""" - - migration_strength: float = 0.5 - quant_type: Int8QuantType = Int8QuantType.DYNAMIC - - -class Int8QuantConfig(AbstractQuantConfig): - """Int8Quant config.""" - - mode: Literal[QuantMode.INT8] = QuantMode.INT8 - int8_args: Int8QuantArtgs = Field(default_factory=Int8QuantArtgs) - - -class DummyQuantConfig(AbstractQuantConfig): - """Dummy quant config.""" - - mode: Literal[QuantMode.DUMMY] = QuantMode.DUMMY - - -OneOfQuantConfig = Annotated[ - Union[Int8QuantConfig, DummyQuantConfig], Field(discriminator="mode") -] - - -class QuantConfig(BaseModel): - """Quantization config.""" - - config: OneOfQuantConfig diff --git a/friendli/modules/quantizer_v2/schema/data.py b/friendli/modules/quantizer_v2/schema/data.py deleted file mode 100644 index a5d8e29d..00000000 --- a/friendli/modules/quantizer_v2/schema/data.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Model Quantizer Data Schema.""" -from __future__ import annotations - -from dataclasses import dataclass -from typing import List, Optional - -import torch - -ModuleName = str - - -@dataclass -class BaseQuantResult: - """Dataclass for quantization result per layer.""" - - q_group_size: int - zero_point: Optional[torch.Tensor] - q_weight: torch.Tensor - weight_scale: torch.Tensor - - -@dataclass -class WeightOnlyQuantResult(BaseQuantResult): - """Dataclass for weight-only quantization result per layer.""" - - -@dataclass -class WeightActQuantResult(BaseQuantResult): - """Dataclass for weight-activation quantization result per layer.""" - - act_scale: torch.Tensor - q_group_size: int - - -@dataclass -class QuantInput: - """Dataclass for quantization input of each layer in transformer block. - - When you want to quantize specific layers at once, the target layers should be - included in this dataclass. For example, if the quantization scale of the q_proj, - k_proj, and v_proj layers in the self-attention layer are calculated together, - the target_names and local_names of these layers should be included in the - same QuantInput dataclass. - - Attributes: - parent_module: module contains target layers. - target_names: list of target module's full name - (ex. model.model.layers.0.self_attn.q_proj, ) - local_names: list of target module's name using when access from parent_module - (ex. 
q_proj, k_proj, v_proj ) - """ - - parent_module: torch.nn.Module - target_names: List[ModuleName] - local_names: str - - -@dataclass -class TFQuantInputs: - """Dataclass for Container of per transformer block.""" - - layer_index: int - block: torch.nn.Module - quant_inputs: List[QuantInput] diff --git a/friendli/modules/quantizer_v2/utils.py b/friendli/modules/quantizer_v2/utils.py deleted file mode 100644 index 368ba95b..00000000 --- a/friendli/modules/quantizer_v2/utils.py +++ /dev/null @@ -1,565 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -"""Friendli Quantizer Utils.""" - -from __future__ import annotations - -import os -from contextlib import contextmanager -from pathlib import Path -from typing import ( - Callable, - Dict, - Iterable, - List, - Optional, - Protocol, - Sequence, - Tuple, - Type, - Union, -) - -import datasets # type: ignore[import] -import torch -from accelerate import cpu_offload_with_hook # type: ignore -from torch.utils.data import DataLoader -from tqdm import tqdm -from transformers import ( # type: ignore - AutoConfig, - AutoTokenizer, - PretrainedConfig, - PreTrainedModel, - PreTrainedTokenizer, -) - -from friendli.errors import ( - InvalidConfigError, - NotFoundError, - QuantizationError, - TokenizerNotFoundError, -) -from friendli.logging import logger -from friendli.modules.quantizer_v2.enums import ModelDataType -from friendli.modules.quantizer_v2.schema.config import CalibrationDatasetConfig -from friendli.modules.quantizer_v2.schema.data import ( - ModuleName, - WeightActQuantResult, - WeightOnlyQuantResult, -) - - -def get_tokenizer( - model_name_or_path: str, - *, - cache_dir: Optional[str] = None, -) -> PreTrainedTokenizer: - """Try to get tokenizer of a pretrained model.""" - try: - tokenizer = AutoTokenizer.from_pretrained( - model_name_or_path, - cache_dir=cache_dir, - trust_remote_code=True, - ) - except OSError as exc: - raise TokenizerNotFoundError(str(exc)) from exc - - if not tokenizer.is_fast: - raise TokenizerNotFoundError( - "This model does not support Friendli-compatible tokenizer" - ) - - if tokenizer.pad_token != "": - tokenizer.pad_token = tokenizer.eos_token - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - return tokenizer - - -def save_tokenizer( - model_name_or_path: str, - *, - cache_dir: Optional[str] = None, - save_dir: str, -) -> Tuple[str, ...]: - """Try to save `tokenizer.json` of a pretrained model.""" - if not os.path.isdir(save_dir): - raise NotFoundError(f"Directory '{save_dir}' is not found.") - - tokenizer = get_tokenizer(model_name_or_path, cache_dir=cache_dir) - saved_file_paths = tokenizer.save_pretrained(save_directory=save_dir) - tokenizer_json_path = None - for path in saved_file_paths: - if "tokenizer.json" == os.path.basename(path): - tokenizer_json_path = path - break - - if tokenizer_json_path is None: - raise TokenizerNotFoundError( - "This model has the Friendli-compatible tokenizer implementation, but " - "'tokenizer.json' file is not found." 
- ) - return saved_file_paths - - -def get_model_pretrained_config( - model_name_or_path: str, model_output_path: str, cache_dir: Optional[str] = None -) -> PretrainedConfig: - """Get HuggingFace model configs.""" - try: - config = AutoConfig.from_pretrained( - model_name_or_path, cache_dir=cache_dir, trust_remote_code=True - ) - except OSError as exc: # from AutoConfig.from_pretrained() - config_dir = Path(model_name_or_path) - model_output_dir = Path(model_output_path).parent - if config_dir.exists() and model_output_dir.absolute() == config_dir.absolute(): - raise NotFoundError( - f"'output_dir' ({model_output_dir.as_posix()}) and " - f"'model_name_or_path' ({model_name_or_path}) are the same. " - "In such a case, checkpoints should be prepared in 'output_dir'." - ) from exc - raise NotFoundError(str(exc)) from exc - - return config - - -def safe_load_datasets(data_cfg: CalibrationDatasetConfig) -> datasets.Dataset: - """Load dataset from calibration dataset config.""" - data_path = data_cfg.path_or_name - data_split = data_cfg.split - - try: - if os.path.exists(data_path): - dataset = datasets.load_dataset( - data_cfg.format, - data_files=data_path, - split=data_split, - ) - else: - data_name_parts = data_path.split(":") - if len(data_name_parts) == 1: - dataset = datasets.load_dataset(data_path, split=data_split) - elif len(data_name_parts) == 2: - data_name, subset_name = data_name_parts - dataset = datasets.load_dataset( - data_name, subset_name, split=data_split - ) - else: - raise InvalidConfigError( - "Dataset name is in invalid format. " - "(valid format: '' or ':')" - ) - except ValueError as err: - raise QuantizationError(f"datasets.load_dataset failed. {str(err)}") from err - - if not isinstance(dataset, datasets.Dataset): - raise InvalidConfigError( - "This dataset format is not supported for the calibration." - ) - - return dataset - - -def build_percentile_statistics( - scale_percentile: float, - symmetric: bool = True, -) -> Tuple[Callable, Callable, Callable]: - """Builds the hooks for getting the max input and output activations of a model.""" - logger.info( - "Building percentile statistics hooks. 
scale_percentile: (%s)", - scale_percentile, - ) - - max_input_M1: Dict[str, torch.Tensor] = {} - max_input_M2: Dict[str, torch.Tensor] = {} - max_input_num: Dict[str, torch.Tensor] = {} - max_output_M1: Dict[str, torch.Tensor] = {} - max_output_M2: Dict[str, torch.Tensor] = {} - max_output_num: Dict[str, torch.Tensor] = {} - - def create_hook(name: ModuleName): - def update_stats( - max_M1: Dict[str, torch.Tensor], - max_M2: Dict[str, torch.Tensor], - max_num: Dict[str, int], - new_t: torch.Tensor, - ) -> None: - # Chan's method for computing mean and variance incrementally - new_t = new_t.detach().reshape(-1, new_t.size(-1)) - new_numel = new_t.size(0) - new_t_M1 = new_t.to(torch.float64).mean(dim=0) - if symmetric: - # it is assumed samples are always centered on zero - # in the symmetric quantization scheme - new_t_M1.zero_() - new_t_M2 = ((new_t.to(torch.float64) - new_t_M1) ** 2).sum(dim=0) - try: - pre_numel = max_num[name] - max_num[name] += new_numel - delta = new_t_M1 - max_M1[name] - max_M1[name] += delta * (new_numel / max_num[name]) - max_M2[name] += new_t_M2 + torch.pow(delta, 2) * ( - pre_numel * new_numel / max_num[name] - ) - except KeyError: - max_num[name] = new_numel - max_M1[name] = new_t_M1 - max_M2[name] = new_t_M2 - - def hook(module, in_t_tup, out_t): # pylint: disable=unused-argument - with torch.no_grad(): - in_t = in_t_tup[0] - update_stats(max_input_M1, max_input_M2, max_input_num, in_t) - update_stats(max_output_M1, max_output_M2, max_output_num, out_t) - - return hook - - def finish_input_stats(): - return { - name: torch.distributions.Normal( - loc=max_input_M1[name], - scale=torch.sqrt(max_input_M2[name] / max_input_num[name]).clip( - min=1e-7 - ), - ).icdf( - torch.Tensor([(scale_percentile / 100.0) * 0.5 + 0.5]).to( - max_input_M1[name].device - ) - ) - for name in list(max_input_M1.keys()) - } - - def finish_output_stats(): - return { - name: torch.distributions.Normal( - loc=max_output_M1[name], - scale=torch.sqrt(max_output_M2[name] / max_output_num[name]).clip( - min=1e-7 - ), - ).icdf( - torch.Tensor([(scale_percentile / 100.0) * 0.5 + 0.5]).to( - max_output_M1[name].device - ) - ) - for name in list(max_output_M1.keys()) - } - - return finish_input_stats, finish_output_stats, create_hook - - -def build_max_statistics() -> Tuple[Callable, Callable, Callable]: - """Builds the hooks for getting the max input and output activations of a model.""" - logger.info("Building max statistics hooks") - max_input_stats: Dict[str, torch.Tensor] = {} - max_output_stats: Dict[str, torch.Tensor] = {} - - def create_hook(name: ModuleName): - def hook(modules, in_t_tup, out_t): # pylint: disable=unused-argument - in_t = in_t_tup[0] - in_t = ( - in_t.detach().abs().reshape(-1, in_t.size(-1)).max(dim=0).values - ) # reduce-max only leaving the hidden dim (supposing the last dim is the hidden dim) - out_t = out_t.detach().reshape(-1, out_t.size(-1)) - out_t = out_t.abs().max(dim=0).values - try: - max_input_stats[name] = torch.maximum(max_input_stats[name], in_t) - except KeyError: - max_input_stats[name] = in_t - try: - max_output_stats[name] = torch.maximum(max_output_stats[name], out_t) - except KeyError: - max_output_stats[name] = out_t - - return hook - - def finish_input_stats(): - return max_input_stats - - def finish_output_stats(): - return max_output_stats - - return finish_input_stats, finish_output_stats, create_hook - - -@torch.no_grad() -def collect_stats( - model: PreTrainedModel, - device: str, - calib_dataloader: DataLoader, - target_classes: 
Tuple[Type[torch.nn.Module], ...], - tqdm_desc: str, - percentile: float, -) -> Tuple[Dict[ModuleName, torch.Tensor], Dict[ModuleName, torch.Tensor]]: - """Collects the maximum values of input and output activations of a specific model. - - Args: - model (torch.nn.Module): The model for which we want to collect the max statistics. - dataset (Dataset): Dataset that contains input tensors. - target_classes (Tuple[Type[torch.nn.Module], ...]): A tuple of the target classes. - - Returns: - A tuple of two dictionaries: (max_input_stats, max_output_stats), where: - max_input_stats: The maximum input activation values for each module of the model. - max_output_stats: The maximum output activation values for each module of the model. - - This function uses a forward hook to capture the maximum input and output activation values - of the specified target_classes. The max_batch_size parameter controls the size of the input - batches that are passed through the model. - - The function returns two dictionaries containing the maximum input and output activation - values for each module of the model, respectively. These dictionaries can be used to calculate - scaling factors for weight quantization and activation smoothing. - - """ - # pylint: disable=too-many-locals - max_input_stats, max_output_stats, create_hook = ( - build_percentile_statistics(percentile) - if percentile < 100.0 - else build_max_statistics() - ) - name_mods = [ - (name, module) - for name, module in model.named_modules() - if isinstance(module, target_classes) - ] - - removables = [] - for name, module in name_mods: - removables.append(module.register_forward_hook(create_hook(name))) - try: - for inputs in tqdm(calib_dataloader, desc=tqdm_desc): - model(inputs.to(device)) - finally: - for removable in removables: - removable.remove() - return max_input_stats(), max_output_stats() - - -def convert_tensor_to_quant_dtype( - param: torch.Tensor, - quant_dtype: ModelDataType, -) -> torch.Tensor: - """Convert tensor format to the given data type. - - Args: - param (torch.Tensor): The tensor to be converted. - data_type (ModelDataType): The data type of the tensor. - - Returns: - torch.Tensor: The converted tensor. 
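# Illustrative sketch (not part of the original module): the max-statistics collection that
# collect_stats() performs, reduced to its core. A forward hook on each nn.Linear records the
# per-channel absolute maximum of its input, which later serves as the activation scale for
# INT8 quantization. The tiny model and random calibration batches are assumptions for
# illustration only.
import torch

model = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.ReLU(), torch.nn.Linear(32, 8))
max_input_stats = {}  # module name -> per-channel running max of |input|

def make_hook(name):
    def hook(module, inputs, output):  # standard forward-hook signature
        x = inputs[0].detach().abs().reshape(-1, inputs[0].size(-1)).max(dim=0).values
        prev = max_input_stats.get(name)
        max_input_stats[name] = x if prev is None else torch.maximum(prev, x)
    return hook

handles = [
    m.register_forward_hook(make_hook(n))
    for n, m in model.named_modules()
    if isinstance(m, torch.nn.Linear)
]
with torch.no_grad():
    for _ in range(4):                  # stand-in for the calibration dataloader
        model(torch.randn(2, 16))
for h in handles:
    h.remove()
# max_input_stats now maps "0" and "2" to per-channel activation maxima.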
- - """ - assert quant_dtype in [ModelDataType.INT4, ModelDataType.INT8] - if quant_dtype is ModelDataType.INT4: - pack_num = 8 // 4 - int4_param = torch.zeros( - (param.shape[0], param.shape[1] // pack_num), - dtype=torch.uint8, - device=param.device, - ) - for col in range(int4_param.shape[1]): - for i in range(pack_num): - int4_param[:, col] |= param[:, col * pack_num + i] << (i * 4) - param = int4_param.to(torch.int8) - - elif quant_dtype is ModelDataType.INT8: - param = param.to(torch.int8) - - return param.detach().to("cpu") - - -@torch.no_grad() -def get_weight_act_quant_scales( - model: PreTrainedModel, - layer_names: List[str], - max_input_stats: Dict[ModuleName, torch.Tensor], - device: str = "cpu", - quant_dtype: ModelDataType = ModelDataType.INT8, - quant_scale_dtype: ModelDataType = ModelDataType.FP32, -) -> List[WeightActQuantResult]: - """Get the quantization scales and int8 weight for a specific layer.""" - input_max = torch.concat([max_input_stats[name] for name in layer_names]) - target_weights = [model.get_submodule(name).weight for name in layer_names] - target_weight = torch.concat(target_weights) - - max_val = 2 ** (8 - 1) - 1 - min_val = -(2 ** (8 - 1)) - - act_scale = float(input_max.detach().abs().max().item()) / float(max_val) - weight_scale = float(target_weight.detach().abs().max().item()) / float(max_val) - - q_weights = [ - ( - convert_tensor_to_quant_dtype( - (weight.detach().float() / weight_scale).clip(min_val, max_val), - quant_dtype, - ).to(device) - ) - for weight in target_weights - ] - quant_scale_torch_dtype = get_torch_data_type(quant_scale_dtype) - return [ - WeightActQuantResult( - act_scale=torch.tensor(act_scale, dtype=quant_scale_torch_dtype), - weight_scale=torch.tensor(weight_scale, dtype=quant_scale_torch_dtype), - q_weight=q_weight, - q_group_size=-1, - zero_point=None, - ) - for _, q_weight in zip(layer_names, q_weights) - ] - - -def get_weight_only_quant_scales( - model: PreTrainedModel, - layer_names: List[str], - quant_dtype: ModelDataType, - quant_scale_dtype: ModelDataType, - q_group_size: int = -1, - use_symmetric: bool = True, - device: Union[str, torch.device] = "cpu", -) -> List[WeightOnlyQuantResult]: - """Return the quantization scales of weight for a specific layer.""" - # pylint: disable=too-many-locals - assert quant_dtype in [ModelDataType.INT4, ModelDataType.INT8] - q_bit = 4 if quant_dtype == ModelDataType.INT4 else 8 - target_weights = [model.get_submodule(name).weight for name in layer_names] - org_w_shape = target_weights[0].shape # [OutDim, InDim] - w = torch.concat(target_weights) - - if q_group_size != -1: - w = w.reshape(-1, q_group_size) # [OutDim x num_groups, group_size] - - if use_symmetric: - max_val = w.abs().amax(dim=1, keepdim=True) - max_int = 2 ** (q_bit - 1) - 1 - min_int = -(2 ** (q_bit - 1)) - scales = (max_val / float(max_int)).clamp(min=1e-5) - zeros = torch.zeros_like(max_val) - else: - max_val = w.amax(dim=1, keepdim=True) - min_val = w.amin(dim=1, keepdim=True) - max_int = 2**q_bit - 1 - min_int = 0 - - scales = (max_val - min_val).clamp(min=1e-5) / max_int - zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int) - - q_weights = [ - convert_tensor_to_quant_dtype( - torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - .reshape(org_w_shape) - .detach(), - quant_dtype, - ).to(device) - for w in target_weights - ] - quant_scale_torch_dtype = get_torch_data_type(quant_scale_dtype) - scales = ( - scales.view(org_w_shape[0], -1).detach().transpose(0, 1).to(device) - ) # [num_groups, 
OutDim] - zeros = ( - zeros.view(org_w_shape[0], -1).detach().transpose(0, 1).to(device) - ) # [num_groups, OutDim] - - if q_group_size == -1: - scales = scales.squeeze(0) - zeros = zeros.squeeze(0) - - return [ - WeightOnlyQuantResult( - zero_point=None if use_symmetric else zeros.to(quant_scale_torch_dtype), - q_group_size=q_group_size, - weight_scale=scales.to(quant_scale_torch_dtype), - q_weight=q_weight, - ) - for q_weight in q_weights - ] - - -def get_model_dtype(torch_dtype: torch.dtype) -> ModelDataType: - """Get torch data type from Enum.""" - if torch_dtype == torch.float16: - return ModelDataType.FP16 - if torch_dtype == torch.float32: - return ModelDataType.FP32 - if torch_dtype == torch.bfloat16: - return ModelDataType.BF16 - raise QuantizationError(f"{torch_dtype} is not valid dtype for hf model dtype.") - - -def get_torch_data_type(data_type: str) -> torch.dtype: - """Get torch data type from Enum.""" - if data_type == ModelDataType.FP16: - return torch.float16 - if data_type == ModelDataType.FP32: - return torch.float32 - if data_type == ModelDataType.BF16: - return torch.bfloat16 - raise QuantizationError( - f"Can't not converted original param to {data_type}. Only FP16, FP32, BF16 are supported." - ) - - -def send_model_to_device( - model: PreTrainedModel, - device: Union[str, torch.device], - *, - exclude: Iterable[torch.nn.Module] = (), -): - """Send the model and its submodules onto device except for modules designated by `exclude`.""" - exclude_set = set(exclude) - - @torch.no_grad() - def recurse(m: torch.nn.Module): - if m in exclude_set: - return - for name, p in list(m.named_parameters(recurse=False)): - m.register_parameter(name, torch.nn.Parameter(p.to(device))) - for name, b in list(m.named_buffers(recurse=False)): - m.register_buffer(name, b.to(device)) - - for child in m.children(): - recurse(child) - - recurse(model) - - -class RemovableOffloaderHook(Protocol): - """Hook protocol for cpu offloader.""" - - def offload(self) -> None: - """Offload the associated block onto CPU.""" - - def remove(self) -> None: - """Remove this hook.""" - - -@contextmanager -def offload_module_sequence( - blocks: Sequence[torch.nn.Module], device: Union[str, torch.device] -): - """Offload a sequence of torch modules automatically. - - In the beginning, all blocks are supposed to reside on CPU. - When i-th block is called, it is loaded onto `device` on the fly. - And at the same time, it offloads (i-1)-th block back to CPU. 
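# Illustrative sketch (not part of the original module): the symmetric, per-group weight-only
# quantization that get_weight_only_quant_scales() computes. For each group of `q_group_size`
# input channels, scale = max|w| / (2**(b-1) - 1) and q = clamp(round(w / scale)) to the signed
# b-bit range. The 4-bit width and toy weight shape are assumptions for illustration only.
import torch

q_bit, q_group_size = 4, 64
w = torch.randn(256, 1024)                      # [OutDim, InDim]
grouped = w.reshape(-1, q_group_size)           # [OutDim * num_groups, group_size]

max_int = 2 ** (q_bit - 1) - 1                  # 7 for INT4
min_int = -(2 ** (q_bit - 1))                   # -8 for INT4
scales = grouped.abs().amax(dim=1, keepdim=True).clamp(min=1e-5) / max_int
q = torch.clamp(torch.round(grouped / scales), min_int, max_int)

dequant = (q * scales).reshape(w.shape)
print((w - dequant).abs().max())                # per-element error is roughly bounded by scale / 2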
- """ - module_hooks: List[RemovableOffloaderHook] = [] - if blocks: - prev_module_hook = None - for tf_block in blocks: - _, module_hook = cpu_offload_with_hook( - tf_block, device, prev_module_hook=prev_module_hook - ) - prev_module_hook = module_hook - module_hooks.append(module_hook) - try: - yield - finally: - for hook in module_hooks: - hook.offload() - for hook in module_hooks: - hook.remove() diff --git a/friendli/sdk/api/base.py b/friendli/sdk/api/base.py index 2e6aad86..8c803c50 100644 --- a/friendli/sdk/api/base.py +++ b/friendli/sdk/api/base.py @@ -24,7 +24,12 @@ from friendli.auth import get_auth_header from friendli.errors import APIError -from friendli.utils.request import DEFAULT_REQ_TIMEOUT, transform_request_data +from friendli.utils.request import ( + DEFAULT_CONNECTION_LIMITS, + DEFAULT_REQ_TIMEOUT, + DEFAULT_TIMEOUT, + transform_request_data, +) _GenerationLine = TypeVar("_GenerationLine", bound=BaseModel) @@ -93,10 +98,26 @@ async def __anext__(self) -> _GenerationLine: _ProtoMsgType = TypeVar("_ProtoMsgType", bound=Type[pb_message.Message]) +class _DefaultHttpxClient(httpx.Client): + def __init__(self, **kwargs: Any) -> None: + kwargs.setdefault("timeout", DEFAULT_TIMEOUT) + kwargs.setdefault("limits", DEFAULT_CONNECTION_LIMITS) + kwargs.setdefault("follow_redirects", True) + super().__init__(**kwargs) + + +class _DefaultAsyncHttpxClient(httpx.AsyncClient): + def __init__(self, **kwargs: Any) -> None: + kwargs.setdefault("timeout", DEFAULT_TIMEOUT) + kwargs.setdefault("limits", DEFAULT_CONNECTION_LIMITS) + kwargs.setdefault("follow_redirects", True) + super().__init__(**kwargs) + + class BaseAPI(ABC, Generic[_HttpxClient, _ProtoMsgType]): """Base API interface.""" - _client: _HttpxClient + _http_client: _HttpxClient def __init__( self, @@ -133,13 +154,12 @@ def _build_http_request( self, data: dict[str, Any], model: Optional[str] = None ) -> httpx.Request: """Build request.""" - return self._client.build_request( + return self._http_client.build_request( method=self._method, url=self._build_http_url(), content=self._build_content(data, model), files=self._build_files(data), headers=self._get_headers(), - timeout=DEFAULT_REQ_TIMEOUT, ) def _build_http_url(self) -> httpx.URL: @@ -213,7 +233,7 @@ def __init__( endpoint_id: Optional[str] = None, use_protobuf: bool = False, use_grpc: bool = False, - client: Optional[httpx.Client] = None, + http_client: Optional[httpx.Client] = None, grpc_channel: Optional[grpc.Channel] = None, ) -> None: """Initializes ServingAPI.""" @@ -224,7 +244,7 @@ def __init__( ) self._use_grpc = use_grpc - self._client = client or httpx.Client() + self._http_client = http_client or _DefaultHttpxClient() self._grpc_channel = grpc_channel self._grpc_stub = None @@ -240,7 +260,7 @@ def close(self) -> None: """Close the gRPC channel and HTTP client.""" if self._grpc_channel: self._grpc_channel.close() - self._client.close() + self._http_client.close() def _get_grpc_stub(self, channel: grpc.Channel) -> Any: raise NotImplementedError # pragma: no cover @@ -274,7 +294,7 @@ def _request( return grpc_response http_request = self._build_http_request(data=data, model=model) - http_response = self._client.send(request=http_request, stream=stream) + http_response = self._http_client.send(request=http_request, stream=stream) self._check_http_error(http_response) return http_response @@ -302,7 +322,7 @@ def __init__( endpoint_id: Optional[str] = None, use_protobuf: bool = False, use_grpc: bool = False, - client: Optional[httpx.AsyncClient] = None, + http_client: 
Optional[httpx.AsyncClient] = None, grpc_channel: Optional[grpc.aio.Channel] = None, ) -> None: """Initializes AsyncServingAPI.""" @@ -311,7 +331,7 @@ def __init__( ) self._use_grpc = use_grpc - self._client = client or httpx.AsyncClient() + self._http_client = http_client or _DefaultAsyncHttpxClient() self._grpc_channel = grpc_channel self._grpc_stub = None @@ -327,7 +347,7 @@ async def close(self) -> None: """Close the gRPC channel and HTTP client.""" if self._grpc_channel: await self._grpc_channel.close(grace=None) - await self._client.aclose() + await self._http_client.aclose() def _get_grpc_stub(self, channel: grpc.aio.Channel) -> Any: raise NotImplementedError # pragma: no cover @@ -363,7 +383,9 @@ async def _request( return grpc_response http_request = self._build_http_request(data=data, model=model) - http_response = await self._client.send(request=http_request, stream=stream) + http_response = await self._http_client.send( + request=http_request, stream=stream + ) await self._check_http_error(http_response) return http_response diff --git a/friendli/sdk/api/chat/chat.py b/friendli/sdk/api/chat/chat.py index 9741794d..7351ac01 100644 --- a/friendli/sdk/api/chat/chat.py +++ b/friendli/sdk/api/chat/chat.py @@ -24,7 +24,7 @@ def __init__( endpoint_id: Optional[str] = None, use_protobuf: bool = False, use_grpc: bool = False, - client: Optional[httpx.Client] = None, + http_client: Optional[httpx.Client] = None, grpc_channel: Optional[grpc.Channel] = None, ) -> None: """Initializes Chat.""" @@ -33,7 +33,7 @@ def __init__( endpoint_id=endpoint_id, use_protobuf=use_protobuf, use_grpc=use_grpc, - client=client, + http_client=http_client, grpc_channel=grpc_channel, ) @@ -53,7 +53,7 @@ def __init__( endpoint_id: Optional[str] = None, use_protobuf: bool = False, use_grpc: bool = False, - client: Optional[httpx.AsyncClient] = None, + http_client: Optional[httpx.AsyncClient] = None, grpc_channel: Optional[grpc.aio.Channel] = None, ) -> None: """Initializes AsyncChat.""" @@ -62,7 +62,7 @@ def __init__( endpoint_id=endpoint_id, use_protobuf=use_protobuf, use_grpc=use_grpc, - client=client, + http_client=http_client, grpc_channel=grpc_channel, ) diff --git a/friendli/sdk/api/images/images.py b/friendli/sdk/api/images/images.py index 1901c501..fb58e658 100644 --- a/friendli/sdk/api/images/images.py +++ b/friendli/sdk/api/images/images.py @@ -20,11 +20,11 @@ def __init__( self, base_url: Optional[str] = None, endpoint_id: Optional[str] = None, - client: Optional[httpx.Client] = None, + http_client: Optional[httpx.Client] = None, ) -> None: """Initialize Images.""" self.text_to_image = TextToImage( - base_url=base_url, endpoint_id=endpoint_id, client=client + base_url=base_url, endpoint_id=endpoint_id, http_client=http_client ) def close(self) -> None: @@ -41,11 +41,11 @@ def __init__( self, base_url: Optional[str] = None, endpoint_id: Optional[str] = None, - client: Optional[httpx.AsyncClient] = None, + http_client: Optional[httpx.AsyncClient] = None, ) -> None: """Initialize Images.""" self.text_to_image = AsyncTextToImage( - base_url=base_url, endpoint_id=endpoint_id, client=client + base_url=base_url, endpoint_id=endpoint_id, http_client=http_client ) async def close(self) -> None: diff --git a/friendli/sdk/client.py b/friendli/sdk/client.py index 6ef97e53..6930641c 100644 --- a/friendli/sdk/client.py +++ b/friendli/sdk/client.py @@ -8,6 +8,7 @@ import grpc import grpc.aio +import httpx import friendli from friendli.client.graphql.endpoint import EndpointGqlClient @@ -34,6 +35,7 @@ def __init__( 
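# Illustrative sketch (not part of this patch): with the `http_client` parameter introduced
# above, callers can inject their own httpx.Client in place of the default one (600 s request
# timeout, 5 s connect timeout, pooled connections, redirects followed). The timeout and limit
# values below are caller choices, not SDK defaults, and API credentials are assumed to be
# configured elsewhere (e.g., via the environment).
import httpx

from friendli.sdk.client import Friendli

client = Friendli(
    http_client=httpx.Client(
        timeout=httpx.Timeout(timeout=60.0, connect=3.0),
        limits=httpx.Limits(max_connections=50, max_keepalive_connections=10),
    ),
)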
base_url: Optional[str] = None, use_protobuf: bool = False, use_grpc: bool = False, + http_client: Optional[Union[httpx.Client, httpx.AsyncClient]] = None, grpc_channel: Optional[Union[grpc.Channel, grpc.aio.Channel]] = None, ): """Initializes FriendliClientBase.""" @@ -52,6 +54,8 @@ def __init__( raise ValueError( "One of `base_url` and `grpc_channel` should be set when `use_grpc=True`." ) + if http_client is not None: + raise ValueError("You cannot use HTTP client when `use_grpc=True`.") else: if grpc_channel is not None: raise ValueError( @@ -80,6 +84,7 @@ def __init__( base_url: Optional[str] = None, use_protobuf: bool = False, use_grpc: bool = False, + http_client: Optional[httpx.Client] = None, grpc_channel: Optional[grpc.Channel] = None, ): """Initializes Friendli.""" @@ -91,6 +96,7 @@ def __init__( base_url=base_url, use_protobuf=use_protobuf, use_grpc=use_grpc, + http_client=http_client, grpc_channel=grpc_channel, ) @@ -99,6 +105,7 @@ def __init__( endpoint_id=self._endpoint_id, use_protobuf=use_protobuf, use_grpc=use_grpc, + http_client=http_client, grpc_channel=grpc_channel, ) self.chat = Chat( @@ -106,9 +113,14 @@ def __init__( endpoint_id=self._endpoint_id, use_protobuf=use_protobuf, use_grpc=use_grpc, + http_client=http_client, grpc_channel=grpc_channel, ) - self.images = Images(base_url=self._base_url, endpoint_id=self._endpoint_id) + self.images = Images( + base_url=self._base_url, + endpoint_id=self._endpoint_id, + http_client=http_client, + ) endpoint_client = EndpointGqlClient() model_client = ModelGqlClient() @@ -147,6 +159,7 @@ def __init__( base_url: Optional[str] = None, use_protobuf: bool = False, use_grpc: bool = False, + http_client: Optional[httpx.AsyncClient] = None, grpc_channel: Optional[grpc.aio.Channel] = None, ): """Initializes AsyncFriendli.""" @@ -158,6 +171,7 @@ def __init__( base_url=base_url, use_protobuf=use_protobuf, use_grpc=use_grpc, + http_client=http_client, grpc_channel=grpc_channel, ) @@ -166,6 +180,7 @@ def __init__( endpoint_id=self._endpoint_id, use_protobuf=use_protobuf, use_grpc=use_grpc, + http_client=http_client, grpc_channel=grpc_channel, ) self.chat = AsyncChat( @@ -173,10 +188,13 @@ def __init__( endpoint_id=self._endpoint_id, use_protobuf=use_protobuf, use_grpc=use_grpc, + http_client=http_client, grpc_channel=grpc_channel, ) self.images = AsyncImages( - base_url=self._base_url, endpoint_id=self._endpoint_id + base_url=self._base_url, + endpoint_id=self._endpoint_id, + http_client=http_client, ) async def __aenter__(self) -> AsyncFriendli: diff --git a/friendli/utils/request.py b/friendli/utils/request.py index 49157af5..9fb9e6bf 100644 --- a/friendli/utils/request.py +++ b/friendli/utils/request.py @@ -6,14 +6,19 @@ from typing import Any +import httpx import pydantic from requests.exceptions import HTTPError from friendli.utils.compat import model_dump from friendli.utils.url import discuss_url -DEFAULT_REQ_TIMEOUT = 30 +DEFAULT_REQ_TIMEOUT = 600.0 MAX_RETRIES = 3 +DEFAULT_TIMEOUT = httpx.Timeout(timeout=DEFAULT_REQ_TIMEOUT, connect=5.0) +DEFAULT_CONNECTION_LIMITS = httpx.Limits( + max_connections=1000, max_keepalive_connections=100 +) def decode_http_err(exc: HTTPError) -> str: diff --git a/friendli/utils/validate.py b/friendli/utils/validate.py index 8557d666..0e395266 100644 --- a/friendli/utils/validate.py +++ b/friendli/utils/validate.py @@ -6,7 +6,6 @@ from datetime import datetime from enum import Enum -from importlib.util import find_spec from typing import Any, Dict, Optional, Type import typer @@ -81,16 +80,3 @@ def 
validate_enums(val: Any, enum_cls: Type[Enum]) -> Any: raise InvalidConfigError( f"Invalid value. Please provide one of {supported_values}" ) from exc - - -def validate_convert_imports() -> None: - """Validate the import modules for checkpoint conversion.""" - if find_spec("torch") is None: - raise ModuleNotFoundError( - "To convert the checkpoint, you must install 'torch'." - ) - if find_spec("transformers") is None or find_spec("accelerate") is None: - raise ModuleNotFoundError( - "To convert the checkpoint," - " your must install the package with 'pip install \"friendli-client[mllib]\"'" - ) diff --git a/friendli/utils/version.py b/friendli/utils/version.py index 55cf390b..ba7e9574 100644 --- a/friendli/utils/version.py +++ b/friendli/utils/version.py @@ -48,8 +48,3 @@ def check_dependencies_installed(deps: List[str]) -> bool: return False return True - - -def check_extras_installed() -> bool: - """Check extra package dependencies are installed.""" - return check_dependencies_installed(["torch", "transformers"]) diff --git a/poetry.lock b/poetry.lock index 763e0739..b2a31c21 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,142 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. - -[[package]] -name = "accelerate" -version = "0.21.0" -description = "Accelerate" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "accelerate-0.21.0-py3-none-any.whl", hash = "sha256:e2609d37f2c6a56e36a0612feae6ff6d9daac9759f4899432b86b1dc97024ebb"}, - {file = "accelerate-0.21.0.tar.gz", hash = "sha256:e2959a0bf74d97c0b3c0e036ed96065142a060242281d27970d4c4e34f11ca59"}, -] - -[package.dependencies] -numpy = ">=1.17" -packaging = ">=20.0" -psutil = "*" -pyyaml = "*" -torch = ">=1.10.0" - -[package.extras] -dev = ["black (>=23.1,<24.0)", "datasets", "deepspeed", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.0.241)", "scikit-learn", "scipy", "tqdm", "transformers", "urllib3 (<2.0.0)"] -quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.0.241)", "urllib3 (<2.0.0)"] -rich = ["rich"] -sagemaker = ["sagemaker"] -test-dev = ["datasets", "deepspeed", "evaluate", "scikit-learn", "scipy", "tqdm", "transformers"] -test-prod = ["parameterized", "pytest", "pytest-subtests", "pytest-xdist"] -test-trackers = ["comet-ml", "tensorboard", "wandb"] -testing = ["datasets", "deepspeed", "evaluate", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "tqdm", "transformers"] - -[[package]] -name = "aiohttp" -version = "3.9.3" -description = "Async http client/server framework (asyncio)" -optional = true -python-versions = ">=3.8" -files = [ - {file = "aiohttp-3.9.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:939677b61f9d72a4fa2a042a5eee2a99a24001a67c13da113b2e30396567db54"}, - {file = "aiohttp-3.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1f5cd333fcf7590a18334c90f8c9147c837a6ec8a178e88d90a9b96ea03194cc"}, - {file = "aiohttp-3.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:82e6aa28dd46374f72093eda8bcd142f7771ee1eb9d1e223ff0fa7177a96b4a5"}, - {file = "aiohttp-3.9.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f56455b0c2c7cc3b0c584815264461d07b177f903a04481dfc33e08a89f0c26b"}, - {file = "aiohttp-3.9.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bca77a198bb6e69795ef2f09a5f4c12758487f83f33d63acde5f0d4919815768"}, - {file = 
"aiohttp-3.9.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e083c285857b78ee21a96ba1eb1b5339733c3563f72980728ca2b08b53826ca5"}, - {file = "aiohttp-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab40e6251c3873d86ea9b30a1ac6d7478c09277b32e14745d0d3c6e76e3c7e29"}, - {file = "aiohttp-3.9.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df822ee7feaaeffb99c1a9e5e608800bd8eda6e5f18f5cfb0dc7eeb2eaa6bbec"}, - {file = "aiohttp-3.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:acef0899fea7492145d2bbaaaec7b345c87753168589cc7faf0afec9afe9b747"}, - {file = "aiohttp-3.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:cd73265a9e5ea618014802ab01babf1940cecb90c9762d8b9e7d2cc1e1969ec6"}, - {file = "aiohttp-3.9.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a78ed8a53a1221393d9637c01870248a6f4ea5b214a59a92a36f18151739452c"}, - {file = "aiohttp-3.9.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:6b0e029353361f1746bac2e4cc19b32f972ec03f0f943b390c4ab3371840aabf"}, - {file = "aiohttp-3.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7cf5c9458e1e90e3c390c2639f1017a0379a99a94fdfad3a1fd966a2874bba52"}, - {file = "aiohttp-3.9.3-cp310-cp310-win32.whl", hash = "sha256:3e59c23c52765951b69ec45ddbbc9403a8761ee6f57253250c6e1536cacc758b"}, - {file = "aiohttp-3.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:055ce4f74b82551678291473f66dc9fb9048a50d8324278751926ff0ae7715e5"}, - {file = "aiohttp-3.9.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6b88f9386ff1ad91ace19d2a1c0225896e28815ee09fc6a8932fded8cda97c3d"}, - {file = "aiohttp-3.9.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c46956ed82961e31557b6857a5ca153c67e5476972e5f7190015018760938da2"}, - {file = "aiohttp-3.9.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:07b837ef0d2f252f96009e9b8435ec1fef68ef8b1461933253d318748ec1acdc"}, - {file = "aiohttp-3.9.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad46e6f620574b3b4801c68255492e0159d1712271cc99d8bdf35f2043ec266"}, - {file = "aiohttp-3.9.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ed3e046ea7b14938112ccd53d91c1539af3e6679b222f9469981e3dac7ba1ce"}, - {file = "aiohttp-3.9.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:039df344b45ae0b34ac885ab5b53940b174530d4dd8a14ed8b0e2155b9dddccb"}, - {file = "aiohttp-3.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7943c414d3a8d9235f5f15c22ace69787c140c80b718dcd57caaade95f7cd93b"}, - {file = "aiohttp-3.9.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84871a243359bb42c12728f04d181a389718710129b36b6aad0fc4655a7647d4"}, - {file = "aiohttp-3.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5eafe2c065df5401ba06821b9a054d9cb2848867f3c59801b5d07a0be3a380ae"}, - {file = "aiohttp-3.9.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:9d3c9b50f19704552f23b4eaea1fc082fdd82c63429a6506446cbd8737823da3"}, - {file = "aiohttp-3.9.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:f033d80bc6283092613882dfe40419c6a6a1527e04fc69350e87a9df02bbc283"}, - {file = "aiohttp-3.9.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:2c895a656dd7e061b2fd6bb77d971cc38f2afc277229ce7dd3552de8313a483e"}, - {file = "aiohttp-3.9.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:1f5a71d25cd8106eab05f8704cd9167b6e5187bcdf8f090a66c6d88b634802b4"}, - {file = "aiohttp-3.9.3-cp311-cp311-win32.whl", hash = "sha256:50fca156d718f8ced687a373f9e140c1bb765ca16e3d6f4fe116e3df7c05b2c5"}, - {file = "aiohttp-3.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:5fe9ce6c09668063b8447f85d43b8d1c4e5d3d7e92c63173e6180b2ac5d46dd8"}, - {file = "aiohttp-3.9.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:38a19bc3b686ad55804ae931012f78f7a534cce165d089a2059f658f6c91fa60"}, - {file = "aiohttp-3.9.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:770d015888c2a598b377bd2f663adfd947d78c0124cfe7b959e1ef39f5b13869"}, - {file = "aiohttp-3.9.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee43080e75fc92bf36219926c8e6de497f9b247301bbf88c5c7593d931426679"}, - {file = "aiohttp-3.9.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52df73f14ed99cee84865b95a3d9e044f226320a87af208f068ecc33e0c35b96"}, - {file = "aiohttp-3.9.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc9b311743a78043b26ffaeeb9715dc360335e5517832f5a8e339f8a43581e4d"}, - {file = "aiohttp-3.9.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b955ed993491f1a5da7f92e98d5dad3c1e14dc175f74517c4e610b1f2456fb11"}, - {file = "aiohttp-3.9.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:504b6981675ace64c28bf4a05a508af5cde526e36492c98916127f5a02354d53"}, - {file = "aiohttp-3.9.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a6fe5571784af92b6bc2fda8d1925cccdf24642d49546d3144948a6a1ed58ca5"}, - {file = "aiohttp-3.9.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ba39e9c8627edc56544c8628cc180d88605df3892beeb2b94c9bc857774848ca"}, - {file = "aiohttp-3.9.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e5e46b578c0e9db71d04c4b506a2121c0cb371dd89af17a0586ff6769d4c58c1"}, - {file = "aiohttp-3.9.3-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:938a9653e1e0c592053f815f7028e41a3062e902095e5a7dc84617c87267ebd5"}, - {file = "aiohttp-3.9.3-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:c3452ea726c76e92f3b9fae4b34a151981a9ec0a4847a627c43d71a15ac32aa6"}, - {file = "aiohttp-3.9.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ff30218887e62209942f91ac1be902cc80cddb86bf00fbc6783b7a43b2bea26f"}, - {file = "aiohttp-3.9.3-cp312-cp312-win32.whl", hash = "sha256:38f307b41e0bea3294a9a2a87833191e4bcf89bb0365e83a8be3a58b31fb7f38"}, - {file = "aiohttp-3.9.3-cp312-cp312-win_amd64.whl", hash = "sha256:b791a3143681a520c0a17e26ae7465f1b6f99461a28019d1a2f425236e6eedb5"}, - {file = "aiohttp-3.9.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0ed621426d961df79aa3b963ac7af0d40392956ffa9be022024cd16297b30c8c"}, - {file = "aiohttp-3.9.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7f46acd6a194287b7e41e87957bfe2ad1ad88318d447caf5b090012f2c5bb528"}, - {file = "aiohttp-3.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:feeb18a801aacb098220e2c3eea59a512362eb408d4afd0c242044c33ad6d542"}, - {file = "aiohttp-3.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f734e38fd8666f53da904c52a23ce517f1b07722118d750405af7e4123933511"}, - {file = "aiohttp-3.9.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b40670ec7e2156d8e57f70aec34a7216407848dfe6c693ef131ddf6e76feb672"}, - {file = "aiohttp-3.9.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:fdd215b7b7fd4a53994f238d0f46b7ba4ac4c0adb12452beee724ddd0743ae5d"}, - {file = "aiohttp-3.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:017a21b0df49039c8f46ca0971b3a7fdc1f56741ab1240cb90ca408049766168"}, - {file = "aiohttp-3.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e99abf0bba688259a496f966211c49a514e65afa9b3073a1fcee08856e04425b"}, - {file = "aiohttp-3.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:648056db9a9fa565d3fa851880f99f45e3f9a771dd3ff3bb0c048ea83fb28194"}, - {file = "aiohttp-3.9.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8aacb477dc26797ee089721536a292a664846489c49d3ef9725f992449eda5a8"}, - {file = "aiohttp-3.9.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:522a11c934ea660ff8953eda090dcd2154d367dec1ae3c540aff9f8a5c109ab4"}, - {file = "aiohttp-3.9.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5bce0dc147ca85caa5d33debc4f4d65e8e8b5c97c7f9f660f215fa74fc49a321"}, - {file = "aiohttp-3.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b4af9f25b49a7be47c0972139e59ec0e8285c371049df1a63b6ca81fdd216a2"}, - {file = "aiohttp-3.9.3-cp38-cp38-win32.whl", hash = "sha256:298abd678033b8571995650ccee753d9458dfa0377be4dba91e4491da3f2be63"}, - {file = "aiohttp-3.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:69361bfdca5468c0488d7017b9b1e5ce769d40b46a9f4a2eed26b78619e9396c"}, - {file = "aiohttp-3.9.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0fa43c32d1643f518491d9d3a730f85f5bbaedcbd7fbcae27435bb8b7a061b29"}, - {file = "aiohttp-3.9.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:835a55b7ca49468aaaac0b217092dfdff370e6c215c9224c52f30daaa735c1c1"}, - {file = "aiohttp-3.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:06a9b2c8837d9a94fae16c6223acc14b4dfdff216ab9b7202e07a9a09541168f"}, - {file = "aiohttp-3.9.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abf151955990d23f84205286938796c55ff11bbfb4ccfada8c9c83ae6b3c89a3"}, - {file = "aiohttp-3.9.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59c26c95975f26e662ca78fdf543d4eeaef70e533a672b4113dd888bd2423caa"}, - {file = "aiohttp-3.9.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f95511dd5d0e05fd9728bac4096319f80615aaef4acbecb35a990afebe953b0e"}, - {file = "aiohttp-3.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:595f105710293e76b9dc09f52e0dd896bd064a79346234b521f6b968ffdd8e58"}, - {file = "aiohttp-3.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7c8b816c2b5af5c8a436df44ca08258fc1a13b449393a91484225fcb7545533"}, - {file = "aiohttp-3.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f1088fa100bf46e7b398ffd9904f4808a0612e1d966b4aa43baa535d1b6341eb"}, - {file = "aiohttp-3.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f59dfe57bb1ec82ac0698ebfcdb7bcd0e99c255bd637ff613760d5f33e7c81b3"}, - {file = "aiohttp-3.9.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:361a1026c9dd4aba0109e4040e2aecf9884f5cfe1b1b1bd3d09419c205e2e53d"}, - {file = "aiohttp-3.9.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:363afe77cfcbe3a36353d8ea133e904b108feea505aa4792dad6585a8192c55a"}, - {file = "aiohttp-3.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8e2c45c208c62e955e8256949eb225bd8b66a4c9b6865729a786f2aa79b72e9d"}, - {file = "aiohttp-3.9.3-cp39-cp39-win32.whl", hash = 
"sha256:f7217af2e14da0856e082e96ff637f14ae45c10a5714b63c77f26d8884cf1051"}, - {file = "aiohttp-3.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:27468897f628c627230dba07ec65dc8d0db566923c48f29e084ce382119802bc"}, - {file = "aiohttp-3.9.3.tar.gz", hash = "sha256:90842933e5d1ff760fae6caca4b2b3edba53ba8f4b71e95dacf2818a2aca06f7"}, -] - -[package.dependencies] -aiosignal = ">=1.1.2" -async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} -attrs = ">=17.3.0" -frozenlist = ">=1.1.1" -multidict = ">=4.5,<7.0" -yarl = ">=1.0,<2.0" - -[package.extras] -speedups = ["Brotli", "aiodns", "brotlicffi"] - -[[package]] -name = "aiosignal" -version = "1.3.1" -description = "aiosignal: a list of registered asynchronous callbacks" -optional = true -python-versions = ">=3.7" -files = [ - {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, - {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, -] - -[package.dependencies] -frozenlist = ">=1.1.0" +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "annotated-types" @@ -192,17 +54,6 @@ wrapt = [ {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] -[[package]] -name = "async-timeout" -version = "4.0.3" -description = "Timeout context manager for asyncio programs" -optional = true -python-versions = ">=3.7" -files = [ - {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, - {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, -] - [[package]] name = "attrs" version = "23.2.0" @@ -495,50 +346,6 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] -[[package]] -name = "datasets" -version = "2.16.0" -description = "HuggingFace community-driven open-source library of datasets" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "datasets-2.16.0-py3-none-any.whl", hash = "sha256:301cc39b3d81cd751100b79c85f8ae8626c17b0b113819ba2831c204d90b43f2"}, - {file = "datasets-2.16.0.tar.gz", hash = "sha256:91b06f7a8f0329179e7d603004102a6cc7a424a2f599315297a061caa1f8fa64"}, -] - -[package.dependencies] -aiohttp = "*" -dill = ">=0.3.0,<0.3.8" -filelock = "*" -fsspec = {version = ">=2023.1.0,<=2023.10.0", extras = ["http"]} -huggingface-hub = ">=0.19.4" -multiprocess = "*" -numpy = ">=1.17" -packaging = "*" -pandas = "*" -pyarrow = ">=8.0.0" -pyarrow-hotfix = "*" -pyyaml = ">=5.1" -requests = ">=2.19.0" -tqdm = ">=4.62.1" -xxhash = "*" - -[package.extras] -apache-beam = ["apache-beam (>=2.26.0,<2.44.0)"] -audio = ["librosa", "soundfile (>=0.12.1)"] -benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.1.5)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"] -docs = 
["s3fs", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos", "torch", "transformers"] -jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] -metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] -quality = ["ruff (>=0.1.5)"] -s3 = ["s3fs"] -tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos"] -tensorflow-gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"] -tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"] -torch = ["torch"] -vision = ["Pillow (>=6.2.1)"] - [[package]] name = "dill" version = "0.3.7" @@ -573,17 +380,6 @@ idna = ["idna (>=2.1)"] trio = ["trio (>=0.14)"] wmi = ["wmi (>=1.5.1)"] -[[package]] -name = "einops" -version = "0.6.1" -description = "A new flavour of deep learning operations" -optional = true -python-versions = ">=3.7" -files = [ - {file = "einops-0.6.1-py3-none-any.whl", hash = "sha256:99149e46cc808956b174932fe563d920db4d6e5dadb8c6ecdaa7483b7ef7cfc3"}, - {file = "einops-0.6.1.tar.gz", hash = "sha256:f95f8d00f4ded90dbc4b19b6f98b177332614b0357dde66997f3ae5d474dc8c8"}, -] - [[package]] name = "email-validator" version = "2.1.0.post1" @@ -633,147 +429,6 @@ typing-extensions = ">=4.8.0" [package.extras] all = ["email-validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] -[[package]] -name = "filelock" -version = "3.13.1" -description = "A platform independent file lock." 
-optional = true -python-versions = ">=3.8" -files = [ - {file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"}, - {file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"}, -] - -[package.extras] -docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] -typing = ["typing-extensions (>=4.8)"] - -[[package]] -name = "frozenlist" -version = "1.4.1" -description = "A list-like structure which implements collections.abc.MutableSequence" -optional = true -python-versions = ">=3.8" -files = [ - {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"}, - {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"}, - {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"}, - {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"}, - {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"}, - {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"}, - {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"}, - {file = 
"frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"}, - {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"}, - {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = 
"sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"}, - {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"}, - {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"}, - {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"}, - {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"}, - {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"}, - {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"}, - {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"}, - {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"}, -] - -[[package]] -name = "fsspec" -version = "2023.10.0" -description = "File-system specification" -optional = true -python-versions = ">=3.8" -files = [ - {file = "fsspec-2023.10.0-py3-none-any.whl", hash = "sha256:346a8f024efeb749d2a5fca7ba8854474b1ff9af7c3faaf636a4548781136529"}, - {file = "fsspec-2023.10.0.tar.gz", hash = "sha256:330c66757591df346ad3091a53bd907e15348c2ba17d63fd54f5c39c4457d2a5"}, -] - -[package.dependencies] -aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""} -requests = {version = "*", optional = true, markers = "extra == \"http\""} - -[package.extras] -abfs = ["adlfs"] -adl = ["adlfs"] -arrow = ["pyarrow (>=1)"] -dask = ["dask", "distributed"] -devel = ["pytest", "pytest-cov"] -dropbox = ["dropbox", "dropboxdrivefs", "requests"] -full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] -fuse = ["fusepy"] -gcs = ["gcsfs"] -git = ["pygit2"] -github = ["requests"] -gs = 
["gcsfs"] -gui = ["panel"] -hdfs = ["pyarrow (>=1)"] -http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"] -libarchive = ["libarchive-c"] -oci = ["ocifs"] -s3 = ["s3fs"] -sftp = ["paramiko"] -smb = ["smbprotocol"] -ssh = ["paramiko"] -tqdm = ["tqdm"] - [[package]] name = "gql" version = "3.5.0" @@ -956,43 +611,6 @@ files = [ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, ] -[[package]] -name = "h5py" -version = "3.10.0" -description = "Read and write HDF5 files from Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "h5py-3.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b963fb772964fc1d1563c57e4e2e874022ce11f75ddc6df1a626f42bd49ab99f"}, - {file = "h5py-3.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:012ab448590e3c4f5a8dd0f3533255bc57f80629bf7c5054cf4c87b30085063c"}, - {file = "h5py-3.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:781a24263c1270a62cd67be59f293e62b76acfcc207afa6384961762bb88ea03"}, - {file = "h5py-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f42e6c30698b520f0295d70157c4e202a9e402406f50dc08f5a7bc416b24e52d"}, - {file = "h5py-3.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:93dd840bd675787fc0b016f7a05fc6efe37312a08849d9dd4053fd0377b1357f"}, - {file = "h5py-3.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2381e98af081b6df7f6db300cd88f88e740649d77736e4b53db522d8874bf2dc"}, - {file = "h5py-3.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:667fe23ab33d5a8a6b77970b229e14ae3bb84e4ea3382cc08567a02e1499eedd"}, - {file = "h5py-3.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90286b79abd085e4e65e07c1bd7ee65a0f15818ea107f44b175d2dfe1a4674b7"}, - {file = "h5py-3.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c013d2e79c00f28ffd0cc24e68665ea03ae9069e167087b2adb5727d2736a52"}, - {file = "h5py-3.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:92273ce69ae4983dadb898fd4d3bea5eb90820df953b401282ee69ad648df684"}, - {file = "h5py-3.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c97d03f87f215e7759a354460fb4b0d0f27001450b18b23e556e7856a0b21c3"}, - {file = "h5py-3.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:86df4c2de68257b8539a18646ceccdcf2c1ce6b1768ada16c8dcfb489eafae20"}, - {file = "h5py-3.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba9ab36be991119a3ff32d0c7cbe5faf9b8d2375b5278b2aea64effbeba66039"}, - {file = "h5py-3.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:2c8e4fda19eb769e9a678592e67eaec3a2f069f7570c82d2da909c077aa94339"}, - {file = "h5py-3.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:492305a074327e8d2513011fa9fffeb54ecb28a04ca4c4227d7e1e9616d35641"}, - {file = "h5py-3.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9450464b458cca2c86252b624279115dcaa7260a40d3cb1594bf2b410a2bd1a3"}, - {file = "h5py-3.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd6f6d1384a9f491732cee233b99cd4bfd6e838a8815cc86722f9d2ee64032af"}, - {file = "h5py-3.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3074ec45d3dc6e178c6f96834cf8108bf4a60ccb5ab044e16909580352010a97"}, - {file = "h5py-3.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:212bb997a91e6a895ce5e2f365ba764debeaef5d2dca5c6fb7098d66607adf99"}, - {file = "h5py-3.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:5dfc65ac21fa2f630323c92453cadbe8d4f504726ec42f6a56cf80c2f90d6c52"}, - {file = "h5py-3.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d4682b94fd36ab217352be438abd44c8f357c5449b8995e63886b431d260f3d3"}, - {file = "h5py-3.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aece0e2e1ed2aab076c41802e50a0c3e5ef8816d60ece39107d68717d4559824"}, - {file = "h5py-3.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43a61b2c2ad65b1fabc28802d133eed34debcc2c8b420cb213d3d4ef4d3e2229"}, - {file = "h5py-3.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:ae2f0201c950059676455daf92700eeb57dcf5caaf71b9e1328e6e6593601770"}, - {file = "h5py-3.10.0.tar.gz", hash = "sha256:d93adc48ceeb33347eb24a634fb787efc7ae4644e6ea4ba733d099605045c049"}, -] - -[package.dependencies] -numpy = ">=1.17.3" - [[package]] name = "httpcore" version = "0.17.3" @@ -1037,40 +655,6 @@ cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] -[[package]] -name = "huggingface-hub" -version = "0.23.2" -description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "huggingface_hub-0.23.2-py3-none-any.whl", hash = "sha256:48727a16e704d409c4bb5913613308499664f22a99743435dc3a13b23c485827"}, - {file = "huggingface_hub-0.23.2.tar.gz", hash = "sha256:f6829b62d5fdecb452a76fdbec620cba4c1573655a8d710c1df71735fd9edbd2"}, -] - -[package.dependencies] -filelock = "*" -fsspec = ">=2023.5.0" -packaging = ">=20.9" -pyyaml = ">=5.1" -requests = "*" -tqdm = ">=4.42.1" -typing-extensions = ">=3.7.4.3" - -[package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] -cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] -fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] -hf-transfer = ["hf-transfer (>=0.1.4)"] -inference = ["aiohttp", "minijinja (>=1.0)"] -quality = ["mypy (==1.5.1)", "ruff (>=0.3.0)"] -tensorflow = ["graphviz", "pydot", "tensorflow"] -tensorflow-testing = ["keras (<3.0)", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] -torch = ["safetensors", "torch"] -typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] - [[package]] name = "idna" version = "3.6" @@ -1142,23 +726,6 @@ files = [ [package.extras] colors = ["colorama (>=0.4.6)"] -[[package]] -name = "jinja2" -version = "3.1.3" -description = "A very fast 
and expressive template engine." -optional = true -python-versions = ">=3.7" -files = [ - {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, - {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, -] - -[package.dependencies] -MarkupSafe = ">=2.0" - -[package.extras] -i18n = ["Babel (>=2.7)"] - [[package]] name = "jsonschema" version = "4.21.1" @@ -1243,75 +810,6 @@ files = [ {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, ] -[[package]] -name = "markupsafe" -version = "2.1.5" -description = "Safely add untrusted strings to HTML/XML markup." -optional = true -python-versions = ">=3.7" -files = [ - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash 
= "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, - {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, -] - [[package]] name = "mccabe" version = "0.7.0" @@ -1323,23 +821,6 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] -[[package]] -name = "mpmath" -version = "1.3.0" -description = "Python library for arbitrary-precision floating-point arithmetic" -optional = true -python-versions = "*" -files = [ - {file = 
"mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, - {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, -] - -[package.extras] -develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] -docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] -tests = ["pytest (>=4.6)"] - [[package]] name = "multidict" version = "6.0.5" @@ -1439,34 +920,6 @@ files = [ {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"}, ] -[[package]] -name = "multiprocess" -version = "0.70.15" -description = "better multiprocessing and multithreading in Python" -optional = true -python-versions = ">=3.7" -files = [ - {file = "multiprocess-0.70.15-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:aa36c7ed16f508091438687fe9baa393a7a8e206731d321e443745e743a0d4e5"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:20e024018c46d0d1602024c613007ac948f9754659e3853b0aa705e83f6931d8"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_i686.whl", hash = "sha256:e576062981c91f0fe8a463c3d52506e598dfc51320a8dd8d78b987dfca91c5db"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:e73f497e6696a0f5433ada2b3d599ae733b87a6e8b008e387c62ac9127add177"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:73db2e7b32dcc7f9b0f075c2ffa45c90b6729d3f1805f27e88534c8d321a1be5"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_i686.whl", hash = "sha256:4271647bd8a49c28ecd6eb56a7fdbd3c212c45529ad5303b40b3c65fc6928e5f"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:cf981fb998d6ec3208cb14f0cf2e9e80216e834f5d51fd09ebc937c32b960902"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:18f9f2c7063346d1617bd1684fdcae8d33380ae96b99427260f562e1a1228b67"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_i686.whl", hash = "sha256:0eac53214d664c49a34695e5824872db4006b1a465edd7459a251809c3773370"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:1a51dd34096db47fb21fa2b839e615b051d51b97af9a67afbcdaa67186b44883"}, - {file = "multiprocess-0.70.15-py310-none-any.whl", hash = "sha256:7dd58e33235e83cf09d625e55cffd7b0f0eede7ee9223cdd666a87624f60c21a"}, - {file = "multiprocess-0.70.15-py311-none-any.whl", hash = "sha256:134f89053d82c9ed3b73edd3a2531eb791e602d4f4156fc92a79259590bd9670"}, - {file = "multiprocess-0.70.15-py37-none-any.whl", hash = "sha256:f7d4a1629bccb433114c3b4885f69eccc200994323c80f6feee73b0edc9199c5"}, - {file = "multiprocess-0.70.15-py38-none-any.whl", hash = "sha256:bee9afba476c91f9ebee7beeee0601face9eff67d822e893f9a893725fbd6316"}, - {file = "multiprocess-0.70.15-py39-none-any.whl", hash = "sha256:3e0953f5d52b4c76f1c973eaf8214554d146f2be5decb48e928e55c7a2d19338"}, - {file = "multiprocess-0.70.15.tar.gz", hash = "sha256:f20eed3036c0ef477b07a4177cf7c1ba520d9a2677870a4f47fe026f0cd6787e"}, -] - -[package.dependencies] -dill = ">=0.3.7" - [[package]] name = "mypy" version = "1.8.0" @@ -1525,203 +978,6 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] -[[package]] -name = "networkx" -version = "3.1" -description = "Python package for creating and manipulating graphs and 
networks" -optional = true -python-versions = ">=3.8" -files = [ - {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, - {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, -] - -[package.extras] -default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] -developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] -doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] -extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] -test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] - -[[package]] -name = "numpy" -version = "1.24.4" -description = "Fundamental package for array computing in Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, - {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, - {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, - {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, - {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, - {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, - {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, - {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, - {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, - {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, - {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, - {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, - {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, - {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, - {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, - {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, - {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, - {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = 
"sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, - {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, - {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, - {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, - {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, - {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, - {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, - {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, -] - -[[package]] -name = "nvidia-cublas-cu12" -version = "12.1.3.1" -description = "CUBLAS native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"}, -] - -[[package]] -name = "nvidia-cuda-cupti-cu12" -version = "12.1.105" -description = "CUDA profiling tools runtime libs." 
-optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"}, -] - -[[package]] -name = "nvidia-cuda-nvrtc-cu12" -version = "12.1.105" -description = "NVRTC native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"}, -] - -[[package]] -name = "nvidia-cuda-runtime-cu12" -version = "12.1.105" -description = "CUDA Runtime native Libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"}, -] - -[[package]] -name = "nvidia-cudnn-cu12" -version = "8.9.2.26" -description = "cuDNN runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", hash = "sha256:5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9"}, -] - -[package.dependencies] -nvidia-cublas-cu12 = "*" - -[[package]] -name = "nvidia-cufft-cu12" -version = "11.0.2.54" -description = "CUFFT native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"}, -] - -[[package]] -name = "nvidia-curand-cu12" -version = "10.3.2.106" -description = "CURAND native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"}, -] - -[[package]] -name = "nvidia-cusolver-cu12" -version = "11.4.5.107" -description = "CUDA solver native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"}, -] - -[package.dependencies] -nvidia-cublas-cu12 = "*" -nvidia-cusparse-cu12 = "*" -nvidia-nvjitlink-cu12 = "*" - -[[package]] -name = "nvidia-cusparse-cu12" -version = "12.1.0.106" -description = "CUSPARSE native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = 
"sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"}, -] - -[package.dependencies] -nvidia-nvjitlink-cu12 = "*" - -[[package]] -name = "nvidia-nccl-cu12" -version = "2.19.3" -description = "NVIDIA Collective Communication Library (NCCL) Runtime" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", hash = "sha256:a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d"}, -] - -[[package]] -name = "nvidia-nvjitlink-cu12" -version = "12.3.101" -description = "Nvidia JIT LTO Library" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_nvjitlink_cu12-12.3.101-py3-none-manylinux1_x86_64.whl", hash = "sha256:64335a8088e2b9d196ae8665430bc6a2b7e6ef2eb877a9c735c804bd4ff6467c"}, - {file = "nvidia_nvjitlink_cu12-12.3.101-py3-none-manylinux2014_aarch64.whl", hash = "sha256:211a63e7b30a9d62f1a853e19928fbb1a750e3f17a13a3d1f98ff0ced19478dd"}, - {file = "nvidia_nvjitlink_cu12-12.3.101-py3-none-win_amd64.whl", hash = "sha256:1b2e317e437433753530792f13eece58f0aec21a2b05903be7bffe58a606cbd1"}, -] - -[[package]] -name = "nvidia-nvtx-cu12" -version = "12.1.105" -description = "NVIDIA Tools Extension" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, -] - [[package]] name = "packaging" version = "23.2" @@ -1733,73 +989,6 @@ files = [ {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, ] -[[package]] -name = "pandas" -version = "2.0.3" -description = "Powerful data structures for data analysis, time series, and statistics" -optional = true -python-versions = ">=3.8" -files = [ - {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, - {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, - {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, - {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, - {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, - {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, - {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, - {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, - {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, - {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, - {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, - {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, - {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, - {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, - {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, - {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, - {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, - {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, - {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, - {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, - {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, - {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, - {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, - {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, - {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, -] - -[package.dependencies] -numpy = [ - {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, -] -python-dateutil = ">=2.8.2" -pytz = ">=2020.1" -tzdata = ">=2022.1" - -[package.extras] -all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] -aws = ["s3fs (>=2021.08.0)"] -clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] -compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard 
(>=0.15.2)"] -computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] -feather = ["pyarrow (>=7.0.0)"] -fss = ["fsspec (>=2021.07.0)"] -gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] -hdf5 = ["tables (>=3.6.1)"] -html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] -mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] -parquet = ["pyarrow (>=7.0.0)"] -performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] -plot = ["matplotlib (>=3.6.1)"] -postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] -spss = ["pyreadstat (>=1.1.2)"] -sql-other = ["SQLAlchemy (>=1.4.16)"] -test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.6.3)"] - [[package]] name = "pathspec" version = "0.9.0" @@ -1811,34 +1000,6 @@ files = [ {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, ] -[[package]] -name = "peft" -version = "0.6.0" -description = "Parameter-Efficient Fine-Tuning (PEFT)" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "peft-0.6.0-py3-none-any.whl", hash = "sha256:d7fb6335beb20074f70d464aa1f2bb1ddca0875126316320a2781b04364f72a6"}, - {file = "peft-0.6.0.tar.gz", hash = "sha256:6c381208f705cd38f2cc91dc2943ac4df2615680bd75d7320d010f8f2e48e65d"}, -] - -[package.dependencies] -accelerate = ">=0.21.0" -numpy = ">=1.17" -packaging = ">=20.0" -psutil = "*" -pyyaml = "*" -safetensors = "*" -torch = ">=1.13.0" -tqdm = "*" -transformers = "*" - -[package.extras] -dev = ["black (>=22.0,<23.0)", "hf-doc-builder", "ruff (>=0.0.241)", "urllib3 (<=2.0.0)"] -docs-specific = ["hf-doc-builder"] -quality = ["black (>=22.0,<23.0)", "ruff (>=0.0.241)", "urllib3 (<=2.0.0)"] -test = ["black (>=22.0,<23.0)", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameterized", "pytest", "pytest-cov", "pytest-xdist", "ruff (>=0.0.241)", "urllib3 (<=2.0.0)"] - [[package]] name = "pkgutil-resolve-name" version = "1.3.10" @@ -1900,93 +1061,6 @@ files = [ {file = "protobuf-5.27.1.tar.gz", hash = "sha256:df5e5b8e39b7d1c25b186ffdf9f44f40f810bbcc9d2b71d9d3156fee5a9adf15"}, ] -[[package]] -name = "psutil" -version = "5.9.8" -description = "Cross-platform lib for process and system monitoring in Python." 
-optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" -files = [ - {file = "psutil-5.9.8-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:26bd09967ae00920df88e0352a91cff1a78f8d69b3ecabbfe733610c0af486c8"}, - {file = "psutil-5.9.8-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:05806de88103b25903dff19bb6692bd2e714ccf9e668d050d144012055cbca73"}, - {file = "psutil-5.9.8-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:611052c4bc70432ec770d5d54f64206aa7203a101ec273a0cd82418c86503bb7"}, - {file = "psutil-5.9.8-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:50187900d73c1381ba1454cf40308c2bf6f34268518b3f36a9b663ca87e65e36"}, - {file = "psutil-5.9.8-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:02615ed8c5ea222323408ceba16c60e99c3f91639b07da6373fb7e6539abc56d"}, - {file = "psutil-5.9.8-cp27-none-win32.whl", hash = "sha256:36f435891adb138ed3c9e58c6af3e2e6ca9ac2f365efe1f9cfef2794e6c93b4e"}, - {file = "psutil-5.9.8-cp27-none-win_amd64.whl", hash = "sha256:bd1184ceb3f87651a67b2708d4c3338e9b10c5df903f2e3776b62303b26cb631"}, - {file = "psutil-5.9.8-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81"}, - {file = "psutil-5.9.8-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421"}, - {file = "psutil-5.9.8-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4"}, - {file = "psutil-5.9.8-cp36-cp36m-win32.whl", hash = "sha256:7d79560ad97af658a0f6adfef8b834b53f64746d45b403f225b85c5c2c140eee"}, - {file = "psutil-5.9.8-cp36-cp36m-win_amd64.whl", hash = "sha256:27cc40c3493bb10de1be4b3f07cae4c010ce715290a5be22b98493509c6299e2"}, - {file = "psutil-5.9.8-cp37-abi3-win32.whl", hash = "sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0"}, - {file = "psutil-5.9.8-cp37-abi3-win_amd64.whl", hash = "sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf"}, - {file = "psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8"}, - {file = "psutil-5.9.8.tar.gz", hash = "sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c"}, -] - -[package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] - -[[package]] -name = "pyarrow" -version = "15.0.0" -description = "Python library for Apache Arrow" -optional = true -python-versions = ">=3.8" -files = [ - {file = "pyarrow-15.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:0a524532fd6dd482edaa563b686d754c70417c2f72742a8c990b322d4c03a15d"}, - {file = "pyarrow-15.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:60a6bdb314affa9c2e0d5dddf3d9cbb9ef4a8dddaa68669975287d47ece67642"}, - {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:66958fd1771a4d4b754cd385835e66a3ef6b12611e001d4e5edfcef5f30391e2"}, - {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f500956a49aadd907eaa21d4fff75f73954605eaa41f61cb94fb008cf2e00c6"}, - {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6f87d9c4f09e049c2cade559643424da84c43a35068f2a1c4653dc5b1408a929"}, - {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = 
"sha256:85239b9f93278e130d86c0e6bb455dcb66fc3fd891398b9d45ace8799a871a1e"}, - {file = "pyarrow-15.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5b8d43e31ca16aa6e12402fcb1e14352d0d809de70edd185c7650fe80e0769e3"}, - {file = "pyarrow-15.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:fa7cd198280dbd0c988df525e50e35b5d16873e2cdae2aaaa6363cdb64e3eec5"}, - {file = "pyarrow-15.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8780b1a29d3c8b21ba6b191305a2a607de2e30dab399776ff0aa09131e266340"}, - {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe0ec198ccc680f6c92723fadcb97b74f07c45ff3fdec9dd765deb04955ccf19"}, - {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:036a7209c235588c2f07477fe75c07e6caced9b7b61bb897c8d4e52c4b5f9555"}, - {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2bd8a0e5296797faf9a3294e9fa2dc67aa7f10ae2207920dbebb785c77e9dbe5"}, - {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e8ebed6053dbe76883a822d4e8da36860f479d55a762bd9e70d8494aed87113e"}, - {file = "pyarrow-15.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:17d53a9d1b2b5bd7d5e4cd84d018e2a45bc9baaa68f7e6e3ebed45649900ba99"}, - {file = "pyarrow-15.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9950a9c9df24090d3d558b43b97753b8f5867fb8e521f29876aa021c52fda351"}, - {file = "pyarrow-15.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:003d680b5e422d0204e7287bb3fa775b332b3fce2996aa69e9adea23f5c8f970"}, - {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f75fce89dad10c95f4bf590b765e3ae98bcc5ba9f6ce75adb828a334e26a3d40"}, - {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca9cb0039923bec49b4fe23803807e4ef39576a2bec59c32b11296464623dc2"}, - {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ed5a78ed29d171d0acc26a305a4b7f83c122d54ff5270810ac23c75813585e4"}, - {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6eda9e117f0402dfcd3cd6ec9bfee89ac5071c48fc83a84f3075b60efa96747f"}, - {file = "pyarrow-15.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a3a6180c0e8f2727e6f1b1c87c72d3254cac909e609f35f22532e4115461177"}, - {file = "pyarrow-15.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:19a8918045993349b207de72d4576af0191beef03ea655d8bdb13762f0cd6eac"}, - {file = "pyarrow-15.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d0ec076b32bacb6666e8813a22e6e5a7ef1314c8069d4ff345efa6246bc38593"}, - {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5db1769e5d0a77eb92344c7382d6543bea1164cca3704f84aa44e26c67e320fb"}, - {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2617e3bf9df2a00020dd1c1c6dce5cc343d979efe10bc401c0632b0eef6ef5b"}, - {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:d31c1d45060180131caf10f0f698e3a782db333a422038bf7fe01dace18b3a31"}, - {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:c8c287d1d479de8269398b34282e206844abb3208224dbdd7166d580804674b7"}, - {file = "pyarrow-15.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:07eb7f07dc9ecbb8dace0f58f009d3a29ee58682fcdc91337dfeb51ea618a75b"}, - {file = "pyarrow-15.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:47af7036f64fce990bb8a5948c04722e4e3ea3e13b1007ef52dfe0aa8f23cf7f"}, 
- {file = "pyarrow-15.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93768ccfff85cf044c418bfeeafce9a8bb0cee091bd8fd19011aff91e58de540"}, - {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6ee87fd6892700960d90abb7b17a72a5abb3b64ee0fe8db6c782bcc2d0dc0b4"}, - {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:001fca027738c5f6be0b7a3159cc7ba16a5c52486db18160909a0831b063c4e4"}, - {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:d1c48648f64aec09accf44140dccb92f4f94394b8d79976c426a5b79b11d4fa7"}, - {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:972a0141be402bb18e3201448c8ae62958c9c7923dfaa3b3d4530c835ac81aed"}, - {file = "pyarrow-15.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:f01fc5cf49081426429127aa2d427d9d98e1cb94a32cb961d583a70b7c4504e6"}, - {file = "pyarrow-15.0.0.tar.gz", hash = "sha256:876858f549d540898f927eba4ef77cd549ad8d24baa3207cf1b72e5788b50e83"}, -] - -[package.dependencies] -numpy = ">=1.16.6,<2" - -[[package]] -name = "pyarrow-hotfix" -version = "0.6" -description = "" -optional = true -python-versions = ">=3.5" -files = [ - {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, - {file = "pyarrow_hotfix-0.6.tar.gz", hash = "sha256:79d3e030f7ff890d408a100ac16d6f00b14d44a502d7897cd9fc3e3a534e9945"}, -] - [[package]] name = "pydantic" version = "2.6.1" @@ -2216,31 +1290,6 @@ pytest = ">=4.6" [package.extras] testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] -[[package]] -name = "python-dateutil" -version = "2.8.2" -description = "Extensions to the standard Python datetime module" -optional = true -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -files = [ - {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, - {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, -] - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "pytz" -version = "2024.1" -description = "World timezone definitions, modern and historical" -optional = true -python-versions = "*" -files = [ - {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, - {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, -] - [[package]] name = "pyyaml" version = "6.0.1" @@ -2316,108 +1365,6 @@ files = [ attrs = ">=22.2.0" rpds-py = ">=0.7.0" -[[package]] -name = "regex" -version = "2023.12.25" -description = "Alternative regular expression module, to replace re." 
-optional = true -python-versions = ">=3.7" -files = [ - {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5"}, - {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b014333bd0217ad3d54c143de9d4b9a3ca1c5a29a6d0d554952ea071cff0f1f8"}, - {file = "regex-2023.12.25-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d865984b3f71f6d0af64d0d88f5733521698f6c16f445bb09ce746c92c97c586"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e0eabac536b4cc7f57a5f3d095bfa557860ab912f25965e08fe1545e2ed8b4c"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c25a8ad70e716f96e13a637802813f65d8a6760ef48672aa3502f4c24ea8b400"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9b6d73353f777630626f403b0652055ebfe8ff142a44ec2cf18ae470395766e"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9cc99d6946d750eb75827cb53c4371b8b0fe89c733a94b1573c9dd16ea6c9e4"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88d1f7bef20c721359d8675f7d9f8e414ec5003d8f642fdfd8087777ff7f94b5"}, - {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cb3fe77aec8f1995611f966d0c656fdce398317f850d0e6e7aebdfe61f40e1cd"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7aa47c2e9ea33a4a2a05f40fcd3ea36d73853a2aae7b4feab6fc85f8bf2c9704"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:df26481f0c7a3f8739fecb3e81bc9da3fcfae34d6c094563b9d4670b047312e1"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c40281f7d70baf6e0db0c2f7472b31609f5bc2748fe7275ea65a0b4601d9b392"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:d94a1db462d5690ebf6ae86d11c5e420042b9898af5dcf278bd97d6bda065423"}, - {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ba1b30765a55acf15dce3f364e4928b80858fa8f979ad41f862358939bdd1f2f"}, - {file = "regex-2023.12.25-cp310-cp310-win32.whl", hash = "sha256:150c39f5b964e4d7dba46a7962a088fbc91f06e606f023ce57bb347a3b2d4630"}, - {file = "regex-2023.12.25-cp310-cp310-win_amd64.whl", hash = "sha256:09da66917262d9481c719599116c7dc0c321ffcec4b1f510c4f8a066f8768105"}, - {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1b9d811f72210fa9306aeb88385b8f8bcef0dfbf3873410413c00aa94c56c2b6"}, - {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d902a43085a308cef32c0d3aea962524b725403fd9373dea18110904003bac97"}, - {file = "regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d166eafc19f4718df38887b2bbe1467a4f74a9830e8605089ea7a30dd4da8887"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7ad32824b7f02bb3c9f80306d405a1d9b7bb89362d68b3c5a9be53836caebdb"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:636ba0a77de609d6510235b7f0e77ec494d2657108f777e8765efc060094c98c"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:0fda75704357805eb953a3ee15a2b240694a9a514548cd49b3c5124b4e2ad01b"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f72cbae7f6b01591f90814250e636065850c5926751af02bb48da94dfced7baa"}, - {file = "regex-2023.12.25-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db2a0b1857f18b11e3b0e54ddfefc96af46b0896fb678c85f63fb8c37518b3e7"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7502534e55c7c36c0978c91ba6f61703faf7ce733715ca48f499d3dbbd7657e0"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e8c7e08bb566de4faaf11984af13f6bcf6a08f327b13631d41d62592681d24fe"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:283fc8eed679758de38fe493b7d7d84a198b558942b03f017b1f94dda8efae80"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f44dd4d68697559d007462b0a3a1d9acd61d97072b71f6d1968daef26bc744bd"}, - {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:67d3ccfc590e5e7197750fcb3a2915b416a53e2de847a728cfa60141054123d4"}, - {file = "regex-2023.12.25-cp311-cp311-win32.whl", hash = "sha256:68191f80a9bad283432385961d9efe09d783bcd36ed35a60fb1ff3f1ec2efe87"}, - {file = "regex-2023.12.25-cp311-cp311-win_amd64.whl", hash = "sha256:7d2af3f6b8419661a0c421584cfe8aaec1c0e435ce7e47ee2a97e344b98f794f"}, - {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8a0ccf52bb37d1a700375a6b395bff5dd15c50acb745f7db30415bae3c2b0715"}, - {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c3c4a78615b7762740531c27cf46e2f388d8d727d0c0c739e72048beb26c8a9d"}, - {file = "regex-2023.12.25-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad83e7545b4ab69216cef4cc47e344d19622e28aabec61574b20257c65466d6a"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7a635871143661feccce3979e1727c4e094f2bdfd3ec4b90dfd4f16f571a87a"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d498eea3f581fbe1b34b59c697512a8baef88212f92e4c7830fcc1499f5b45a5"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:43f7cd5754d02a56ae4ebb91b33461dc67be8e3e0153f593c509e21d219c5060"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51f4b32f793812714fd5307222a7f77e739b9bc566dc94a18126aba3b92b98a3"}, - {file = "regex-2023.12.25-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba99d8077424501b9616b43a2d208095746fb1284fc5ba490139651f971d39d9"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4bfc2b16e3ba8850e0e262467275dd4d62f0d045e0e9eda2bc65078c0110a11f"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8c2c19dae8a3eb0ea45a8448356ed561be843b13cbc34b840922ddf565498c1c"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:60080bb3d8617d96f0fb7e19796384cc2467447ef1c491694850ebd3670bc457"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b77e27b79448e34c2c51c09836033056a0547aa360c45eeeb67803da7b0eedaf"}, - {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:518440c991f514331f4850a63560321f833979d145d7d81186dbe2f19e27ae3d"}, - {file = "regex-2023.12.25-cp312-cp312-win32.whl", hash = "sha256:e2610e9406d3b0073636a3a2e80db05a02f0c3169b5632022b4e81c0364bcda5"}, - {file = "regex-2023.12.25-cp312-cp312-win_amd64.whl", hash = "sha256:cc37b9aeebab425f11f27e5e9e6cf580be7206c6582a64467a14dda211abc232"}, - {file = "regex-2023.12.25-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:da695d75ac97cb1cd725adac136d25ca687da4536154cdc2815f576e4da11c69"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d126361607b33c4eb7b36debc173bf25d7805847346dd4d99b5499e1fef52bc7"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4719bb05094d7d8563a450cf8738d2e1061420f79cfcc1fa7f0a44744c4d8f73"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5dd58946bce44b53b06d94aa95560d0b243eb2fe64227cba50017a8d8b3cd3e2"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22a86d9fff2009302c440b9d799ef2fe322416d2d58fc124b926aa89365ec482"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2aae8101919e8aa05ecfe6322b278f41ce2994c4a430303c4cd163fef746e04f"}, - {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e692296c4cc2873967771345a876bcfc1c547e8dd695c6b89342488b0ea55cd8"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:263ef5cc10979837f243950637fffb06e8daed7f1ac1e39d5910fd29929e489a"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d6f7e255e5fa94642a0724e35406e6cb7001c09d476ab5fce002f652b36d0c39"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:88ad44e220e22b63b0f8f81f007e8abbb92874d8ced66f32571ef8beb0643b2b"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:3a17d3ede18f9cedcbe23d2daa8a2cd6f59fe2bf082c567e43083bba3fb00347"}, - {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d15b274f9e15b1a0b7a45d2ac86d1f634d983ca40d6b886721626c47a400bf39"}, - {file = "regex-2023.12.25-cp37-cp37m-win32.whl", hash = "sha256:ed19b3a05ae0c97dd8f75a5d8f21f7723a8c33bbc555da6bbe1f96c470139d3c"}, - {file = "regex-2023.12.25-cp37-cp37m-win_amd64.whl", hash = "sha256:a6d1047952c0b8104a1d371f88f4ab62e6275567d4458c1e26e9627ad489b445"}, - {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b43523d7bc2abd757119dbfb38af91b5735eea45537ec6ec3a5ec3f9562a1c53"}, - {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:efb2d82f33b2212898f1659fb1c2e9ac30493ac41e4d53123da374c3b5541e64"}, - {file = "regex-2023.12.25-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b7fca9205b59c1a3d5031f7e64ed627a1074730a51c2a80e97653e3e9fa0d415"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086dd15e9435b393ae06f96ab69ab2d333f5d65cbe65ca5a3ef0ec9564dfe770"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e81469f7d01efed9b53740aedd26085f20d49da65f9c1f41e822a33992cb1590"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:34e4af5b27232f68042aa40a91c3b9bb4da0eeb31b7632e0091afc4310afe6cb"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9852b76ab558e45b20bf1893b59af64a28bd3820b0c2efc80e0a70a4a3ea51c1"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff100b203092af77d1a5a7abe085b3506b7eaaf9abf65b73b7d6905b6cb76988"}, - {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cc038b2d8b1470364b1888a98fd22d616fba2b6309c5b5f181ad4483e0017861"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:094ba386bb5c01e54e14434d4caabf6583334090865b23ef58e0424a6286d3dc"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5cd05d0f57846d8ba4b71d9c00f6f37d6b97d5e5ef8b3c3840426a475c8f70f4"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:9aa1a67bbf0f957bbe096375887b2505f5d8ae16bf04488e8b0f334c36e31360"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:98a2636994f943b871786c9e82bfe7883ecdaba2ef5df54e1450fa9869d1f756"}, - {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37f8e93a81fc5e5bd8db7e10e62dc64261bcd88f8d7e6640aaebe9bc180d9ce2"}, - {file = "regex-2023.12.25-cp38-cp38-win32.whl", hash = "sha256:d78bd484930c1da2b9679290a41cdb25cc127d783768a0369d6b449e72f88beb"}, - {file = "regex-2023.12.25-cp38-cp38-win_amd64.whl", hash = "sha256:b521dcecebc5b978b447f0f69b5b7f3840eac454862270406a39837ffae4e697"}, - {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f7bc09bc9c29ebead055bcba136a67378f03d66bf359e87d0f7c759d6d4ffa31"}, - {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e14b73607d6231f3cc4622809c196b540a6a44e903bcfad940779c80dffa7be7"}, - {file = "regex-2023.12.25-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9eda5f7a50141291beda3edd00abc2d4a5b16c29c92daf8d5bd76934150f3edc"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc6bb9aa69aacf0f6032c307da718f61a40cf970849e471254e0e91c56ffca95"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:298dc6354d414bc921581be85695d18912bea163a8b23cac9a2562bbcd5088b1"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f4e475a80ecbd15896a976aa0b386c5525d0ed34d5c600b6d3ebac0a67c7ddf"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:531ac6cf22b53e0696f8e1d56ce2396311254eb806111ddd3922c9d937151dae"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22f3470f7524b6da61e2020672df2f3063676aff444db1daa283c2ea4ed259d6"}, - {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:89723d2112697feaa320c9d351e5f5e7b841e83f8b143dba8e2d2b5f04e10923"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0ecf44ddf9171cd7566ef1768047f6e66975788258b1c6c6ca78098b95cf9a3d"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:905466ad1702ed4acfd67a902af50b8db1feeb9781436372261808df7a2a7bca"}, - {file = 
"regex-2023.12.25-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:4558410b7a5607a645e9804a3e9dd509af12fb72b9825b13791a37cd417d73a5"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:7e316026cc1095f2a3e8cc012822c99f413b702eaa2ca5408a513609488cb62f"}, - {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3b1de218d5375cd6ac4b5493e0b9f3df2be331e86520f23382f216c137913d20"}, - {file = "regex-2023.12.25-cp39-cp39-win32.whl", hash = "sha256:11a963f8e25ab5c61348d090bf1b07f1953929c13bd2309a0662e9ff680763c9"}, - {file = "regex-2023.12.25-cp39-cp39-win_amd64.whl", hash = "sha256:e693e233ac92ba83a87024e1d32b5f9ab15ca55ddd916d878146f4e3406b5c91"}, - {file = "regex-2023.12.25.tar.gz", hash = "sha256:29171aa128da69afdf4bde412d5bedc335f2ca8fcfe4489038577d05f16181e5"}, -] - [[package]] name = "requests" version = "2.31.0" @@ -2585,125 +1532,6 @@ files = [ {file = "rpds_py-0.18.0.tar.gz", hash = "sha256:42821446ee7a76f5d9f71f9e33a4fb2ffd724bb3e7f93386150b61a43115788d"}, ] -[[package]] -name = "safetensors" -version = "0.4.1" -description = "" -optional = true -python-versions = ">=3.7" -files = [ - {file = "safetensors-0.4.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:cba01c6b76e01ec453933b3b3c0157c59b52881c83eaa0f7666244e71aa75fd1"}, - {file = "safetensors-0.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a8f6f679d97ea0135c7935c202feefbd042c149aa70ee759855e890c01c7814"}, - {file = "safetensors-0.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbc2ce1f5ae5143a7fb72b71fa71db6a42b4f6cf912aa3acdc6b914084778e68"}, - {file = "safetensors-0.4.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2d87d993eaefe6611a9c241a8bd364a5f1ffed5771c74840363a6c4ed8d868f6"}, - {file = "safetensors-0.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:097e9af2efa8778cd2f0cba451784253e62fa7cc9fc73c0744d27212f7294e25"}, - {file = "safetensors-0.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d10a9f7bae608ccfdc009351f01dc3d8535ff57f9488a58a4c38e45bf954fe93"}, - {file = "safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:270b99885ec14abfd56c1d7f28ada81740a9220b4bae960c3de1c6fe84af9e4d"}, - {file = "safetensors-0.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:285b52a481e7ba93e29ad4ec5841ef2c4479ef0a6c633c4e2629e0508453577b"}, - {file = "safetensors-0.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c3c9f0ca510e0de95abd6424789dcbc879942a3a4e29b0dfa99d9427bf1da75c"}, - {file = "safetensors-0.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:88b4653059c903015284a9722f9a46838c654257173b279c8f6f46dbe80b612d"}, - {file = "safetensors-0.4.1-cp310-none-win32.whl", hash = "sha256:2fe6926110e3d425c4b684a4379b7796fdc26ad7d16922ea1696c8e6ea7e920f"}, - {file = "safetensors-0.4.1-cp310-none-win_amd64.whl", hash = "sha256:a79e16222106b2f5edbca1b8185661477d8971b659a3c814cc6f15181a9b34c8"}, - {file = "safetensors-0.4.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:d93321eea0dd7e81b283e47a1d20dee6069165cc158286316d0d06d340de8fe8"}, - {file = "safetensors-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8ff8e41c8037db17de0ea2a23bc684f43eaf623be7d34906fe1ac10985b8365e"}, - {file = "safetensors-0.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39d36f1d88468a87c437a1bc27c502e71b6ca44c385a9117a9f9ba03a75cc9c6"}, - {file = 
"safetensors-0.4.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7ef010e9afcb4057fb6be3d0a0cfa07aac04fe97ef73fe4a23138d8522ba7c17"}, - {file = "safetensors-0.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b287304f2b2220d51ccb51fd857761e78bcffbeabe7b0238f8dc36f2edfd9542"}, - {file = "safetensors-0.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e09000b2599e1836314430f81a3884c66a5cbabdff5d9f175b5d560d4de38d78"}, - {file = "safetensors-0.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9c80ce0001efa16066358d2dd77993adc25f5a6c61850e4ad096a2232930bce"}, - {file = "safetensors-0.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:413e1f6ac248f7d1b755199a06635e70c3515493d3b41ba46063dec33aa2ebb7"}, - {file = "safetensors-0.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3ac139377cfe71ba04573f1cda66e663b7c3e95be850e9e6c2dd4b5984bd513"}, - {file = "safetensors-0.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:04157d008385bea66d12fe90844a80d4a76dc25ec5230b5bd9a630496d1b7c03"}, - {file = "safetensors-0.4.1-cp311-none-win32.whl", hash = "sha256:5f25297148ec665f0deb8bd67e9564634d8d6841041ab5393ccfe203379ea88b"}, - {file = "safetensors-0.4.1-cp311-none-win_amd64.whl", hash = "sha256:b2f8877990a72ff595507b80f4b69036a9a1986a641f8681adf3425d97d3d2a5"}, - {file = "safetensors-0.4.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:eb2c1da1cc39509d1a55620a5f4d14f8911c47a89c926a96e6f4876e864375a3"}, - {file = "safetensors-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:303d2c0415cf15a28f8d7f17379ea3c34c2b466119118a34edd9965983a1a8a6"}, - {file = "safetensors-0.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb4cb3e37a9b961ddd68e873b29fe9ab4a081e3703412e34aedd2b7a8e9cafd9"}, - {file = "safetensors-0.4.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ae5497adc68669db2fed7cb2dad81e6a6106e79c9a132da3efdb6af1db1014fa"}, - {file = "safetensors-0.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b30abd0cddfe959d1daedf92edcd1b445521ebf7ddefc20860ed01486b33c90"}, - {file = "safetensors-0.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d784a98c492c751f228a4a894c3b8a092ff08b24e73b5568938c28b8c0e8f8df"}, - {file = "safetensors-0.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e57a5ab08b0ec7a7caf30d2ac79bb30c89168431aca4f8854464bb9461686925"}, - {file = "safetensors-0.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:edcf3121890b5f0616aa5a54683b1a5d2332037b970e507d6bb7841a3a596556"}, - {file = "safetensors-0.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fdb58dee173ef33634c3016c459d671ca12d11e6acf9db008261cbe58107e579"}, - {file = "safetensors-0.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:780dc21eb3fd32ddd0e8c904bdb0290f2454f4ac21ae71e94f9ce72db1900a5a"}, - {file = "safetensors-0.4.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:48901bd540f8a3c1791314bc5c8a170927bf7f6acddb75bf0a263d081a3637d4"}, - {file = "safetensors-0.4.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:3b0b7b2d5976fbed8a05e2bbdce5816a59e6902e9e7c7e07dc723637ed539787"}, - {file = "safetensors-0.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f69903ff49cb30b9227fb5d029bea276ea20d04b06803877a420c5b1b74c689"}, - {file = 
"safetensors-0.4.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0ddd050e01f3e843aa8c1c27bf68675b8a08e385d0045487af4d70418c3cb356"}, - {file = "safetensors-0.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a82bc2bd7a9a0e08239bdd6d7774d64121f136add93dfa344a2f1a6d7ef35fa"}, - {file = "safetensors-0.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6ace9e66a40f98a216ad661245782483cf79cf56eb2b112650bb904b0baa9db5"}, - {file = "safetensors-0.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82cbb8f4d022f2e94498cbefca900698b8ded3d4f85212f47da614001ff06652"}, - {file = "safetensors-0.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:791edc10a3c359a2f5f52d5cddab0df8a45107d91027d86c3d44e57162e5d934"}, - {file = "safetensors-0.4.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:83c2cfbe8c6304f0891e7bb378d56f66d2148972eeb5f747cd8a2246886f0d8c"}, - {file = "safetensors-0.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:04dd14f53f5500eb4c4149674216ba1000670efbcf4b1b5c2643eb244e7882ea"}, - {file = "safetensors-0.4.1-cp37-none-win32.whl", hash = "sha256:d5b3defa74f3723a388bfde2f5d488742bc4879682bd93267c09a3bcdf8f869b"}, - {file = "safetensors-0.4.1-cp37-none-win_amd64.whl", hash = "sha256:25a043cbb59d4f75e9dd87fdf5c009dd8830105a2c57ace49b72167dd9808111"}, - {file = "safetensors-0.4.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:3f6a520af7f2717c5ecba112041f2c8af1ca6480b97bf957aba81ed9642e654c"}, - {file = "safetensors-0.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c3807ac3b16288dffebb3474b555b56fe466baa677dfc16290dcd02dca1ab228"}, - {file = "safetensors-0.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b58ba13a9e82b4bc3fc221914f6ef237fe6c2adb13cede3ace64d1aacf49610"}, - {file = "safetensors-0.4.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:dac4bb42f8679aadc59bd91a4c5a1784a758ad49d0912995945cd674089f628e"}, - {file = "safetensors-0.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:911b48dc09e321a194def3a7431662ff4f03646832f3a8915bbf0f449b8a5fcb"}, - {file = "safetensors-0.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82571d20288c975c1b30b08deb9b1c3550f36b31191e1e81fae87669a92217d0"}, - {file = "safetensors-0.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da52ee0dc8ba03348ffceab767bd8230842fdf78f8a996e2a16445747143a778"}, - {file = "safetensors-0.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2536b11ce665834201072e9397404170f93f3be10cca9995b909f023a04501ee"}, - {file = "safetensors-0.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:998fbac99ca956c3a09fe07cc0b35fac26a521fa8865a690686d889f0ff4e4a6"}, - {file = "safetensors-0.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:845be0aafabf2a60c2d482d4e93023fecffe5e5443d801d7a7741bae9de41233"}, - {file = "safetensors-0.4.1-cp38-none-win32.whl", hash = "sha256:ce7a28bc8af685a69d7e869d09d3e180a275e3281e29cf5f1c7319e231932cc7"}, - {file = "safetensors-0.4.1-cp38-none-win_amd64.whl", hash = "sha256:e056fb9e22d118cc546107f97dc28b449d88274207dd28872bd668c86216e4f6"}, - {file = "safetensors-0.4.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:bdc0d039e44a727824639824090bd8869535f729878fa248addd3dc01db30eae"}, - {file = "safetensors-0.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:3c1b1d510c7aba71504ece87bf393ea82638df56303e371e5e2cf09d18977dd7"}, - {file = "safetensors-0.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bd0afd95c1e497f520e680ea01e0397c0868a3a3030e128438cf6e9e3fcd671"}, - {file = "safetensors-0.4.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f603bdd8deac6726d39f41688ed353c532dd53935234405d79e9eb53f152fbfb"}, - {file = "safetensors-0.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d8a85e3e47e0d4eebfaf9a58b40aa94f977a56050cb5598ad5396a9ee7c087c6"}, - {file = "safetensors-0.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0ccb5aa0f3be2727117e5631200fbb3a5b3a2b3757545a92647d6dd8be6658f"}, - {file = "safetensors-0.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d784938534e255473155e4d9f276ee69eb85455b6af1292172c731409bf9adee"}, - {file = "safetensors-0.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a257de175c254d39ccd6a21341cd62eb7373b05c1e618a78096a56a857e0c316"}, - {file = "safetensors-0.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6fd80f7794554091836d4d613d33a7d006e2b8d6ba014d06f97cebdfda744f64"}, - {file = "safetensors-0.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:35803201d980efcf964b75a0a2aee97fe5e9ecc5f3ad676b38fafdfe98e0620d"}, - {file = "safetensors-0.4.1-cp39-none-win32.whl", hash = "sha256:7ff8a36e0396776d3ed9a106fc9a9d7c55d4439ca9a056a24bf66d343041d3e6"}, - {file = "safetensors-0.4.1-cp39-none-win_amd64.whl", hash = "sha256:bfa2e20342b81921b98edba52f8deb68843fa9c95250739a56b52ceda5ea5c61"}, - {file = "safetensors-0.4.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:ae2d5a31cfb8a973a318f7c4d2cffe0bd1fe753cdf7bb41a1939d45a0a06f964"}, - {file = "safetensors-0.4.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1a45dbf03e8334d3a5dc93687d98b6dc422f5d04c7d519dac09b84a3c87dd7c6"}, - {file = "safetensors-0.4.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297b359d91126c0f9d4fd17bae3cfa2fe3a048a6971b8db07db746ad92f850c"}, - {file = "safetensors-0.4.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bda3d98e2bcece388232cfc551ebf063b55bdb98f65ab54df397da30efc7dcc5"}, - {file = "safetensors-0.4.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8934bdfd202ebd0697040a3dff40dd77bc4c5bbf3527ede0532f5e7fb4d970f"}, - {file = "safetensors-0.4.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:42c3710cec7e5c764c7999697516370bee39067de0aa089b7e2cfb97ac8c6b20"}, - {file = "safetensors-0.4.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:53134226053e56bd56e73f7db42596e7908ed79f3c9a1016e4c1dade593ac8e5"}, - {file = "safetensors-0.4.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:257d59e40a1b367cb544122e7451243d65b33c3f34d822a347f4eea6fdf97fdf"}, - {file = "safetensors-0.4.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d54c2f1826e790d1eb2d2512bfd0ee443f0206b423d6f27095057c7f18a0687"}, - {file = "safetensors-0.4.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:645b3f1138fce6e818e79d4128afa28f0657430764cc045419c1d069ff93f732"}, - {file = "safetensors-0.4.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e9a7ffb1e551c6df51d267f5a751f042b183df22690f6feceac8d27364fd51d7"}, - {file = 
"safetensors-0.4.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:44e230fbbe120de564b64f63ef3a8e6ff02840fa02849d9c443d56252a1646d4"}, - {file = "safetensors-0.4.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:9d16b3b2fcc6fca012c74bd01b5619c655194d3e3c13e4d4d0e446eefa39a463"}, - {file = "safetensors-0.4.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:5d95ea4d8b32233910734a904123bdd3979c137c461b905a5ed32511defc075f"}, - {file = "safetensors-0.4.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:dab431699b5d45e0ca043bc580651ce9583dda594e62e245b7497adb32e99809"}, - {file = "safetensors-0.4.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16d8bbb7344e39cb9d4762e85c21df94ebeb03edac923dd94bb9ed8c10eac070"}, - {file = "safetensors-0.4.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1faf5111c66a6ba91f85dff2e36edaaf36e6966172703159daeef330de4ddc7b"}, - {file = "safetensors-0.4.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:660ca1d8bff6c7bc7c6b30b9b32df74ef3ab668f5df42cefd7588f0d40feadcb"}, - {file = "safetensors-0.4.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:ae2f67f04ed0bb2e56fd380a8bd3eef03f609df53f88b6f5c7e89c08e52aae00"}, - {file = "safetensors-0.4.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:c8ed5d2c04cdc1afc6b3c28d59580448ac07732c50d94c15e14670f9c473a2ce"}, - {file = "safetensors-0.4.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:2b6a2814278b6660261aa9a9aae524616de9f1ec364e3716d219b6ed8f91801f"}, - {file = "safetensors-0.4.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3cfd1ca35eacc635f0eaa894e5c5ed83ffebd0f95cac298fd430014fa7323631"}, - {file = "safetensors-0.4.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4177b456c6b0c722d82429127b5beebdaf07149d265748e97e0a34ff0b3694c8"}, - {file = "safetensors-0.4.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:313e8472197bde54e3ec54a62df184c414582979da8f3916981b6a7954910a1b"}, - {file = "safetensors-0.4.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fdb4adb76e21bad318210310590de61c9f4adcef77ee49b4a234f9dc48867869"}, - {file = "safetensors-0.4.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:1d568628e9c43ca15eb96c217da73737c9ccb07520fafd8a1eba3f2750614105"}, - {file = "safetensors-0.4.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:573b6023a55a2f28085fc0a84e196c779b6cbef4d9e73acea14c8094fee7686f"}, - {file = "safetensors-0.4.1.tar.gz", hash = "sha256:2304658e6ada81a5223225b4efe84748e760c46079bffedf7e321763cafb36c9"}, -] - -[package.extras] -all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", "safetensors[quality]", "safetensors[testing]", "safetensors[torch]"] -dev = ["safetensors[all]"] -jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "safetensors[numpy]"] -numpy = ["numpy (>=1.21.6)"] -paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"] -pinned-tf = ["safetensors[numpy]", "tensorflow (==2.11.0)"] -quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"] -tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"] -testing = ["h5py (>=3.7.0)", "huggingface_hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools_rust (>=1.5.2)"] -torch = 
["safetensors[numpy]", "torch (>=1.10)"] - [[package]] name = "setuptools" version = "70.0.0" @@ -2770,137 +1598,6 @@ typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\"" [package.extras] full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] -[[package]] -name = "sympy" -version = "1.12" -description = "Computer algebra system (CAS) in Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, - {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, -] - -[package.dependencies] -mpmath = ">=0.19" - -[[package]] -name = "tokenizers" -version = "0.19.1" -description = "" -optional = true -python-versions = ">=3.7" -files = [ - {file = "tokenizers-0.19.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:952078130b3d101e05ecfc7fc3640282d74ed26bcf691400f872563fca15ac97"}, - {file = "tokenizers-0.19.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:82c8b8063de6c0468f08e82c4e198763e7b97aabfe573fd4cf7b33930ca4df77"}, - {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f03727225feaf340ceeb7e00604825addef622d551cbd46b7b775ac834c1e1c4"}, - {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:453e4422efdfc9c6b6bf2eae00d5e323f263fff62b29a8c9cd526c5003f3f642"}, - {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:02e81bf089ebf0e7f4df34fa0207519f07e66d8491d963618252f2e0729e0b46"}, - {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b07c538ba956843833fee1190cf769c60dc62e1cf934ed50d77d5502194d63b1"}, - {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e28cab1582e0eec38b1f38c1c1fb2e56bce5dc180acb1724574fc5f47da2a4fe"}, - {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e"}, - {file = "tokenizers-0.19.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7fb297edec6c6841ab2e4e8f357209519188e4a59b557ea4fafcf4691d1b4c98"}, - {file = "tokenizers-0.19.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2e8a3dd055e515df7054378dc9d6fa8c8c34e1f32777fb9a01fea81496b3f9d3"}, - {file = "tokenizers-0.19.1-cp310-none-win32.whl", hash = "sha256:7ff898780a155ea053f5d934925f3902be2ed1f4d916461e1a93019cc7250837"}, - {file = "tokenizers-0.19.1-cp310-none-win_amd64.whl", hash = "sha256:bea6f9947e9419c2fda21ae6c32871e3d398cba549b93f4a65a2d369662d9403"}, - {file = "tokenizers-0.19.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:5c88d1481f1882c2e53e6bb06491e474e420d9ac7bdff172610c4f9ad3898059"}, - {file = "tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ddf672ed719b4ed82b51499100f5417d7d9f6fb05a65e232249268f35de5ed14"}, - {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:dadc509cc8a9fe460bd274c0e16ac4184d0958117cf026e0ea8b32b438171594"}, - {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfedf31824ca4915b511b03441784ff640378191918264268e6923da48104acc"}, - {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:ac11016d0a04aa6487b1513a3a36e7bee7eec0e5d30057c9c0408067345c48d2"}, - {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76951121890fea8330d3a0df9a954b3f2a37e3ec20e5b0530e9a0044ca2e11fe"}, - {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b342d2ce8fc8d00f376af068e3274e2e8649562e3bc6ae4a67784ded6b99428d"}, - {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d16ff18907f4909dca9b076b9c2d899114dd6abceeb074eca0c93e2353f943aa"}, - {file = "tokenizers-0.19.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:706a37cc5332f85f26efbe2bdc9ef8a9b372b77e4645331a405073e4b3a8c1c6"}, - {file = "tokenizers-0.19.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:16baac68651701364b0289979ecec728546133e8e8fe38f66fe48ad07996b88b"}, - {file = "tokenizers-0.19.1-cp311-none-win32.whl", hash = "sha256:9ed240c56b4403e22b9584ee37d87b8bfa14865134e3e1c3fb4b2c42fafd3256"}, - {file = "tokenizers-0.19.1-cp311-none-win_amd64.whl", hash = "sha256:ad57d59341710b94a7d9dbea13f5c1e7d76fd8d9bcd944a7a6ab0b0da6e0cc66"}, - {file = "tokenizers-0.19.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:621d670e1b1c281a1c9698ed89451395d318802ff88d1fc1accff0867a06f153"}, - {file = "tokenizers-0.19.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d924204a3dbe50b75630bd16f821ebda6a5f729928df30f582fb5aade90c818a"}, - {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4f3fefdc0446b1a1e6d81cd4c07088ac015665d2e812f6dbba4a06267d1a2c95"}, - {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9620b78e0b2d52ef07b0d428323fb34e8ea1219c5eac98c2596311f20f1f9266"}, - {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04ce49e82d100594715ac1b2ce87d1a36e61891a91de774755f743babcd0dd52"}, - {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5c2ff13d157afe413bf7e25789879dd463e5a4abfb529a2d8f8473d8042e28f"}, - {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3174c76efd9d08f836bfccaca7cfec3f4d1c0a4cf3acbc7236ad577cc423c840"}, - {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c9d5b6c0e7a1e979bec10ff960fae925e947aab95619a6fdb4c1d8ff3708ce3"}, - {file = "tokenizers-0.19.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a179856d1caee06577220ebcfa332af046d576fb73454b8f4d4b0ba8324423ea"}, - {file = "tokenizers-0.19.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:952b80dac1a6492170f8c2429bd11fcaa14377e097d12a1dbe0ef2fb2241e16c"}, - {file = "tokenizers-0.19.1-cp312-none-win32.whl", hash = "sha256:01d62812454c188306755c94755465505836fd616f75067abcae529c35edeb57"}, - {file = "tokenizers-0.19.1-cp312-none-win_amd64.whl", hash = "sha256:b70bfbe3a82d3e3fb2a5e9b22a39f8d1740c96c68b6ace0086b39074f08ab89a"}, - {file = "tokenizers-0.19.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:bb9dfe7dae85bc6119d705a76dc068c062b8b575abe3595e3c6276480e67e3f1"}, - {file = "tokenizers-0.19.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:1f0360cbea28ea99944ac089c00de7b2e3e1c58f479fb8613b6d8d511ce98267"}, - {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:71e3ec71f0e78780851fef28c2a9babe20270404c921b756d7c532d280349214"}, - {file 
= "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b82931fa619dbad979c0ee8e54dd5278acc418209cc897e42fac041f5366d626"}, - {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e8ff5b90eabdcdaa19af697885f70fe0b714ce16709cf43d4952f1f85299e73a"}, - {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e742d76ad84acbdb1a8e4694f915fe59ff6edc381c97d6dfdd054954e3478ad4"}, - {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d8c5d59d7b59885eab559d5bc082b2985555a54cda04dda4c65528d90ad252ad"}, - {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b2da5c32ed869bebd990c9420df49813709e953674c0722ff471a116d97b22d"}, - {file = "tokenizers-0.19.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:638e43936cc8b2cbb9f9d8dde0fe5e7e30766a3318d2342999ae27f68fdc9bd6"}, - {file = "tokenizers-0.19.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:78e769eb3b2c79687d9cb0f89ef77223e8e279b75c0a968e637ca7043a84463f"}, - {file = "tokenizers-0.19.1-cp37-none-win32.whl", hash = "sha256:72791f9bb1ca78e3ae525d4782e85272c63faaef9940d92142aa3eb79f3407a3"}, - {file = "tokenizers-0.19.1-cp37-none-win_amd64.whl", hash = "sha256:f3bbb7a0c5fcb692950b041ae11067ac54826204318922da754f908d95619fbc"}, - {file = "tokenizers-0.19.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:07f9295349bbbcedae8cefdbcfa7f686aa420be8aca5d4f7d1ae6016c128c0c5"}, - {file = "tokenizers-0.19.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:10a707cc6c4b6b183ec5dbfc5c34f3064e18cf62b4a938cb41699e33a99e03c1"}, - {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6309271f57b397aa0aff0cbbe632ca9d70430839ca3178bf0f06f825924eca22"}, - {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ad23d37d68cf00d54af184586d79b84075ada495e7c5c0f601f051b162112dc"}, - {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:427c4f0f3df9109314d4f75b8d1f65d9477033e67ffaec4bca53293d3aca286d"}, - {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e83a31c9cf181a0a3ef0abad2b5f6b43399faf5da7e696196ddd110d332519ee"}, - {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c27b99889bd58b7e301468c0838c5ed75e60c66df0d4db80c08f43462f82e0d3"}, - {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac0b0eb952412b0b196ca7a40e7dce4ed6f6926489313414010f2e6b9ec2adf"}, - {file = "tokenizers-0.19.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8a6298bde623725ca31c9035a04bf2ef63208d266acd2bed8c2cb7d2b7d53ce6"}, - {file = "tokenizers-0.19.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:08a44864e42fa6d7d76d7be4bec62c9982f6f6248b4aa42f7302aa01e0abfd26"}, - {file = "tokenizers-0.19.1-cp38-none-win32.whl", hash = "sha256:1de5bc8652252d9357a666e609cb1453d4f8e160eb1fb2830ee369dd658e8975"}, - {file = "tokenizers-0.19.1-cp38-none-win_amd64.whl", hash = "sha256:0bcce02bf1ad9882345b34d5bd25ed4949a480cf0e656bbd468f4d8986f7a3f1"}, - {file = "tokenizers-0.19.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:0b9394bd204842a2a1fd37fe29935353742be4a3460b6ccbaefa93f58a8df43d"}, - {file = "tokenizers-0.19.1-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:4692ab92f91b87769d950ca14dbb61f8a9ef36a62f94bad6c82cc84a51f76f6a"}, - {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6258c2ef6f06259f70a682491c78561d492e885adeaf9f64f5389f78aa49a051"}, - {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85cf76561fbd01e0d9ea2d1cbe711a65400092bc52b5242b16cfd22e51f0c58"}, - {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:670b802d4d82bbbb832ddb0d41df7015b3e549714c0e77f9bed3e74d42400fbe"}, - {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:85aa3ab4b03d5e99fdd31660872249df5e855334b6c333e0bc13032ff4469c4a"}, - {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbf001afbbed111a79ca47d75941e9e5361297a87d186cbfc11ed45e30b5daba"}, - {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4c89aa46c269e4e70c4d4f9d6bc644fcc39bb409cb2a81227923404dd6f5227"}, - {file = "tokenizers-0.19.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:39c1ec76ea1027438fafe16ecb0fb84795e62e9d643444c1090179e63808c69d"}, - {file = "tokenizers-0.19.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c2a0d47a89b48d7daa241e004e71fb5a50533718897a4cd6235cb846d511a478"}, - {file = "tokenizers-0.19.1-cp39-none-win32.whl", hash = "sha256:61b7fe8886f2e104d4caf9218b157b106207e0f2a4905c9c7ac98890688aabeb"}, - {file = "tokenizers-0.19.1-cp39-none-win_amd64.whl", hash = "sha256:f97660f6c43efd3e0bfd3f2e3e5615bf215680bad6ee3d469df6454b8c6e8256"}, - {file = "tokenizers-0.19.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3b11853f17b54c2fe47742c56d8a33bf49ce31caf531e87ac0d7d13d327c9334"}, - {file = "tokenizers-0.19.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d26194ef6c13302f446d39972aaa36a1dda6450bc8949f5eb4c27f51191375bd"}, - {file = "tokenizers-0.19.1-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e8d1ed93beda54bbd6131a2cb363a576eac746d5c26ba5b7556bc6f964425594"}, - {file = "tokenizers-0.19.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca407133536f19bdec44b3da117ef0d12e43f6d4b56ac4c765f37eca501c7bda"}, - {file = "tokenizers-0.19.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce05fde79d2bc2e46ac08aacbc142bead21614d937aac950be88dc79f9db9022"}, - {file = "tokenizers-0.19.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:35583cd46d16f07c054efd18b5d46af4a2f070a2dd0a47914e66f3ff5efb2b1e"}, - {file = "tokenizers-0.19.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:43350270bfc16b06ad3f6f07eab21f089adb835544417afda0f83256a8bf8b75"}, - {file = "tokenizers-0.19.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b4399b59d1af5645bcee2072a463318114c39b8547437a7c2d6a186a1b5a0e2d"}, - {file = "tokenizers-0.19.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6852c5b2a853b8b0ddc5993cd4f33bfffdca4fcc5d52f89dd4b8eada99379285"}, - {file = "tokenizers-0.19.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bcd266ae85c3d39df2f7e7d0e07f6c41a55e9a3123bb11f854412952deacd828"}, - {file = "tokenizers-0.19.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ecb2651956eea2aa0a2d099434134b1b68f1c31f9a5084d6d53f08ed43d45ff2"}, - {file = 
"tokenizers-0.19.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:b279ab506ec4445166ac476fb4d3cc383accde1ea152998509a94d82547c8e2a"}, - {file = "tokenizers-0.19.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:89183e55fb86e61d848ff83753f64cded119f5d6e1f553d14ffee3700d0a4a49"}, - {file = "tokenizers-0.19.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2edbc75744235eea94d595a8b70fe279dd42f3296f76d5a86dde1d46e35f574"}, - {file = "tokenizers-0.19.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:0e64bfde9a723274e9a71630c3e9494ed7b4c0f76a1faacf7fe294cd26f7ae7c"}, - {file = "tokenizers-0.19.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0b5ca92bfa717759c052e345770792d02d1f43b06f9e790ca0a1db62838816f3"}, - {file = "tokenizers-0.19.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f8a20266e695ec9d7a946a019c1d5ca4eddb6613d4f466888eee04f16eedb85"}, - {file = "tokenizers-0.19.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63c38f45d8f2a2ec0f3a20073cccb335b9f99f73b3c69483cd52ebc75369d8a1"}, - {file = "tokenizers-0.19.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dd26e3afe8a7b61422df3176e06664503d3f5973b94f45d5c45987e1cb711876"}, - {file = "tokenizers-0.19.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:eddd5783a4a6309ce23432353cdb36220e25cbb779bfa9122320666508b44b88"}, - {file = "tokenizers-0.19.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:56ae39d4036b753994476a1b935584071093b55c7a72e3b8288e68c313ca26e7"}, - {file = "tokenizers-0.19.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:f9939ca7e58c2758c01b40324a59c034ce0cebad18e0d4563a9b1beab3018243"}, - {file = "tokenizers-0.19.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6c330c0eb815d212893c67a032e9dc1b38a803eccb32f3e8172c19cc69fbb439"}, - {file = "tokenizers-0.19.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec11802450a2487cdf0e634b750a04cbdc1c4d066b97d94ce7dd2cb51ebb325b"}, - {file = "tokenizers-0.19.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2b718f316b596f36e1dae097a7d5b91fc5b85e90bf08b01ff139bd8953b25af"}, - {file = "tokenizers-0.19.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:ed69af290c2b65169f0ba9034d1dc39a5db9459b32f1dd8b5f3f32a3fcf06eab"}, - {file = "tokenizers-0.19.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f8a9c828277133af13f3859d1b6bf1c3cb6e9e1637df0e45312e6b7c2e622b1f"}, - {file = "tokenizers-0.19.1.tar.gz", hash = "sha256:ee59e6680ed0fdbe6b724cf38bd70400a0c1dd623b07ac729087270caeac88e3"}, -] - -[package.dependencies] -huggingface-hub = ">=0.16.4,<1.0" - -[package.extras] -dev = ["tokenizers[testing]"] -docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] -testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] - [[package]] name = "toml" version = "0.10.2" @@ -2934,64 +1631,6 @@ files = [ {file = "tomlkit-0.12.3.tar.gz", hash = "sha256:75baf5012d06501f07bee5bf8e801b9f343e7aac5a92581f20f80ce632e6b5a4"}, ] -[[package]] -name = "torch" -version = "2.2.0" -description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:d366158d6503a3447e67f8c0ad1328d54e6c181d88572d688a625fac61b13a97"}, - {file = 
"torch-2.2.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:707f2f80402981e9f90d0038d7d481678586251e6642a7a6ef67fc93511cb446"}, - {file = "torch-2.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:15c8f0a105c66b28496092fca1520346082e734095f8eaf47b5786bac24b8a31"}, - {file = "torch-2.2.0-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:0ca4df4b728515ad009b79f5107b00bcb2c63dc202d991412b9eb3b6a4f24349"}, - {file = "torch-2.2.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:3d3eea2d5969b9a1c9401429ca79efc668120314d443d3463edc3289d7f003c7"}, - {file = "torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0d1c580e379c0d48f0f0a08ea28d8e373295aa254de4f9ad0631f9ed8bc04c24"}, - {file = "torch-2.2.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:9328e3c1ce628a281d2707526b4d1080eae7c4afab4f81cea75bde1f9441dc78"}, - {file = "torch-2.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:03c8e660907ac1b8ee07f6d929c4e15cd95be2fb764368799cca02c725a212b8"}, - {file = "torch-2.2.0-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:da0cefe7f84ece3e3b56c11c773b59d1cb2c0fd83ddf6b5f7f1fd1a987b15c3e"}, - {file = "torch-2.2.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:f81d23227034221a4a4ff8ef24cc6cec7901edd98d9e64e32822778ff01be85e"}, - {file = "torch-2.2.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:dcbfb2192ac41ca93c756ebe9e2af29df0a4c14ee0e7a0dd78f82c67a63d91d4"}, - {file = "torch-2.2.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:9eeb42971619e24392c9088b5b6d387d896e267889d41d267b1fec334f5227c5"}, - {file = "torch-2.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:c718b2ca69a6cac28baa36d86d8c0ec708b102cebd1ceb1b6488e404cd9be1d1"}, - {file = "torch-2.2.0-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:f11d18fceb4f9ecb1ac680dde7c463c120ed29056225d75469c19637e9f98d12"}, - {file = "torch-2.2.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:ee1da852bfd4a7e674135a446d6074c2da7194c1b08549e31eae0b3138c6b4d2"}, - {file = "torch-2.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0d819399819d0862268ac531cf12a501c253007df4f9e6709ede8a0148f1a7b8"}, - {file = "torch-2.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:08f53ccc38c49d839bc703ea1b20769cc8a429e0c4b20b56921a9f64949bf325"}, - {file = "torch-2.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:93bffe3779965a71dab25fc29787538c37c5d54298fd2f2369e372b6fb137d41"}, - {file = "torch-2.2.0-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:c17ec323da778efe8dad49d8fb534381479ca37af1bfc58efdbb8607a9d263a3"}, - {file = "torch-2.2.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:c02685118008834e878f676f81eab3a952b7936fa31f474ef8a5ff4b5c78b36d"}, - {file = "torch-2.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:d9f39d6f53cec240a0e3baa82cb697593340f9d4554cee6d3d6ca07925c2fac0"}, - {file = "torch-2.2.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:51770c065206250dc1222ea7c0eff3f88ab317d3e931cca2aee461b85fbc2472"}, - {file = "torch-2.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:008e4c6ad703de55af760c73bf937ecdd61a109f9b08f2bbb9c17e7c7017f194"}, - {file = "torch-2.2.0-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:de8680472dd14e316f42ceef2a18a301461a9058cd6e99a1f1b20f78f11412f1"}, - {file = "torch-2.2.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:99e1dcecb488e3fd25bcaac56e48cdb3539842904bdc8588b0b255fde03a254c"}, -] - -[package.dependencies] -filelock = "*" -fsspec = "*" -jinja2 = "*" -networkx = "*" -nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == 
\"x86_64\""} -nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cudnn-cu12 = {version = "8.9.2.26", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nccl-cu12 = {version = "2.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -sympy = "*" -triton = {version = "2.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -typing-extensions = ">=4.8.0" - -[package.extras] -opt-einsum = ["opt-einsum (>=3.3)"] -optree = ["optree (>=0.9.1)"] - [[package]] name = "tqdm" version = "4.66.2" @@ -3012,95 +1651,6 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] -[[package]] -name = "transformers" -version = "4.41.2" -description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "transformers-4.41.2-py3-none-any.whl", hash = "sha256:05555d20e43f808de1ef211ab64803cdb513170cef70d29a888b589caebefc67"}, - {file = "transformers-4.41.2.tar.gz", hash = "sha256:80a4db216533d573e9cc7388646c31ed9480918feb7c55eb211249cb23567f87"}, -] - -[package.dependencies] -filelock = "*" -huggingface-hub = ">=0.23.0,<1.0" -numpy = ">=1.17" -packaging = ">=20.0" -pyyaml = ">=5.1" -regex = "!=2019.12.17" -requests = "*" -safetensors = ">=0.4.1" -tokenizers = ">=0.19,<0.20" -tqdm = ">=4.27" - -[package.extras] -accelerate = ["accelerate (>=0.21.0)"] -agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision"] -audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -codecarbon = ["codecarbon (==1.2.0)"] -deepspeed = ["accelerate (>=0.21.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk", "optuna", 
"parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.19,<0.20)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] -flax-speech = 
["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -ftfy = ["ftfy"] -integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] -modelcreation = ["cookiecutter (==1.7.3)"] -natten = ["natten (>=0.14.6,<0.15.0)"] -onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] -onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] -optuna = ["optuna"] -quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<2.0.0)"] -ray = ["ray[tune] (>=2.7.0)"] -retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] -sagemaker = ["sagemaker (>=2.31.0)"] -sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] -serving = ["fastapi", "pydantic", "starlette", "uvicorn"] -sigopt = ["sigopt"] -sklearn = ["scikit-learn"] -speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] -tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] -tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -timm = ["timm"] -tokenizers = ["tokenizers (>=0.19,<0.20)"] -torch = ["accelerate (>=0.21.0)", "torch"] -torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.23.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.19,<0.20)", "torch", "tqdm (>=4.27)"] -video = ["av (==9.2.0)", "decord (==0.6.0)"] -vision = ["Pillow (>=10.0.1,<=15.0)"] - -[[package]] -name = "triton" -version = "2.2.0" -description = "A language and compiler for custom Deep Learning operations" -optional = true -python-versions = "*" -files = [ - {file = "triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5"}, - {file = "triton-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da58a152bddb62cafa9a857dd2bc1f886dbf9f9c90a2b5da82157cd2b34392b0"}, - {file = "triton-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af58716e721460a61886668b205963dc4d1e4ac20508cc3f623aef0d70283d5"}, - {file = "triton-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8fe46d3ab94a8103e291bd44c741cc294b91d1d81c1a2888254cbf7ff846dab"}, - {file = "triton-2.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8ce26093e539d727e7cf6f6f0d932b1ab0574dc02567e684377630d86723ace"}, - {file 
= "triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:227cc6f357c5efcb357f3867ac2a8e7ecea2298cd4606a8ba1e931d1d5a947df"}, -] - -[package.dependencies] -filelock = "*" - -[package.extras] -build = ["cmake (>=3.20)", "lit"] -tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)", "torch"] -tutorials = ["matplotlib", "pandas", "tabulate", "torch"] - [[package]] name = "typer" version = "0.9.0" @@ -3216,17 +1766,6 @@ files = [ {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"}, ] -[[package]] -name = "tzdata" -version = "2024.1" -description = "Provider of IANA time zone data" -optional = true -python-versions = ">=2" -files = [ - {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, - {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, -] - [[package]] name = "urllib3" version = "2.0.7" @@ -3342,123 +1881,6 @@ files = [ {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, ] -[[package]] -name = "xxhash" -version = "3.4.1" -description = "Python binding for xxHash" -optional = true -python-versions = ">=3.7" -files = [ - {file = "xxhash-3.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:91dbfa55346ad3e18e738742236554531a621042e419b70ad8f3c1d9c7a16e7f"}, - {file = "xxhash-3.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:665a65c2a48a72068fcc4d21721510df5f51f1142541c890491afc80451636d2"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb11628470a6004dc71a09fe90c2f459ff03d611376c1debeec2d648f44cb693"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bef2a7dc7b4f4beb45a1edbba9b9194c60a43a89598a87f1a0226d183764189"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0f7b2d547d72c7eda7aa817acf8791f0146b12b9eba1d4432c531fb0352228"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00f2fdef6b41c9db3d2fc0e7f94cb3db86693e5c45d6de09625caad9a469635b"}, - {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23cfd9ca09acaf07a43e5a695143d9a21bf00f5b49b15c07d5388cadf1f9ce11"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6a9ff50a3cf88355ca4731682c168049af1ca222d1d2925ef7119c1a78e95b3b"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f1d7c69a1e9ca5faa75546fdd267f214f63f52f12692f9b3a2f6467c9e67d5e7"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:672b273040d5d5a6864a36287f3514efcd1d4b1b6a7480f294c4b1d1ee1b8de0"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4178f78d70e88f1c4a89ff1ffe9f43147185930bb962ee3979dba15f2b1cc799"}, - {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9804b9eb254d4b8cc83ab5a2002128f7d631dd427aa873c8727dba7f1f0d1c2b"}, - {file = "xxhash-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c09c49473212d9c87261d22c74370457cfff5db2ddfc7fd1e35c80c31a8c14ce"}, - {file = "xxhash-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:ebbb1616435b4a194ce3466d7247df23499475c7ed4eb2681a1fa42ff766aff6"}, - {file = 
"xxhash-3.4.1-cp310-cp310-win_arm64.whl", hash = "sha256:25dc66be3db54f8a2d136f695b00cfe88018e59ccff0f3b8f545869f376a8a46"}, - {file = "xxhash-3.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58c49083801885273e262c0f5bbeac23e520564b8357fbb18fb94ff09d3d3ea5"}, - {file = "xxhash-3.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b526015a973bfbe81e804a586b703f163861da36d186627e27524f5427b0d520"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36ad4457644c91a966f6fe137d7467636bdc51a6ce10a1d04f365c70d6a16d7e"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:248d3e83d119770f96003271fe41e049dd4ae52da2feb8f832b7a20e791d2920"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2070b6d5bbef5ee031666cf21d4953c16e92c2f8a24a94b5c240f8995ba3b1d0"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2746035f518f0410915e247877f7df43ef3372bf36cfa52cc4bc33e85242641"}, - {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a8ba6181514681c2591840d5632fcf7356ab287d4aff1c8dea20f3c78097088"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0aac5010869240e95f740de43cd6a05eae180c59edd182ad93bf12ee289484fa"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4cb11d8debab1626181633d184b2372aaa09825bde709bf927704ed72765bed1"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b29728cff2c12f3d9f1d940528ee83918d803c0567866e062683f300d1d2eff3"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:a15cbf3a9c40672523bdb6ea97ff74b443406ba0ab9bca10ceccd9546414bd84"}, - {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e66df260fed01ed8ea790c2913271641c58481e807790d9fca8bfd5a3c13844"}, - {file = "xxhash-3.4.1-cp311-cp311-win32.whl", hash = "sha256:e867f68a8f381ea12858e6d67378c05359d3a53a888913b5f7d35fbf68939d5f"}, - {file = "xxhash-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:200a5a3ad9c7c0c02ed1484a1d838b63edcf92ff538770ea07456a3732c577f4"}, - {file = "xxhash-3.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:1d03f1c0d16d24ea032e99f61c552cb2b77d502e545187338bea461fde253583"}, - {file = "xxhash-3.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c4bbba9b182697a52bc0c9f8ec0ba1acb914b4937cd4a877ad78a3b3eeabefb3"}, - {file = "xxhash-3.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9fd28a9da300e64e434cfc96567a8387d9a96e824a9be1452a1e7248b7763b78"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6066d88c9329ab230e18998daec53d819daeee99d003955c8db6fc4971b45ca3"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93805bc3233ad89abf51772f2ed3355097a5dc74e6080de19706fc447da99cd3"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64da57d5ed586ebb2ecdde1e997fa37c27fe32fe61a656b77fabbc58e6fbff6e"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a97322e9a7440bf3c9805cbaac090358b43f650516486746f7fa482672593df"}, - {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:bbe750d512982ee7d831838a5dee9e9848f3fb440e4734cca3f298228cc957a6"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fd79d4087727daf4d5b8afe594b37d611ab95dc8e29fe1a7517320794837eb7d"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:743612da4071ff9aa4d055f3f111ae5247342931dedb955268954ef7201a71ff"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:b41edaf05734092f24f48c0958b3c6cbaaa5b7e024880692078c6b1f8247e2fc"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:a90356ead70d715fe64c30cd0969072de1860e56b78adf7c69d954b43e29d9fa"}, - {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ac56eebb364e44c85e1d9e9cc5f6031d78a34f0092fea7fc80478139369a8b4a"}, - {file = "xxhash-3.4.1-cp312-cp312-win32.whl", hash = "sha256:911035345932a153c427107397c1518f8ce456f93c618dd1c5b54ebb22e73747"}, - {file = "xxhash-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:f31ce76489f8601cc7b8713201ce94b4bd7b7ce90ba3353dccce7e9e1fee71fa"}, - {file = "xxhash-3.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:b5beb1c6a72fdc7584102f42c4d9df232ee018ddf806e8c90906547dfb43b2da"}, - {file = "xxhash-3.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6d42b24d1496deb05dee5a24ed510b16de1d6c866c626c2beb11aebf3be278b9"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b685fab18876b14a8f94813fa2ca80cfb5ab6a85d31d5539b7cd749ce9e3624"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419ffe34c17ae2df019a4685e8d3934d46b2e0bbe46221ab40b7e04ed9f11137"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0e041ce5714f95251a88670c114b748bca3bf80cc72400e9f23e6d0d59cf2681"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc860d887c5cb2f524899fb8338e1bb3d5789f75fac179101920d9afddef284b"}, - {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:312eba88ffe0a05e332e3a6f9788b73883752be63f8588a6dc1261a3eaaaf2b2"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e01226b6b6a1ffe4e6bd6d08cfcb3ca708b16f02eb06dd44f3c6e53285f03e4f"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9f3025a0d5d8cf406a9313cd0d5789c77433ba2004b1c75439b67678e5136537"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:6d3472fd4afef2a567d5f14411d94060099901cd8ce9788b22b8c6f13c606a93"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:43984c0a92f06cac434ad181f329a1445017c33807b7ae4f033878d860a4b0f2"}, - {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a55e0506fdb09640a82ec4f44171273eeabf6f371a4ec605633adb2837b5d9d5"}, - {file = "xxhash-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:faec30437919555b039a8bdbaba49c013043e8f76c999670aef146d33e05b3a0"}, - {file = "xxhash-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:c9e1b646af61f1fc7083bb7b40536be944f1ac67ef5e360bca2d73430186971a"}, - {file = "xxhash-3.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:961d948b7b1c1b6c08484bbce3d489cdf153e4122c3dfb07c2039621243d8795"}, - {file = "xxhash-3.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:719a378930504ab159f7b8e20fa2aa1896cde050011af838af7e7e3518dd82de"}, - {file = 
"xxhash-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74fb5cb9406ccd7c4dd917f16630d2e5e8cbbb02fc2fca4e559b2a47a64f4940"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5dab508ac39e0ab988039bc7f962c6ad021acd81fd29145962b068df4148c476"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c59f3e46e7daf4c589e8e853d700ef6607afa037bfad32c390175da28127e8c"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc07256eff0795e0f642df74ad096f8c5d23fe66bc138b83970b50fc7f7f6c5"}, - {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9f749999ed80f3955a4af0eb18bb43993f04939350b07b8dd2f44edc98ffee9"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7688d7c02149a90a3d46d55b341ab7ad1b4a3f767be2357e211b4e893efbaaf6"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a8b4977963926f60b0d4f830941c864bed16aa151206c01ad5c531636da5708e"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8106d88da330f6535a58a8195aa463ef5281a9aa23b04af1848ff715c4398fb4"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4c76a77dbd169450b61c06fd2d5d436189fc8ab7c1571d39265d4822da16df22"}, - {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:11f11357c86d83e53719c592021fd524efa9cf024dc7cb1dfb57bbbd0d8713f2"}, - {file = "xxhash-3.4.1-cp38-cp38-win32.whl", hash = "sha256:0c786a6cd74e8765c6809892a0d45886e7c3dc54de4985b4a5eb8b630f3b8e3b"}, - {file = "xxhash-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:aabf37fb8fa27430d50507deeab2ee7b1bcce89910dd10657c38e71fee835594"}, - {file = "xxhash-3.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6127813abc1477f3a83529b6bbcfeddc23162cece76fa69aee8f6a8a97720562"}, - {file = "xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef2e194262f5db16075caea7b3f7f49392242c688412f386d3c7b07c7733a70a"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71be94265b6c6590f0018bbf73759d21a41c6bda20409782d8117e76cd0dfa8b"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10e0a619cdd1c0980e25eb04e30fe96cf8f4324758fa497080af9c21a6de573f"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa122124d2e3bd36581dd78c0efa5f429f5220313479fb1072858188bc2d5ff1"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17032f5a4fea0a074717fe33477cb5ee723a5f428de7563e75af64bfc1b1e10"}, - {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca7783b20e3e4f3f52f093538895863f21d18598f9a48211ad757680c3bd006f"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d77d09a1113899fad5f354a1eb4f0a9afcf58cefff51082c8ad643ff890e30cf"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:21287bcdd299fdc3328cc0fbbdeaa46838a1c05391264e51ddb38a3f5b09611f"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:dfd7a6cc483e20b4ad90224aeb589e64ec0f31e5610ab9957ff4314270b2bf31"}, - {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:543c7fcbc02bbb4840ea9915134e14dc3dc15cbd5a30873a7a5bf66039db97ec"}, 
- {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fe0a98d990e433013f41827b62be9ab43e3cf18e08b1483fcc343bda0d691182"}, - {file = "xxhash-3.4.1-cp39-cp39-win32.whl", hash = "sha256:b9097af00ebf429cc7c0e7d2fdf28384e4e2e91008130ccda8d5ae653db71e54"}, - {file = "xxhash-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:d699b921af0dcde50ab18be76c0d832f803034d80470703700cb7df0fbec2832"}, - {file = "xxhash-3.4.1-cp39-cp39-win_arm64.whl", hash = "sha256:2be491723405e15cc099ade1280133ccfbf6322d2ef568494fb7d07d280e7eee"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:431625fad7ab5649368c4849d2b49a83dc711b1f20e1f7f04955aab86cd307bc"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc6dbd5fc3c9886a9e041848508b7fb65fd82f94cc793253990f81617b61fe49"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ff8dbd0ec97aec842476cb8ccc3e17dd288cd6ce3c8ef38bff83d6eb927817"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef73a53fe90558a4096e3256752268a8bdc0322f4692ed928b6cd7ce06ad4fe3"}, - {file = "xxhash-3.4.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:450401f42bbd274b519d3d8dcf3c57166913381a3d2664d6609004685039f9d3"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a162840cf4de8a7cd8720ff3b4417fbc10001eefdd2d21541a8226bb5556e3bb"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b736a2a2728ba45017cb67785e03125a79d246462dfa892d023b827007412c52"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d0ae4c2e7698adef58710d6e7a32ff518b66b98854b1c68e70eee504ad061d8"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6322c4291c3ff174dcd104fae41500e75dad12be6f3085d119c2c8a80956c51"}, - {file = "xxhash-3.4.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:dd59ed668801c3fae282f8f4edadf6dc7784db6d18139b584b6d9677ddde1b6b"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:92693c487e39523a80474b0394645b393f0ae781d8db3474ccdcead0559ccf45"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4603a0f642a1e8d7f3ba5c4c25509aca6a9c1cc16f85091004a7028607ead663"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa45e8cbfbadb40a920fe9ca40c34b393e0b067082d94006f7f64e70c7490a6"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:595b252943b3552de491ff51e5bb79660f84f033977f88f6ca1605846637b7c6"}, - {file = "xxhash-3.4.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:562d8b8f783c6af969806aaacf95b6c7b776929ae26c0cd941d54644ea7ef51e"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:41ddeae47cf2828335d8d991f2d2b03b0bdc89289dc64349d712ff8ce59d0647"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c44d584afdf3c4dbb3277e32321d1a7b01d6071c1992524b6543025fb8f4206f"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:fd7bddb3a5b86213cc3f2c61500c16945a1b80ecd572f3078ddbbe68f9dabdfb"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ecb6c987b62437c2f99c01e97caf8d25660bf541fe79a481d05732e5236719c"}, - {file = "xxhash-3.4.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:696b4e18b7023527d5c50ed0626ac0520edac45a50ec7cf3fc265cd08b1f4c03"}, - {file = "xxhash-3.4.1.tar.gz", hash = "sha256:0379d6cf1ff987cd421609a264ce025e74f346e3e145dd106c0cc2e3ec3f99a9"}, -] - [[package]] name = "yarl" version = "1.9.4" @@ -3577,10 +1999,7 @@ files = [ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] -[extras] -mllib = ["accelerate", "datasets", "einops", "h5py", "peft", "safetensors", "transformers"] - [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "cef02f25d2bdc395f2187bf7b01eabd560ba18e597bf50005dd90b80fa25336c" +content-hash = "59be54627e27caf3aa6e089881036b45a65705fcd5f31c9165ddc203930d526d" diff --git a/pyproject.toml b/pyproject.toml index dc20e960..a6246f77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "friendli-client" -version = "1.4.2" +version = "1.5.0" description = "Client of Friendli Suite." license = "Apache-2.0" authors = ["FriendliAI teams "] @@ -34,16 +34,9 @@ rich = "^12.2.0" jsonschema = "^4.17.3" tqdm = "^4.48.0" pydantic = {extras = ["email"], version = ">=1.9.0, <3"} -transformers = { version = "4.41.2", optional = true } -h5py = { version = "^3.9.0", optional = true } -einops = { version = "^0.6.1", optional = true } -accelerate = { version = "0.21.0", optional = true } -datasets = { version = "2.16.0", optional = true } injector = "^0.21.0" protobuf = "^5.26.1" types-protobuf = "^5.26.0.20240422" -peft = { version = "0.6.0", optional = true } -safetensors = { version = "0.4.1", optional = true } httpx = "^0.24.1" fastapi = "^0.104.0" uvicorn = "^0.23.2" @@ -75,9 +68,6 @@ types-toml = "^0.10.8.6" types-tqdm = "^4.65.0.1" typer = "^0.9.0" -[tool.poetry.extras] -mllib = ["transformers", "h5py", "accelerate", "einops", "datasets", "peft", "safetensors"] - [tool.isort] profile = "black" known_local_folder = ["tests"] @@ -122,12 +112,6 @@ disable = [ ] extension-pkg-whitelist = "pydantic" -[tool.pylint.TYPECHECK] -generated-members = [ - "numpy.*" , - "torch.*" -] - [tool.pylint.check] ignored-classes = "Depends" ignore-patterns = [ diff --git a/tests/unit_tests/modules/__init__.py b/tests/unit_tests/modules/__init__.py deleted file mode 100644 index 1fc4d985..00000000 --- a/tests/unit_tests/modules/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. diff --git a/tests/unit_tests/modules/conftest.py b/tests/unit_tests/modules/conftest.py deleted file mode 100644 index f79aaa6a..00000000 --- a/tests/unit_tests/modules/conftest.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -from __future__ import annotations - -from typing import Any, Dict - -import pytest -from peft import PeftConfig -from transformers import ( - AutoConfig, - BlenderbotConfig, - BloomConfig, - CodeGenConfig, - FalconConfig, - GPT2Config, - GPTJConfig, - GPTNeoXConfig, - LlamaConfig, - MistralConfig, - MixtralConfig, - MptConfig, - OPTConfig, - T5Config, -) -from transformers.models.mpt.configuration_mpt import MptAttentionConfig - -from friendli.enums import ModelDataType -from friendli.modules.converter.base import OneOfConverter -from friendli.modules.converter.maps import get_hf_converter_factory -from friendli.modules.converter.models.mixtral import MixtralForCausalLMConverter -from friendli.modules.converter.utils import get_model_arch - -from tests.unit_tests.modules.helpers.utils import ModelConfig, get_param_specs - -model_name_config_map = { - "blenderbot": BlenderbotConfig( - architectures=["BlenderbotForConditionalGeneration"], - activation_function="gelu", - tie_word_embeddings=True, - decoder_attention_heads=32, - encoder_attention_heads=32, - decoder_ffn_dim=10240, - encoder_ffn_dim=10240, - encoder_layers=1, - decoder_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "bloom": BloomConfig( - architectures=["BloomForCausalLM"], - apply_residual_connection_post_layernorm=False, - slow_but_exact=False, - tie_word_embeddings=True, - layer_norm_epsilon=1e-5, - n_layer=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "codegen": CodeGenConfig( - architectures=["CodeGenForCausalLM"], - activation_function="gelu", - tie_word_embeddings=False, - layer_norm_epsilon=1e-5, - n_layer=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "falcon_7b": FalconConfig( # falcon-7b - architectures=["FalconForCausalLM"], - alibi=False, - bias=False, - new_decoder_architecture=False, - parallel_attn=True, - num_hidden_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "falcon": FalconConfig( # falcon-40b - architectures=["FalconForCausalLM"], - alibi=False, - bias=False, - new_decoder_architecture=True, - num_hidden_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "gpt_neox": GPTNeoXConfig( # pythia-1.4b - architectures=["GPTNeoXForCausalLM"], - hidden_act="gelu", - use_parallel_residual=True, - tie_word_embeddings=False, - layer_norm_eps=1e-5, - rotary_emb_base=10000, - num_hidden_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "gpt": GPT2Config( - architectures=["GPT2LMHeadModel"], - activation_function="gelu", - scale_attn_by_inverse_layer_idx=False, - tie_word_embeddings=True, - layer_norm_epsilon=1e-5, - n_layer=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "gpt_j": GPTJConfig( # gpt-j-6b - architectures=["GPTJForCausalLM"], - tie_word_embeddings=False, - layer_norm_epsilon=1e-5, - n_layer=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "llama": LlamaConfig( - architectures=["LlamaForCausalLM"], - hidden_act="silu", - tie_word_embeddings=False, - rms_norm_eps=1e-5, - num_hidden_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "mpt": MptConfig( - architectures=["MPTForCausalLM"], - attn_config=MptAttentionConfig( - alibi=True, - alibi_bias_max=8, - attn_type="multihead_attention", - prefix_lm=False, - qk_ln=False, - softmax_scale=None, - ), - expansion_ratio=4, - no_bias=True, - logit_scale=None, - n_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "opt": OPTConfig( - architectures=["OPTForCausalLM"], - 
activation_function="relu", - do_layer_norm_before=True, - word_embed_proj_dim=768, - hidden_size=768, - _remove_first_dropout=False, - tie_word_embeddings=True, - num_hidden_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "t5_v1_1": T5Config( - architectures=["T5ForConditionalGeneration"], - is_gated_act=True, - tie_word_embeddings=False, - num_hidden_layers=1, - num_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - relative_attention_num_buckets=32, # fixed value for t5 - ), - "t5": T5Config( - architectures=["T5ForConditionalGeneration"], - is_gated_act=False, - tie_word_embeddings=True, - layer_norm_epsilon=1e-6, - num_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - relative_attention_num_buckets=32, # fixed value for t5 - ), - "mistral": MistralConfig( # same as llama architecture - architectures=["MistralForCausalLM"], - hidden_act="silu", - tie_word_embeddings=False, - rope_theta=10000.0, - rms_norm_eps=1e-5, - num_hidden_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - "mixtral": MixtralConfig( # same as llama architecture - architectures=["MixtralForCausalLM"], - hidden_act="silu", - tie_word_embeddings=False, - rope_theta=10000.0, - rms_norm_eps=1e-5, - num_hidden_layers=1, - vocab_size=10000, - max_position_embeddings=1024, - ), - # TODO: add phi_msft - # TODO: add mpt with grouped querry attention (e.g. replit-code) -} - - -@pytest.fixture -def converter(model_config: AutoConfig) -> OneOfConverter: - model_arch = get_model_arch(model_config) - _, converter_cls = get_hf_converter_factory(model_arch) - return converter_cls(model_config, None, ModelDataType.FP16) - - -# TODO: add render_model_config per model -@pytest.fixture -def render_model_config(converter: OneOfConverter) -> ModelConfig: - return ModelConfig( - dtype="float16", - num_decoder_layers=converter.decoder_layer_num, - hidden_size=converter.decoder_hidden_size, - num_heads=converter.decoder_num_attention_heads, - num_kv_heads=converter.decoder_num_kv_attention_heads, - head_size=converter.decoder_head_size, - num_encoder_layers=converter.decoder_layer_num, # same as decoder for test - ff_intermediate_size=converter.decoder_ff_intermediate_size, - num_experts=converter.num_experts - if isinstance(converter, MixtralForCausalLMConverter) - else None, - ) - - -@pytest.fixture -def spec_data(model_name: str, render_model_config: ModelConfig) -> Dict[str, Any]: - param_specs = get_param_specs(model_name, "models", render_model_config) - return param_specs diff --git a/tests/unit_tests/modules/helpers/__init__.py b/tests/unit_tests/modules/helpers/__init__.py deleted file mode 100644 index 1fc4d985..00000000 --- a/tests/unit_tests/modules/helpers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. diff --git a/tests/unit_tests/modules/helpers/spec.py b/tests/unit_tests/modules/helpers/spec.py deleted file mode 100644 index 127d8d59..00000000 --- a/tests/unit_tests/modules/helpers/spec.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""Model spec utils""" - -from __future__ import annotations - -from enum import Enum -from pathlib import Path -from typing import Any, Dict, List, Tuple, Union - -import numpy as np -import yaml -from jinja2.environment import Template as JinjaTemplate -from pydantic import BaseModel - -from friendli.utils.compat import model_parse - - -class InvalidSpecFormatError(Exception): - """Invalid model spec format that can be handled by users.""" - - -class SpecNodeType(str, Enum): - """Model spec node type.""" - - DATA = "data" - GROUP = "group" - REPEAT_GROUP = "repeat_group" - - -class ParamInfo(BaseModel): - """Parameter info.""" - - name: str - dtype: np.dtype - shape: Tuple[int, ...] - - class Config: - arbitrary_types_allowed = ( - True # for np.dtype only check `isinstance(dtype, np.dtype)` - ) - - @classmethod - def load(cls, name: str, data: Dict[str, Any]) -> ParamInfo: - """Load a param info from data. - - Args: - name (str): Name of parameter. - data (dict[str, Any]): A dictionary describing the parameter info. - - Raises: - InvalidSpecFormatError: Raised if required key does not exist in data. - - Returns: - ParamInfo: Loaded param info. - - """ - try: - dtype = np.dtype(data["dtype"]) - return ParamInfo( - name=name, - dtype=dtype, - shape=tuple(map(int, data["shape"])), - ) - except (KeyError, AttributeError, TypeError) as exc: - raise InvalidSpecFormatError from exc - - -class RepeatRange(BaseModel): - """Repeat group's repeat range.""" - - lo: int - hi: int - - -class Template: - """Renderable YAML template.""" - - def __init__(self, jinja_template: JinjaTemplate): - self._jinja2_template = jinja_template - - @classmethod - def from_file(cls, path: Union[str, Path]) -> Template: - with open(path, "r") as f: - return cls(jinja_template=JinjaTemplate(f.read())) - - def render(self, **kwargs) -> Dict[str, Any] | List[Dict[str, Any]]: - """Render a Jinja2-YAML template with filling the variables. - - Returns: - dict[str, Any] | list[dict[str, Any]]: Rendered template in JSON format. - - """ - return yaml.safe_load(self._jinja2_template.render(**kwargs)) - - -class ModelSpecParser: - """Model spec parser""" - - def __init__(self, model_spec: Dict[str, Any]) -> None: - """Intialize model spec parser. - - Args: - model_spec (dict[str, Any]): A dictionary describing the entire model spec. - - """ - self._model_spec = model_spec - - def get_all_param_info(self) -> Dict[ParamInfo]: - """Get all parameter info specified in the model spec. - - Returns: - list[ParamInfo]: A list of param info. - - """ - return self._get_param_info(self._model_spec) - - def _get_param_info( - self, spec: Dict[str, Any], name_prefix: str = "" - ) -> Dict[ParamInfo]: - """Get a dictionary of param info in recursion. - - Args: - spec (dict[str, Any]): Full or partial model spec. - name_prefix (str, optional): Parsed name until the current recursion step. Defaults to "". - - Returns: - Dict[ParamInfo]: A dictionary of param info. 
- - """ - try: - node_type = spec["type"] - except KeyError as exc: - raise InvalidSpecFormatError from exc - - if node_type == SpecNodeType.DATA: - return {name_prefix: ParamInfo.load(name=name_prefix, data=spec)} - if node_type == SpecNodeType.GROUP: - res = {} - for child_name, child_spec in spec.items(): - if child_name == "type": - continue - res.update( - self._get_param_info( - spec=child_spec, - name_prefix=f"{name_prefix}/{child_name}" - if name_prefix - else child_name, - ) - ) - return res - if node_type == SpecNodeType.REPEAT_GROUP: - try: - repeat_range = model_parse(RepeatRange, spec["range"]) # type: ignore - except KeyError as exc: - raise InvalidSpecFormatError from exc - res = {} - - for i in range(repeat_range.lo, repeat_range.hi + 1): - for child_name, child_spec in spec.items(): - if child_name in ["type", "range"]: - continue - res.update( - self._get_param_info( - spec=child_spec, - name_prefix=f"{name_prefix.replace('*', str(i))}/{child_name}" - if name_prefix - else child_name, - ) - ) - return res - raise InvalidSpecFormatError diff --git a/tests/unit_tests/modules/helpers/utils.py b/tests/unit_tests/modules/helpers/utils.py deleted file mode 100644 index 1de9b23d..00000000 --- a/tests/unit_tests/modules/helpers/utils.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -from __future__ import annotations - -import os -from dataclasses import fields -from typing import Dict, Optional -from unittest.mock import Mock - -import numpy as np -import torch -from accelerate import init_empty_weights -from peft import PeftConfig, PeftModel -from pydantic import BaseModel -from transformers import PretrainedConfig - -from friendli.enums import ModelDataType -from friendli.modules.converter.maps import ( - get_adapter_converter_factory, - get_hf_converter_factory, -) -from friendli.modules.converter.utils import get_model_arch -from friendli.modules.quantizer.awq.base import AWQQuantizer -from friendli.modules.quantizer.layers import ( - WeightActQuantizedLinearLayer, - WeightOnlyQuantizedLinearLayer, -) -from friendli.modules.quantizer.schema.config import AWQConfig -from friendli.modules.quantizer.schema.data import QuantInput -from friendli.modules.quantizer.smoothquant.base import SmoothQuantQuantizer -from friendli.utils.compat import model_dump - -from tests.unit_tests.modules.helpers.spec import ModelSpecParser, ParamInfo, Template - -SPEC_PATH_PREFIX = os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "specs/" -) - - -class ModelConfig(BaseModel): - """Adjustable model config.""" - - dtype: str - num_decoder_layers: int - hidden_size: int - num_encoder_layers: Optional[int] = None - ff_intermediate_size: Optional[int] = None - num_heads: Optional[int] = None - num_kv_heads: Optional[int] = None - head_size: Optional[int] = None - seq_len: Optional[int] = 1024 - vocab_size: Optional[int] = 10000 - num_experts: Optional[int] = 8 - - -class LoraAdapterConfig(ModelConfig): - """Adjustable model config.""" - - lora_rank_dim: int - - -class AWQModelConfig(ModelConfig): - """Adjustable model config for AWQ.""" - - group_size: int = 1 - q_dtype: str = "int8" - - -class SmoothQuantModelConfig(ModelConfig): - """Adjustable model config for SmoothQuant.""" - - attn_fc_smoothing: bool = False - ff2_smoothing: bool = False - q_dtype: str = "int8" - - -def get_numpy_data_type(data_type: ModelDataType) -> np.dtype: - if data_type == ModelDataType.FP32: - return np.float32 - elif data_type == 
ModelDataType.FP16: - return np.float16 - elif data_type == ModelDataType.BF16: - return np.uint32 - else: - return np.int8 - - -def get_param_specs( - model_name: str, spec_folder: str, model_config: ModelConfig -) -> Dict[str, ParamInfo]: - file_path = f"{SPEC_PATH_PREFIX}{spec_folder}/{model_name}.yaml" - template = Template.from_file(file_path) - render_config = model_dump(model_config) - rendered = template.render(**render_config) - assert isinstance(rendered, dict) - parser = ModelSpecParser(model_spec=rendered) - param_specs = parser.get_all_param_info() - return param_specs - - -def get_meta_model( - model_config: PretrainedConfig, -) -> torch.nn.Module: - model_arch = get_model_arch(model_config) - model_factory, _ = get_hf_converter_factory(model_arch) - with init_empty_weights(): - model = model_factory(config=model_config) - return model - - -def get_meta_model_with_adapter( - model_config: PretrainedConfig, adapter_config: PeftConfig -) -> torch.nn.Module: - model_arch = get_model_arch(model_config) - model_factory, _ = get_hf_converter_factory(model_arch) - with init_empty_weights(): - model = model_factory(config=model_config) - PeftModel(model, adapter_config) - return model - - -def get_smoothquant_quantized_meta_model( - model_config: PretrainedConfig, quantizer: SmoothQuantQuantizer -): - model = get_meta_model(model_config) - model = quantizer.hook.pre_smooth(model).to("meta") - - def weight_act_quant_layer(quant_input: QuantInput): - weight, start, end = ( - quant_input.weight, - quant_input.start_offset, - quant_input.end_offset, - ) - weight = weight[start:end] - return WeightActQuantizedLinearLayer( # meta quantized linear layer - in_features=weight.size(1), - out_features=weight.size(0), - q_weight=weight, - weight_scale=torch.zeros(weight.size(1), device="meta"), - act_scale=torch.zeros(weight.size(1), device="meta"), - ) - - for tf_quant_input in quantizer.hook.iter_tf_quant_inputs(model): - for field in fields(tf_quant_input): - quant_input = getattr(tf_quant_input, field.name) - if isinstance(quant_input, QuantInput): - weight_act_quant_layer = Mock(side_effect=weight_act_quant_layer) - q_layer = weight_act_quant_layer(quant_input) - tf_quant_input.block.add_module(field.name, q_layer) - - return model - - -def get_awq_quantized_meta_model( - model_config: PretrainedConfig, quantizer: AWQQuantizer, quant_config: AWQConfig -): - model = get_meta_model(model_config) - model = quantizer.hook.add_pre_scaler(model).to("meta") - - def weight_act_quant_layer(quant_input: QuantInput): - weight, start, end = ( - quant_input.weight, - quant_input.start_offset, - quant_input.end_offset, - ) - w = weight[start:end] - out_dim = w.size(0) - in_dim = w.size(1) - num_groups = in_dim // quant_config.awq_args.quant_group_size - return WeightOnlyQuantizedLinearLayer( # meta quantized linear layer - in_features=in_dim, - out_features=out_dim, - q_weight=w, - weight_scale=torch.zeros((num_groups, out_dim), device="meta"), - zeros=torch.zeros((num_groups, out_dim), device="meta"), - ) - - for tf_quant_input in quantizer.hook.iter_tf_quant_inputs(model): - for field in fields(tf_quant_input): - quant_input = getattr(tf_quant_input, field.name) - if isinstance(quant_input, QuantInput): - weight_only_quantzer = Mock(side_effect=weight_act_quant_layer) - q_layer = weight_only_quantzer(quant_input) - tf_quant_input.block.add_module(field.name, q_layer) - - return model diff --git a/tests/unit_tests/modules/specs/awq/gpt_j.yaml b/tests/unit_tests/modules/specs/awq/gpt_j.yaml deleted file 
mode 100644 index 21ee18a5..00000000 --- a/tests/unit_tests/modules/specs/awq/gpt_j.yaml +++ /dev/null @@ -1,162 +0,0 @@ -# Jinja2 template to validate GPT-J model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 3 | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - ln_1: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 4 | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - awq: - pre_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 // group_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/awq/gpt_neox.yaml b/tests/unit_tests/modules/specs/awq/gpt_neox.yaml deleted file mode 100644 index ca93fd0f..00000000 --- a/tests/unit_tests/modules/specs/awq/gpt_neox.yaml +++ /dev/null @@ -1,175 +0,0 @@ -# Jinja2 template to validate GPT-NeoX model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 3 | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - awq: - type: group - pre_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 4 | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - awq: - type: group - pre_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 // group_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/awq/llama.yaml b/tests/unit_tests/modules/specs/awq/llama.yaml deleted file mode 100644 index 71984acc..00000000 --- a/tests/unit_tests/modules/specs/awq/llama.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# Jinja2 template to validate LLaMA model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - c_proj: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ ff_intermediate_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ ff_intermediate_size | int }} - c_gate: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ ff_intermediate_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size // group_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/awq/mistral.yaml b/tests/unit_tests/modules/specs/awq/mistral.yaml deleted file mode 100644 index 71984acc..00000000 --- a/tests/unit_tests/modules/specs/awq/mistral.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# Jinja2 template to validate LLaMA model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - c_proj: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ ff_intermediate_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ ff_intermediate_size | int }} - c_gate: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ ff_intermediate_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size // group_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/awq/mpt.yaml b/tests/unit_tests/modules/specs/awq/mpt.yaml deleted file mode 100644 index 9a1c736f..00000000 --- a/tests/unit_tests/modules/specs/awq/mpt.yaml +++ /dev/null @@ -1,137 +0,0 @@ -# Jinja2 template to validate MPT model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 3 | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - awq: - type: group - pre_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - awq: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 4 | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size // group_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - awq: - type: group - pre_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - scale:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 // group_size | int }} - - {{ hidden_size | int }} - zero:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 // group_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/lora/llama.yaml b/tests/unit_tests/modules/specs/lora/llama.yaml deleted file mode 100644 index 74d9de3c..00000000 --- a/tests/unit_tests/modules/specs/lora/llama.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Jinja2 template to validate Llama model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - lora: - type: group - query_A: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ lora_rank_dim | int }} - query_B: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ lora_rank_dim | int }} - - {{ num_heads * head_size | int }} - key_A: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ lora_rank_dim | int }} - key_B: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ lora_rank_dim | int }} - - {{ num_kv_heads * head_size | int }} - c_proj: - type: group - lora: - type: group - lora_A: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ num_heads * head_size | int }} - - {{ lora_rank_dim | int }} - lora_B: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ lora_rank_dim | int }} - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - lora: - type: group - lora_A: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ lora_rank_dim | int }} - lora_B: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ lora_rank_dim | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - lora: - type: group - lora_A: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ lora_rank_dim | int }} - lora_B: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ lora_rank_dim | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/lora/mpt.yaml b/tests/unit_tests/modules/specs/lora/mpt.yaml deleted file mode 100644 index 4bd1083d..00000000 --- a/tests/unit_tests/modules/specs/lora/mpt.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# Jinja2 template to validate MPT model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - lora: - type: group - lora_A: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ lora_rank_dim | int}} - lora_B: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ lora_rank_dim | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - diff --git a/tests/unit_tests/modules/specs/models/ phi_msft.yaml b/tests/unit_tests/modules/specs/models/ phi_msft.yaml deleted file mode 100644 index 15d7f42b..00000000 --- a/tests/unit_tests/modules/specs/models/ phi_msft.yaml +++ /dev/null @@ -1,111 +0,0 @@ -# Jinja2 template to validate phi-msft model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ((num_kv_heads * 2 + num_heads) * head_size | int )}} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int )}} - ln_1: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/blenderbot.yaml b/tests/unit_tests/modules/specs/models/blenderbot.yaml deleted file mode 100644 index 02dc7d93..00000000 --- a/tests/unit_tests/modules/specs/models/blenderbot.yaml +++ /dev/null @@ -1,243 +0,0 @@ -# Jinja2 template to validate Blenderbot model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - cross_attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 3 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ seq_len | int }} - - {{ hidden_size | int }} -encoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_encoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: {{ dtype 
}} - shape: - - {{ seq_len | int }} - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/bloom.yaml b/tests/unit_tests/modules/specs/models/bloom.yaml deleted file mode 100644 index cb5539f9..00000000 --- a/tests/unit_tests/modules/specs/models/bloom.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# Jinja2 template to validate Bloom model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -wte: - type: group - ln: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/codegen.yaml b/tests/unit_tests/modules/specs/models/codegen.yaml deleted file mode 100644 index 3e906ec4..00000000 --- a/tests/unit_tests/modules/specs/models/codegen.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Jinja2 template to validate Codegen model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_1: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/falcon.yaml b/tests/unit_tests/modules/specs/models/falcon.yaml deleted file mode 100644 index cb723f76..00000000 --- a/tests/unit_tests/modules/specs/models/falcon.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# Jinja2 template to validate Falcon model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/falcon_7b.yaml b/tests/unit_tests/modules/specs/models/falcon_7b.yaml deleted file mode 100644 index ca85b0a4..00000000 --- a/tests/unit_tests/modules/specs/models/falcon_7b.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# Jinja2 template to validate Falcon 7B model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_1: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/gpt.yaml b/tests/unit_tests/modules/specs/models/gpt.yaml deleted file mode 100644 index a8e6ff2e..00000000 --- a/tests/unit_tests/modules/specs/models/gpt.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# Jinja2 template to validate GPT model in Friendli format. 
- -type: group -decoder: - type: group - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ seq_len | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/gpt_j.yaml b/tests/unit_tests/modules/specs/models/gpt_j.yaml deleted file mode 100644 index 3417f790..00000000 --- a/tests/unit_tests/modules/specs/models/gpt_j.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Jinja2 template to validate GPT-J model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_1: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/gpt_neox.yaml b/tests/unit_tests/modules/specs/models/gpt_neox.yaml deleted file mode 100644 index 93341f45..00000000 --- a/tests/unit_tests/modules/specs/models/gpt_neox.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# Jinja2 template to validate GPT-NeoX model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/llama.yaml b/tests/unit_tests/modules/specs/models/llama.yaml deleted file mode 100644 index d0f2266e..00000000 --- a/tests/unit_tests/modules/specs/models/llama.yaml +++ /dev/null @@ -1,87 +0,0 @@ -# Jinja2 template to validate LLaMA model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_gate: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/mistral.yaml b/tests/unit_tests/modules/specs/models/mistral.yaml deleted file mode 100644 index d0f2266e..00000000 --- a/tests/unit_tests/modules/specs/models/mistral.yaml +++ /dev/null @@ -1,87 +0,0 @@ -# Jinja2 template to validate LLaMA model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_gate: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/mixtral.yaml b/tests/unit_tests/modules/specs/models/mixtral.yaml deleted file mode 100644 index d0d79b01..00000000 --- a/tests/unit_tests/modules/specs/models/mixtral.yaml +++ /dev/null @@ -1,102 +0,0 @@ -# Jinja2 template to validate LLaMA model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - moe: - type: group - router: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int}} - - {{ num_experts | int }} - '*': - type: repeat_group - range: - lo: 0 - hi: {{ num_experts - 1 | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_gate: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/mpt.yaml b/tests/unit_tests/modules/specs/models/mpt.yaml deleted file mode 100644 index 701c56d2..00000000 --- a/tests/unit_tests/modules/specs/models/mpt.yaml +++ /dev/null @@ -1,71 +0,0 @@ -# Jinja2 template to validate MPT model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/opt.yaml b/tests/unit_tests/modules/specs/models/opt.yaml deleted file mode 100644 index 2bc76839..00000000 --- a/tests/unit_tests/modules/specs/models/opt.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Jinja2 template to validate OPT model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ seq_len | int }} - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/t5.yaml b/tests/unit_tests/modules/specs/models/t5.yaml deleted file mode 100644 index 3f7b88fb..00000000 --- a/tests/unit_tests/modules/specs/models/t5.yaml +++ /dev/null @@ -1,165 +0,0 @@ -# Jinja2 template to validate T5 (t5-v1_1) model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - cross_attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 3 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: float32 - shape: - - {{ 32 | int }} - - {{ num_heads | int }} -encoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_encoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: float32 - shape: - - {{ 32 | int }} - - {{ num_heads | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/models/t5_v1_1.yaml b/tests/unit_tests/modules/specs/models/t5_v1_1.yaml deleted file mode 100644 index 3b99f73c..00000000 --- a/tests/unit_tests/modules/specs/models/t5_v1_1.yaml +++ /dev/null @@ -1,189 +0,0 @@ -# Jinja2 template to validate T5 (t5-v1_1) model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - cross_attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 3 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_gate: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: float32 - shape: - - {{ 32 | int }} - - {{ num_heads | int }} -encoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_encoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 3 | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_gate: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - c_proj: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: float32 - shape: - - {{ 32 | int }} - - {{ num_heads | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/bloom.yaml b/tests/unit_tests/modules/specs/smoothquant/bloom.yaml deleted file mode 100644 index c8a90282..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/bloom.yaml +++ /dev/null @@ -1,215 +0,0 @@ -# Jinja2 template to validate Bloom model in Friendli format. 
- -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - smoothquant: - type: group - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -wte: - type: group - ln: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git 
a/tests/unit_tests/modules/specs/smoothquant/codegen.yaml b/tests/unit_tests/modules/specs/smoothquant/codegen.yaml deleted file mode 100644 index 87013ae5..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/codegen.yaml +++ /dev/null @@ -1,215 +0,0 @@ -# Jinja2 template to validate Codegen model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - c_proj: - type: group - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_1: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - ln_2: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - bias:0: - type: data - dtype: 
{{ dtype }} - shape: - - {{ vocab_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/falcon.yaml b/tests/unit_tests/modules/specs/smoothquant/falcon.yaml deleted file mode 100644 index 91b828ac..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/falcon.yaml +++ /dev/null @@ -1,191 +0,0 @@ -# Jinja2 template to validate Falcon model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - c_proj: - type: group - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - smoothquant: - type: group - {% if ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - 
- {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/falcon_7b.yaml b/tests/unit_tests/modules/specs/smoothquant/falcon_7b.yaml deleted file mode 100644 index 0570e118..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/falcon_7b.yaml +++ /dev/null @@ -1,188 +0,0 @@ -# Jinja2 template to validate Falcon 7B model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ num_heads * head_size | int }} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ num_kv_heads * head_size | int }} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ num_kv_heads * head_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_1: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - smoothquant: - type: group - {% if ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} 
-wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/gpt.yaml b/tests/unit_tests/modules/specs/smoothquant/gpt.yaml deleted file mode 100644 index a57b3952..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/gpt.yaml +++ /dev/null @@ -1,211 +0,0 @@ -# Jinja2 template to validate GPT model in Friendli format. - -type: group -decoder: - type: group - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - 
in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - wpe: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ seq_len | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/gpt_j.yaml b/tests/unit_tests/modules/specs/smoothquant/gpt_j.yaml deleted file mode 100644 index dad7a61e..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/gpt_j.yaml +++ /dev/null @@ -1,215 +0,0 @@ -# Jinja2 template to validate GPT-J model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - c_proj: - type: group - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_1: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - ln_2: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ 
hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/gpt_neox.yaml b/tests/unit_tests/modules/specs/smoothquant/gpt_neox.yaml deleted file mode 100644 index 08230d20..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/gpt_neox.yaml +++ /dev/null @@ -1,211 +0,0 @@ -# Jinja2 template to validate GPT-NeoX model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - 
shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/llama.yaml b/tests/unit_tests/modules/specs/smoothquant/llama.yaml deleted file mode 100644 index f29d2f4e..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/llama.yaml +++ /dev/null @@ -1,206 +0,0 @@ -# Jinja2 template to validate LLaMA model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ (num_kv_heads * 2 + num_heads) * head_size | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ num_heads * head_size | int }} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ num_kv_heads * head_size | int}} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ num_kv_heads * head_size | int}} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - c_proj: - type: group - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ ff_intermediate_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_gate: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ ff_intermediate_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ 
hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ ff_intermediate_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - smoothquant: - type: group - {% if ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ ff_intermediate_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ ff_intermediate_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ ff_intermediate_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ ff_intermediate_size | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/mpt.yaml b/tests/unit_tests/modules/specs/smoothquant/mpt.yaml deleted file mode 100644 index 1d520f2c..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/mpt.yaml +++ /dev/null @@ -1,173 +0,0 @@ -# Jinja2 template to validate MPT model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - c_proj: - type: group - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - smoothquant: - type: group - {% if 
ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - ln_f: - type: group - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/specs/smoothquant/opt.yaml b/tests/unit_tests/modules/specs/smoothquant/opt.yaml deleted file mode 100644 index cb76b1f8..00000000 --- a/tests/unit_tests/modules/specs/smoothquant/opt.yaml +++ /dev/null @@ -1,219 +0,0 @@ -# Jinja2 template to validate OPT model in Friendli format. - -type: group -decoder: - type: group - h_._*: - type: repeat_group - range: - lo: 0 - hi: {{ num_decoder_layers - 1 | int }} - attn: - type: group - c_attn: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 3 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 3 | int }} - - {{ hidden_size | int }} - q_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - q_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - k_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - v_out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int}} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if attn_fc_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - ln_*: - type: repeat_group - range: - lo: 1 - hi: 2 - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - mlp: - type: group - c_fc: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size * 4 | int }} - smoothquant: - type: group - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size * 4 | int }} - - {{ hidden_size | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - c_proj: - type: group - bias:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - smoothquant: - type: group - {% if 
ff2_smoothing %} - smoothing_vector:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - {% endif %} - weight:0: - type: data - dtype: {{ q_dtype }} - shape: - - {{ hidden_size | int }} - - {{ hidden_size * 4 | int }} - weight_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - out_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size | int }} - in_scale:0: - type: data - dtype: float32 - shape: - - {{ hidden_size * 4 | int }} - ln_f: - type: group - beta:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - gamma:0: - type: data - dtype: {{ dtype }} - shape: - - {{ hidden_size | int }} - wpe: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ seq_len | int }} - - {{ hidden_size | int }} -head_fc: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} -wte: - type: group - weight:0: - type: data - dtype: {{ dtype }} - shape: - - {{ vocab_size | int }} - - {{ hidden_size | int }} diff --git a/tests/unit_tests/modules/test_awq.py b/tests/unit_tests/modules/test_awq.py deleted file mode 100644 index 6123b159..00000000 --- a/tests/unit_tests/modules/test_awq.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -from __future__ import annotations - -from typing import Any, Dict - -import pytest - -from friendli.modules.converter.base import OneOfConverter -from friendli.modules.converter.utils import get_tensor_from_state_dict -from friendli.modules.quantizer.maps import get_quantized_converter -from friendli.modules.quantizer.schema.config import AWQConfig - -from tests.unit_tests.modules.conftest import model_name_config_map -from tests.unit_tests.modules.helpers.utils import ( - AWQModelConfig, - get_awq_quantized_meta_model, - get_numpy_data_type, - get_param_specs, -) - -awq_models = ["gpt_j", "gpt_neox", "llama", "mpt", "mistral"] -awq_model_name_config_map = {} -for model_name, model_config in model_name_config_map.items(): - if model_name in awq_models: - awq_model_name_config_map[model_name] = model_config - - -@pytest.fixture -def quant_config() -> AWQConfig: - return AWQConfig() - - -@pytest.fixture -def render_awq_model_config( - converter: OneOfConverter, quant_config: AWQConfig -) -> AWQModelConfig: - return AWQModelConfig( - dtype="float16", - num_decoder_layers=converter.decoder_layer_num, - hidden_size=converter.decoder_hidden_size, - num_heads=converter.decoder_num_attention_heads, - num_kv_heads=converter.decoder_num_kv_attention_heads, - head_size=converter.decoder_head_size, - num_encoder_layers=converter.decoder_layer_num, # same as decoder for test - ff_intermediate_size=converter.decoder_ff_intermediate_size, - group_size=quant_config.awq_args.quant_group_size, - q_dtype="int8", - ) - - -@pytest.fixture -def awq_spec_data( - model_name: str, render_awq_model_config: AWQModelConfig -) -> Dict[str, Any]: - param_specs = get_param_specs(model_name, "awq", render_awq_model_config) - return param_specs - - -@pytest.mark.parametrize( - "model_config", - awq_model_name_config_map.values(), -) -def test_convert_info_list_match_hf_state_dict( - converter: OneOfConverter, quant_config: AWQConfig -): - quantizer = get_quantized_converter(quant_config, converter) - convert_info_list = quantizer.get_convert_info_list() - assert len(convert_info_list) != 0 - quantized_model = get_awq_quantized_meta_model( - converter.config, quantizer, quant_config - ) - 
state_dict = quantized_model.state_dict() - for convert_info in convert_info_list: - param_names = convert_info.param_names - for param_name in param_names: - assert param_name in state_dict - - -@pytest.mark.parametrize( - "model_name, model_config", - awq_model_name_config_map.items(), -) -def test_quantized_model_match_spec( - converter: OneOfConverter, awq_spec_data: Dict[str, Any], quant_config: AWQConfig -): - quantizer = get_quantized_converter(quant_config, converter) - quantized_model = get_awq_quantized_meta_model( - converter.config, quantizer, quant_config - ) - state_dict = quantized_model.state_dict() - convert_info_list = quantizer.get_convert_info_list() - for convert_info in convert_info_list: - converted_name, reshape_fn, param_names, data_type = ( - convert_info.converted_name, - convert_info.reshape_fn, - convert_info.param_names, - convert_info.data_type, - ) - assert awq_spec_data[converted_name].dtype == get_numpy_data_type( - data_type - ), f"data type mismatch for {converted_name}: {param_names}" - params = [ - get_tensor_from_state_dict(state_dict, param_name) - for param_name in param_names - ] - reshaped_tensor = reshape_fn(params) - assert ( - awq_spec_data[converted_name].shape == reshaped_tensor.shape - ), f"shape mismatch for {converted_name}: {param_names}" diff --git a/tests/unit_tests/modules/test_converter.py b/tests/unit_tests/modules/test_converter.py deleted file mode 100644 index 3214c533..00000000 --- a/tests/unit_tests/modules/test_converter.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -from __future__ import annotations - -from typing import Any, Dict - -import pytest - -from friendli.modules.converter.base import OneOfConverter -from friendli.modules.converter.utils import get_tensor_from_state_dict - -from tests.unit_tests.modules.conftest import model_name_config_map -from tests.unit_tests.modules.helpers.utils import get_meta_model, get_numpy_data_type - - -@pytest.mark.parametrize( - "model_config", - model_name_config_map.values(), -) -def test_convert_info_list_match_hf_state_dict(converter: OneOfConverter): - convert_info_list = converter.get_convert_info_list() - assert len(convert_info_list) != 0 - model = get_meta_model(converter.config) - state_dict = model.state_dict() - for convert_info in convert_info_list: - param_names = convert_info.param_names - for param_name in param_names: - assert param_name in state_dict - - -@pytest.mark.parametrize( - "model_name, model_config", - model_name_config_map.items(), -) -def test_convert_info_list_match_spec( - converter: OneOfConverter, spec_data: Dict[str, Any] -): - convert_info_list = converter.get_convert_info_list() - assert len(convert_info_list) != 0 - converted_param_names = set() - for convert_info in convert_info_list: - converted_param_names.add(convert_info.converted_name) - - spec_converted_param_names = set(spec_data.keys()) - assert converted_param_names == spec_converted_param_names - - -@pytest.mark.parametrize( - "model_name, model_config", - model_name_config_map.items(), -) -def test_reshape_fn_match_spec(converter: OneOfConverter, spec_data: Dict[str, Any]): - convert_info_list = converter.get_convert_info_list() - model = get_meta_model(converter.config) - state_dict = model.state_dict() - for convert_info in convert_info_list: - converted_name, reshape_fn, param_names, data_type = ( - convert_info.converted_name, - convert_info.reshape_fn, - convert_info.param_names, - convert_info.data_type, - ) - assert 
spec_data[converted_name].dtype == get_numpy_data_type( - data_type - ), f"data type mismatch for {converted_name}: {param_names}" - params = [ - get_tensor_from_state_dict(state_dict, param_name) - for param_name in param_names - ] - reshaped_tensor = reshape_fn(params) - assert ( - spec_data[converted_name].shape == reshaped_tensor.shape - ), f"shape mismatch for {converted_name}: {param_names}" diff --git a/tests/unit_tests/modules/test_lora_adapter_converter.py b/tests/unit_tests/modules/test_lora_adapter_converter.py deleted file mode 100644 index e1626d09..00000000 --- a/tests/unit_tests/modules/test_lora_adapter_converter.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -from __future__ import annotations - -from typing import Any, Dict, cast - -import pytest -from peft import LoraConfig - -from friendli.modules.converter.base import DecoderOnlyConverter, OneOfConverter -from friendli.modules.converter.maps import get_adapter_converter_factory -from friendli.modules.converter.utils import get_model_arch, get_tensor_from_state_dict - -from tests.unit_tests.modules.conftest import model_name_config_map -from tests.unit_tests.modules.helpers.utils import ( - LoraAdapterConfig, - get_meta_model_with_adapter, - get_numpy_data_type, - get_param_specs, -) - -model_with_adpater = ["mpt", "llama"] -model_with_adpater_name_config_map = {} -for model_name, model_config in model_name_config_map.items(): - if model_name in model_with_adpater: - model_with_adpater_name_config_map[model_name] = model_config - - -@pytest.fixture -def adapter_config(converter: OneOfConverter) -> LoraConfig: - model_type = cast(DecoderOnlyConverter, converter).config.model_type - if model_type == "mpt": - return LoraConfig(target_modules=["Wqkv"]) - elif model_type == "llama": - return LoraConfig( - target_modules=["q_proj", "k_proj", "o_proj", "up_proj", "down_proj"] - ) - return LoraConfig() - - -@pytest.fixture -def render_lora_adapter_config( - converter: OneOfConverter, adapter_config: LoraConfig -) -> LoraAdapterConfig: - return LoraAdapterConfig( - dtype="float16", - num_decoder_layers=converter.decoder_layer_num, - hidden_size=converter.decoder_hidden_size, - num_heads=converter.decoder_num_attention_heads, - num_kv_heads=converter.decoder_num_kv_attention_heads, - head_size=converter.decoder_head_size, - num_encoder_layers=converter.decoder_layer_num, # same as decoder for test - ff_intermediate_size=converter.decoder_ff_intermediate_size, - lora_rank_dim=adapter_config.r, - ) - - -@pytest.fixture -def lora_spec_data( - model_name: str, render_lora_adapter_config: LoraAdapterConfig -) -> Dict[str, Any]: - param_specs = get_param_specs(model_name, "lora", render_lora_adapter_config) - return param_specs - - -@pytest.mark.parametrize( - "model_config", - model_with_adpater_name_config_map.values(), -) -def test_convert_info_list_match_hf_state_dict( - converter: OneOfConverter, - adapter_config: LoraConfig, -): - model_arch = get_model_arch(converter.config) - adapter_converter_cls = get_adapter_converter_factory(model_arch) - adapter_converter = adapter_converter_cls(converter, adapter_config) - - convert_info_list = adapter_converter.get_convert_info_list() - model_with_adapter = get_meta_model_with_adapter( - adapter_converter.converter.config, adapter_converter.adapter_config - ) - state_dict = model_with_adapter.state_dict() - for convert_info in convert_info_list: - param_names = convert_info.param_names - for param_name in param_names: - assert 
param_name in state_dict - - -@pytest.mark.parametrize( - "model_name, model_config", - model_with_adpater_name_config_map.items(), -) -def test_model_with_lora_match_spec( - converter: OneOfConverter, - lora_spec_data: Dict[str, Any], - adapter_config: LoraConfig, -): - model_arch = get_model_arch(converter.config) - adapter_converter_cls = get_adapter_converter_factory(model_arch) - adapter_converter = adapter_converter_cls(converter, adapter_config) - - convert_info_list = adapter_converter.get_convert_info_list() - model_with_adapter = get_meta_model_with_adapter( - adapter_converter.converter.config, adapter_converter.adapter_config - ) - state_dict = model_with_adapter.state_dict() - for convert_info in convert_info_list: - converted_name, reshape_fn, param_names, data_type = ( - convert_info.converted_name, - convert_info.reshape_fn, - convert_info.param_names, - convert_info.data_type, - ) - assert lora_spec_data[converted_name].dtype == get_numpy_data_type( - data_type - ), f"data type mismatch for {converted_name}: {param_names}" - params = [ - get_tensor_from_state_dict(state_dict, param_name) - for param_name in param_names - ] - reshaped_tensor = reshape_fn(params) - assert ( - lora_spec_data[converted_name].shape == reshaped_tensor.shape - ), f"shape mismatch for {converted_name}: {param_names}" diff --git a/tests/unit_tests/modules/test_smoothquant.py b/tests/unit_tests/modules/test_smoothquant.py deleted file mode 100644 index 06904bc5..00000000 --- a/tests/unit_tests/modules/test_smoothquant.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2024-present, FriendliAI Inc. All rights reserved. - -# # Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. - -# from __future__ import annotations - -# from typing import Any, Dict - -# import pytest - -# from friendli.modules.converter.base import OneOfConverter -# from friendli.modules.converter.utils import get_tensor_from_state_dict -# from friendli.modules.quantizer.maps import get_quantized_converter -# from friendli.modules.quantizer.schema.config import SmoothQuantArgs, SmoothQuantConfig - -# from tests.unit_tests.modules.conftest import model_name_config_map -# from tests.unit_tests.modules.helpers.utils import ( -# SmoothQuantModelConfig, -# get_numpy_data_type, -# get_param_specs, -# get_smoothquant_quantized_meta_model, -# ) - -# smoothquant_models = [ -# "bloom", -# "codegen", -# "falcon", -# "falcon_7b", -# "gpt_j", -# "gpt_neox", -# "llama", -# "mpt", -# "opt", -# ] -# smoothquant_model_name_config_map = {} -# for model_name, model_config in model_name_config_map.items(): -# if model_name in smoothquant_models: -# smoothquant_model_name_config_map[model_name] = model_config - - -# @pytest.fixture -# def quant_config() -> SmoothQuantConfig: -# return SmoothQuantConfig( -# smoothquant_args=SmoothQuantArgs( -# attn_fc_smoothing=True, -# ff2_smoothing=True, -# ) -# ) - - -# @pytest.fixture -# def render_smoothquant_model_config( -# converter: OneOfConverter, quant_config: SmoothQuantConfig -# ) -> SmoothQuantModelConfig: -# return SmoothQuantModelConfig( -# dtype="float16", -# num_decoder_layers=converter.decoder_layer_num, -# hidden_size=converter.decoder_hidden_size, -# num_heads=converter.decoder_num_attention_heads, -# num_kv_heads=converter.decoder_num_kv_attention_heads, -# head_size=converter.decoder_head_size, -# num_encoder_layers=converter.decoder_layer_num, # same as decoder for test -# ff_intermediate_size=converter.decoder_ff_intermediate_size, -# 
attn_fc_smoothing=quant_config.smoothquant_args.attn_fc_smoothing, -# ff2_smoothing=quant_config.smoothquant_args.ff2_smoothing, -# q_dtype="int8", -# ) - - -# @pytest.fixture -# def smoothquant_spec_data( -# model_name: str, render_smoothquant_model_config: SmoothQuantModelConfig -# ) -> Dict[str, Any]: -# param_specs = get_param_specs( -# model_name, "smoothquant", render_smoothquant_model_config -# ) -# return param_specs - - -# @pytest.mark.parametrize( -# "model_config", -# smoothquant_model_name_config_map.values(), -# ) -# def test_convert_info_list_match_hf_state_dict( -# converter: OneOfConverter, quant_config: SmoothQuantConfig -# ): -# quantizer = get_quantized_converter(quant_config, converter) -# convert_info_list = quantizer.get_convert_info_list() -# assert len(convert_info_list) != 0 -# quantized_model = get_smoothquant_quantized_meta_model(converter.config, quantizer) -# state_dict = quantized_model.state_dict() -# for convert_info in convert_info_list: -# param_names = convert_info.param_names -# for param_name in param_names: -# assert param_name in state_dict - - -# @pytest.mark.parametrize( -# "model_name, model_config", -# smoothquant_model_name_config_map.items(), -# ) -# def test_quantized_model_match_spec( -# converter: OneOfConverter, -# smoothquant_spec_data: Dict[str, Any], -# quant_config: SmoothQuantConfig, -# ): -# quantizer = get_quantized_converter(quant_config, converter) -# quantized_model = get_smoothquant_quantized_meta_model(converter.config, quantizer) -# state_dict = quantized_model.state_dict() -# convert_info_list = quantizer.get_convert_info_list() -# for convert_info in convert_info_list: -# converted_name, reshape_fn, param_names, data_type = ( -# convert_info.converted_name, -# convert_info.reshape_fn, -# convert_info.param_names, -# convert_info.data_type, -# ) -# assert smoothquant_spec_data[converted_name].dtype == get_numpy_data_type( -# data_type -# ), f"data type mismatch for {converted_name}: {param_names}" -# params = [ -# get_tensor_from_state_dict(state_dict, param_name) -# for param_name in param_names -# ] -# reshaped_tensor = reshape_fn(params) -# assert ( -# smoothquant_spec_data[converted_name].shape == reshaped_tensor.shape -# ), f"shape mismatch for {converted_name}: {param_names}"
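For reference, the spec files removed above are Jinja2 templates that render into a nested YAML tree of `group`, `repeat_group`, and `data` nodes describing the expected dtype and shape of every converted parameter. Below is a minimal, hypothetical sketch (not part of this diff) of how such a template can be rendered and flattened into per-parameter expectations; the inline TEMPLATE and the flatten() helper are invented for illustration only, whereas the removed tests relied on project helpers such as get_param_specs from tests/unit_tests/modules/helpers/utils.py. Only standard jinja2 and PyYAML calls are assumed.

# Hypothetical sketch: render a parameter-spec template and flatten it into
# {param_name: (dtype, shape)} expectations. TEMPLATE and flatten() are
# illustrative stand-ins, not code from this repository.
from typing import Dict, Tuple

import jinja2
import yaml

TEMPLATE = """\
type: group
decoder:
  type: group
  h_._*:
    type: repeat_group
    range:
      lo: 0
      hi: {{ num_decoder_layers - 1 | int }}
    ln_1:
      type: group
      gamma:0:
        type: data
        dtype: {{ dtype }}
        shape:
        - {{ hidden_size | int }}
wte:
  type: group
  weight:0:
    type: data
    dtype: {{ dtype }}
    shape:
    - {{ vocab_size | int }}
    - {{ hidden_size | int }}
"""


def flatten(name: str, node: dict, out: Dict[str, Tuple[str, tuple]]) -> None:
    """Walk the rendered spec tree, expanding repeat_group indices over '*'."""
    node_type = node.get("type")
    if node_type == "data":
        # Leaf: record the expected dtype and shape for this parameter name.
        out[name] = (node["dtype"], tuple(node["shape"]))
        return
    if node_type == "repeat_group":
        # Repeat the child entries once per index, substituting '*' in the name.
        lo, hi = node["range"]["lo"], node["range"]["hi"]
        for i in range(lo, hi + 1):
            for key, child in node.items():
                if key in ("type", "range"):
                    continue
                flatten(f"{name.replace('*', str(i))}/{key}", child, out)
        return
    # Plain group node: recurse into every child mapping.
    for key, child in node.items():
        if key == "type":
            continue
        flatten(f"{name}/{key}" if name else key, child, out)


rendered = jinja2.Template(TEMPLATE).render(
    dtype="float16", num_decoder_layers=2, hidden_size=8, vocab_size=32
)
expected: Dict[str, Tuple[str, tuple]] = {}
flatten("", yaml.safe_load(rendered), expected)
assert expected["decoder/h_._0/ln_1/gamma:0"] == ("float16", (8,))
assert expected["wte/weight:0"] == ("float16", (32, 8))

A spec-based test would then compare each expected dtype and shape against the tensors actually produced for the converted parameter names, which is what the deleted test_converter.py and test_awq.py did through get_convert_info_list() and each entry's reshape_fn.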