diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
deleted file mode 100644
index 0c28948fa48..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
+++ /dev/null
@@ -1,82 +0,0 @@
-
-Step-by-Step
-============
-This document describes the step-by-step instructions to run [VLM quantization for Llava](https://huggingface.co/liuhaotian/llava-v1.5-7b) using AutoRound Quantization.
-
-# Run Quantization on Multimodal Models
-
-In this example, we introduce an straight-forward way to execute quantization on some popular multimodal models such as LLaVA. 
-
-Please note that LLAVA quantized model is currently only support inference with **auto_round** format.
-
-## Install
-If you are not using Linux, do NOT proceed, see instructions for [macOS](https://github.com/haotian-liu/LLaVA/blob/main/docs/macOS.md) and [Windows](https://github.com/haotian-liu/LLaVA/blob/main/docs/Windows.md).
-
-1. Clone this repository and navigate to LLaVA folder
-```shell
-git clone https://github.com/haotian-liu/LLaVA.git
-cd LLaVA
-```
-
-2. Install Package
-```
-pip install --upgrade pip  # enable PEP 660 support
-pip install -e .
-```
-
-## Download the calibration/Evaluation data
-
-Our calibration process resembles the official visual instruction tuning process. To align the official implementation of [LLaVA](https://github.com/haotian-liu/LLaVA/tree/main?tab=readme-ov-file#visual-instruction-tuning)
-
-Please download the annotation of the final mixture our instruction tuning data [llava_v1_5_mix665k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json), and download the images from constituting datasets:
-
-COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip the image folder to any directory you desire.
-
-Please refer to [llava_eval_datasets](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md#scripts) to download the textVQA dataset for evaluation usage
-
-<br />
-
-## 2. Run Examples
-Enter into the examples folder and install requirements
-
-```bash
-pip install -r requirements.txt
-```
-
-- **Default Settings:**
-```bash
-CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name liuhaotian/llava-v1.5-7b  --bits 4 --group_size 128 --quantize
-```
-
-## 3. Results
-Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. When the vision components are not involved in quantization, it is able to achieve accuracy loss within 1%. The results for fake quantized LLava-7b are as follows:
-| Model | Config | Precision | Hyperparameter | Accuracy% | Relative drop |
-|  :----: | :----: | :----: | :----: | :----: | :----: |
-| liuhaotian/llava-v1.5-7b | - | FP16 | - | 58.21 | - |
-| liuhaotian/llava-v1.5-7b | W4G128 | FP16 | with vision | 56.39 | -3.13% |
-| liuhaotian/llava-v1.5-7b | W4G128 | FP16 | w/o vision | 58.08 | -0.22% |
-
-
-## 4. Known Issues
-* huggingface format model is not support yet, e.g. llava-1.5-7b-hf
-* Setting seqlen to 2048 is not working yet.
-
-
-## 5. Environment
-
-PyTorch 1.8 or higher version is needed
-
-
-## Reference
-If you find SignRound useful for your research, please cite our paper:
-```bash
-@article{cheng2023optimize,
-  title={Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs},
-  author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao},
-  journal={arXiv preprint arXiv:2309.05516},
-  year={2023}
-}
-```
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
deleted file mode 100644
index ca97a8f3797..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
+++ /dev/null
@@ -1,377 +0,0 @@
-import argparse
-parser = argparse.ArgumentParser()
-import torch
-import os
-import transformers
-# # os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-# torch.use_deterministic_algorithms(True, warn_only=True)
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
-from transformers import set_seed
-
-import re
-
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-import copy
-from PIL import Image
-import json
-from torch.utils.data import Dataset, DataLoader
-from llava.mm_utils import get_model_name_from_path
-from llava.train.train import preprocess, preprocess_multimodal, DataCollatorForSupervisedDataset
-from llava.model.builder import load_pretrained_model
-from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                    get_layer_names_in_block,
-                                                    detect_device,
-                                                    run_fn_for_vlm_autoround
-                                                    )
-from neural_compressor.torch.quantization import (AutoRoundConfig,
-                                                    prepare,
-                                                    convert,
-                                                    load)
-
-
-def save_tower(model, save_path, quant_vision: bool = False, max_shard_size: str = "5GB", safe_serialization: bool = True):
-    if not quant_vision:
-        print("Won't save vision_tower since this part was not quantized.")
-        return
-    ori_path = save_path
-    ori_tower_name = model.get_vision_tower().vision_tower_name
-    vision_tower = model.get_vision_tower().vision_tower
-    save_path = f'{save_path}-vision_tower'
-    os.makedirs(save_path, exist_ok=True)
-    quantization_config = model.config.quantization_config
-    redundant_prefix = "model.vision_tower.vision_tower."
-    org_block_list = copy.deepcopy(quantization_config['quant_block_list'])
-    # prepare vision_tower quantize list
-    quant_block_list = [element.split(redundant_prefix)[1] if redundant_prefix in element else "" \
-                        for sublist in org_block_list for element in sublist]
-    quant_block_list = [[element for element in quant_block_list if element != ""]]
-    quantization_config['quant_block_list'] = quant_block_list
-    if hasattr(vision_tower, "config"):
-        from transformers import AutoProcessor
-        processor = AutoProcessor.from_pretrained(ori_tower_name)
-        processor.save_pretrained(save_path)
-        vision_tower.config.quantization_config = quantization_config
-        vision_tower.config.save_pretrained(save_path)
-    vision_tower.save_pretrained(save_path, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
-    # prepare llava model quantize list
-    quant_block_list = [element if redundant_prefix not in element else "" \
-                        for sublist in org_block_list for element in sublist]
-    quant_block_list = [[element for element in quant_block_list if element != ""]]
-    quantization_config['quant_block_list'] = quant_block_list
-    model.config.mm_vision_tower = save_path
-    model.config.save_pretrained(ori_path)
-    
-
-class CustomDataset(Dataset): # for llava tuning
-    # much refer to https://github.com/haotian-liu/LLaVA/blob/main/llava/train/train.py
-    def __init__(self, list_data_dict, image_folder, tokenizer, image_processor, args):
-        self.list_data_dict = list_data_dict
-        self.image_folder = image_folder
-        self.tokenizer = tokenizer
-        self.image_processor = image_processor
-        self.args = args
-        self.args.is_multimodal = args.is_multimodal
-
-    def __getitem__(self, index):
-        sources = self.list_data_dict[index]
-        # image = None
-        image_file = os.path.basename(sources["image"])
-        try:
-            image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
-            image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
-        except Exception as error:
-            print(f"{error}, skipped by set image to None")
-            image = None
-        sources = preprocess_multimodal(
-            copy.deepcopy([sources["conversations"]]), # a list
-            self.args,
-        )
-        data_dict = preprocess(
-            sources,
-            self.tokenizer,
-            has_image=('image' in self.list_data_dict[index]),
-        )
-        if isinstance(index, int):
-            data_dict = dict(input_ids=data_dict["input_ids"][0],
-                            labels=data_dict["labels"][0])
-        # image exist in the data
-        data_dict['image'] = image
-        return data_dict
-
-    def __len__(self):
-        return len(self.list_data_dict)
-
-
-def create_data_loader(dataset, batch_size=1, data_collator=None):
-    assert batch_size == 1, "batch_size must be 1"
-    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)
-    return data_loader
-
-if __name__ == '__main__':
-
-    parser.add_argument("--model_name", default="liuhaotian/llava-v1.5-7b")
-    
-    parser.add_argument("--quantize", action="store_true")
-    
-    parser.add_argument("--accuracy", action="store_true")
-
-    parser.add_argument("--bits", default=4, type=int,
-                        help="number of  bits")
-
-    parser.add_argument("--group_size", default=128, type=int,
-                        help="group size")
-
-    parser.add_argument("--train_bs", default=1, type=int,
-                        help="train batch size")
-
-    parser.add_argument("--eval_bs", default=4, type=int,
-                        help="eval batch size")
-
-    parser.add_argument("--device", default="auto", type=str,
-                        help="The device to be used for tuning. The default is set to auto/None,"
-                             "allowing for automatic detection. Currently, device settings support CPU, GPU, and HPU.")
-
-    parser.add_argument("--sym", action='store_true',
-                        help=" sym quantization")
-
-    parser.add_argument("--iters", default=200, type=int,
-                        help=" iters")
-
-    parser.add_argument("--lr", default=None, type=float,
-                        help="learning rate, if None, it will be set to 1.0/iters automatically")
-
-    parser.add_argument("--minmax_lr", default=None, type=float,
-                        help="minmax learning rate, if None,it will beset to be the same with lr")
-
-    parser.add_argument("--seed", default=42, type=int,
-                        help="seed")
-
-    parser.add_argument("--eval_fp16_baseline", action='store_true',
-                        help="whether to eval FP16 baseline")
-
-    parser.add_argument("--adam", action='store_true',
-                        help="adam")
-
-    parser.add_argument("--seqlen", default=512, type=int,
-                        help="sequence length")
-
-    parser.add_argument("--gradient_accumulate_steps", default=1, type=int, help="gradient accumulate steps")
-
-    parser.add_argument("--nblocks", default=1, type=int, help="num of blocks to tune together")
-
-    parser.add_argument("--nsamples", default=512, type=int,
-                        help="number of samples")
-
-    parser.add_argument("--low_gpu_mem_usage", action='store_true',
-                        help="low_gpu_mem_usage is deprecated")
-
-    parser.add_argument("--export_format", default='auto_round:gptq', type=str,
-                        help="targeted inference acceleration platform,The options are 'fake', 'cpu', 'gpu', 'xpu' and 'auto_round'."
-                             "default to 'fake', indicating that it only performs fake quantization and won't be exported to any device.")
-
-    parser.add_argument("--scale_dtype", default='fp16',
-                        help="which scale data type to use for quantization, 'fp16', 'fp32' or 'bf16'.")
-
-    parser.add_argument("--output_dir", default="./tmp_autoround", type=str,
-                        help="Where to store the final model.")
-
-    parser.add_argument("--disable_eval", action='store_true',
-                        help="Whether to do lmeval evaluation.")
-
-    parser.add_argument("--disable_amp", action='store_true',
-                        help="disable amp")
-
-    parser.add_argument("--disable_minmax_tuning", action='store_true',
-                        help="whether disable  enable weight minmax tuning")
-
-    parser.add_argument("--disable_trust_remote_code", action='store_true',
-                        help="Whether to disable trust_remote_code")
-
-    parser.add_argument("--disable_quanted_input", action='store_true',
-                        help="whether to disuse the output of quantized block to tune the next block")
-
-    parser.add_argument("--quant_lm_head", action='store_true',
-                        help="quant_lm_head")
-
-    parser.add_argument("--model_dtype", default=None, type=str,
-                        help="force to convert the dtype, some backends supports fp16 dtype better")
-    
-    parser.add_argument("--act_bits", default=32, type=int,
-                    help="activation bits")
-    
-    parser.add_argument("--is_multimodal", type=bool, default=True,
-                        help="To determine whether the preprocessing should handle multimodal infomations.")
-    
-    parser.add_argument("--quant_vision", action='store_true',
-                        help="To determine whether the quantization should handle vision component.")
-    
-    # ========== Calibration Datasets ============= 
-    parser.add_argument("--mm-use-im-start-end", type=bool, default=False)
-    
-    parser.add_argument("--image_folder", default="coco", type=str,
-                        help="The dataset for quantization training. It can be a custom one.")
-    
-    parser.add_argument("--question_file", default=None, type=str,
-                            help="The dataset for quantization training. It can be a custom one.")
-    
-    # ================= Evaluation Related =====================
-    parser.add_argument("--eval_question_file", type=str, default="tables/question.jsonl")
-    
-    parser.add_argument("--eval_image_folder", type=str)
-    
-    parser.add_argument('--eval_result_file', type=str, default="./tmp_results")
-    
-    parser.add_argument('--eval_annotation_file', type=str)
-
-    args = parser.parse_args()
-
-    if args.quantize:
-        set_seed(args.seed)
-
-        if args.act_bits <= 8:
-            print(
-                "Warning, activation quantization is an experiment feature")
-        
-        if args.act_bits <= 8 and args.export_format != "fake":
-            assert False, "only support fake mode for activation quantization currently"
-            
-        if "marlin" in args.export_format and args.sym == False:
-            assert False, "marlin backend only supports sym quantization, please set --sym"
-            
-        model_name = args.model_name
-        if model_name[-1] == "/":
-            model_name = model_name[:-1]
-        print(model_name, flush=True)
-
-        device_str = detect_device(args.device)
-        torch_dtype = "auto"
-        torch_device = torch.device(device_str)
-        model_path = args.model_name
-        model_name = get_model_name_from_path(model_path)
-        tokenizer, model, image_processor, _ = load_pretrained_model(model_path, model_base=None, model_name=model_name,
-                torch_dtype=torch_dtype)
-
-        model = model.eval()
-
-        if args.model_dtype != None:
-            if args.model_dtype == "float16" or args.model_dtype == "fp16":
-                model = model.to(torch.float16)
-            if args.model_dtype == "bfloat16" or args.model_dtype == "bfp16":
-                model = model.to(torch.bfloat16)
-                
-        seqlen = args.seqlen
-        if hasattr(tokenizer, "model_max_length"):
-            if tokenizer.model_max_length < seqlen:
-                print(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length",
-                    flush=True)
-                seqlen = min(seqlen, tokenizer.model_max_length)
-                args.seqlen = seqlen
-
-        excel_name = f"{model_name}_{args.bits}_{args.group_size}"
-        pt_dtype = torch.float16
-        if (hasattr(model, 'config') and (model.dtype is torch.bfloat16 or model.config.torch_dtype is torch.bfloat16)):
-            dtype = 'bfloat16'
-            pt_dtype = torch.bfloat16
-        else:
-            if str(args.device) != "cpu":
-                pt_dtype = torch.float16
-                dtype = 'float16'
-            else:
-                pt_dtype = torch.float32
-                dtype = 'float32'
-
-        questions = json.load(open(args.question_file, "r"))
-        dataset = CustomDataset(questions, args.image_folder, tokenizer, image_processor, args=args)
-        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
-        dataloader = create_data_loader(dataset, args.train_bs, data_collator)
-
-        quant_block_list = get_multimodal_block_names(model, args.quant_vision)
-            
-        quant_config = AutoRoundConfig(bits=args.bits, use_sym=args.sym, batch_size=args.train_bs, group_size=args.group_size,
-                            seqlen=seqlen, nblocks=args.nblocks, iters=args.iters, lr=args.lr,
-                            minmax_lr=args.minmax_lr, enable_quanted_input=not args.disable_quanted_input,
-                            nsamples=args.nsamples, seed=args.seed, gradient_accumulate_steps=args.gradient_accumulate_steps,
-                            scale_dtype=args.scale_dtype, enable_minmax_tuning=not args.disable_minmax_tuning, act_bits=args.act_bits,
-                            quant_block_list=quant_block_list, export_format=args.export_format)
-        
-        all_block_list = get_multimodal_block_names(model, quant_vision=True)
-        all_block_set = set(tuple(block) for block in all_block_list)
-        quant_block_set = set(tuple(block) for block in quant_block_list)
-        set_to_full_prec = list(all_block_set - quant_block_set)
-        set_to_full_prec = get_layer_names_in_block(model, quant_block_list=set_to_full_prec)
-        for name in set_to_full_prec:
-            quant_config.set_local(name, AutoRoundConfig(dtype="fp32"))
-            
-        # skip special layers
-        quant_config.set_local("model.mm_projector*", AutoRoundConfig(dtype="fp32"))
-            
-        for n, m in model.named_modules():
-            if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
-                if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                    quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
-                    print(
-                        f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
-        
-        lm_head_layer_name = "lm_head"
-        if args.quant_lm_head:
-            from transformers import AutoConfig
-            config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-            if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"):
-                tied_keys = model._tied_weights_keys
-                for item in tied_keys:
-                    if lm_head_layer_name in item:  ##TODO extend to encoder-decoder layer, seq classification model
-                        args.quant_lm_head = False
-                        print(
-                            f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
-                            f"supported currently")
-                        break
-                    
-        if not args.quant_lm_head:
-                quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
-                transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
-                if transformers_version[0] == 4 and transformers_version[1] < 38:
-                    error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
-                    raise EnvironmentError(error_message)
-        
-        run_args = (dataloader, seqlen, args.nsamples)
-        user_model = prepare(model=model, quant_config=quant_config)
-        run_fn_for_vlm_autoround(user_model, *run_args)
-        user_model = convert(user_model)
-
-        from neural_compressor.torch.utils import LoadFormat
-        save_tower(user_model, args.output_dir, quant_vision=args.quant_vision)
-        user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE)
-        if tokenizer is not None:
-            tokenizer.save_pretrained(args.output_dir)
-
-    if args.accuracy:
-        device_str = detect_device(args.device)
-        torch_device = torch.device(device_str)
-        model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code)
-        tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=False)
-        vision_tower = model.get_vision_tower()
-        if not vision_tower.is_loaded:
-            vision_tower.load_model() # replace vision_tower
-            vision_tower.to(device=model.device, dtype=model.dtype)
-        image_processor = vision_tower.image_processor
-        model = model.to(torch_device)
-        model_path = args.model_name
-        model_name = get_model_name_from_path(model_path)
-        from mm_evaluation import TextVQAEvaluator
-        evaluator = TextVQAEvaluator(
-            model,
-            tokenizer,
-            image_processor,
-            args.eval_image_folder,
-            args.eval_question_file,
-            args.eval_annotation_file,
-            model_name = model_name
-        )
-        evaluator.run_evaluate(result_file = args.eval_result_file)
-        evaluator.calculate_accuracy(result_file = args.eval_result_file)
-
-
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/__init__.py
deleted file mode 100644
index 42c010e5e21..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .textvqa import TextVQAEvaluator
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/textvqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/textvqa.py
deleted file mode 100644
index 2dd384603da..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/textvqa.py
+++ /dev/null
@@ -1,201 +0,0 @@
-import sys
-import os
-import math
-from tqdm import tqdm
-import shortuuid
-import json
-import re
-
-from PIL import Image
-
-import torch
-from torch.utils.data import Dataset, DataLoader
-from llava.utils import disable_torch_init
-from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator
-from llava.mm_utils import tokenizer_image_token, process_images
-from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
-from llava.conversation import conv_templates, SeparatorStyle
-
-def split_list(lst, n):
-    """Split a list into n (roughly) equal-sized chunks"""
-    chunk_size = math.ceil(len(lst) / n)  # integer division
-    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
-
-def get_chunk(lst, n, k):
-    chunks = split_list(lst, n)
-    return chunks[k]
-
-def collate_fn(batch):
-    input_ids, image_tensors, image_sizes = zip(*batch)
-    input_ids = torch.stack(input_ids, dim=0)
-    image_tensors = torch.stack(image_tensors, dim=0)
-    return input_ids, image_tensors, image_sizes
-
-class CustomDatasetTextVQA(Dataset):
-    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, conv_mode):
-        self.questions = questions
-        self.image_folder = image_folder
-        self.tokenizer = tokenizer
-        self.image_processor = image_processor
-        self.model_config = model_config
-        self.conv_mode = conv_mode
-
-    def __getitem__(self, index):
-        # import pdb;pdb.set_trace()
-        line = self.questions[index]
-        image_file = line["image"]
-        qs = line["text"]
-        if self.model_config.mm_use_im_start_end:
-            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
-        else:
-            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
-
-        conv = conv_templates[self.conv_mode].copy()
-        conv.append_message(conv.roles[0], qs)
-        conv.append_message(conv.roles[1], None)
-        prompt = conv.get_prompt()
-
-        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
-        image_tensor = process_images([image], self.image_processor, self.model_config)[0]
-
-        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
-
-        return input_ids, image_tensor, image.size
-
-    def __len__(self):
-        return len(self.questions)
-
-class TextVQAEvaluator(object):
-    def __init__(
-            self, 
-            model, 
-            tokenizer, 
-            image_processor, 
-            image_folder,
-            question_file,
-            annotation_file, 
-            **kwargs
-        ):
-        super(TextVQAEvaluator, self).__init__()
-        self.model = model
-        self.tokenizer = tokenizer
-        self.image_processor = image_processor
-        self.image_folder = image_folder
-        self.question_file = question_file
-        self.annotation_file = annotation_file
-        # follow parameters can be set as default value.
-        self.model_name = kwargs.get("model_name", "llava")
-        self.conv_mode = kwargs.get("conv_mode", "vicuna_v1")
-        self.num_chunks = kwargs.get("num_chunks", 1)
-        self.chunk_idx = kwargs.get("chunk_idx", 0)
-        self.temperature = kwargs.get("temperature", 0)
-        self.top_p = kwargs.get("top_p", None)
-        self.num_beams = kwargs.get("num_beams", 1)
-        self.max_new_tokens = kwargs.get("max_new_tokens", 128)
-
-        if 'plain' in self.model_name and 'finetune' not in self.model_name.lower() and 'mmtag' not in self.conv_mode:
-            self.conv_mode = self.conv_mode + '_mmtag'
-            print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {self.conv_mode}.')
-
-    def create_dataloader(self):
-        questions = [json.loads(q) for q in open(os.path.expanduser(self.question_file), "r")]
-        questions = get_chunk(questions, self.num_chunks, self.chunk_idx)
-        dataset = CustomDatasetTextVQA(questions, self.image_folder, self.tokenizer, self.image_processor, self.model.config, self.conv_mode)
-        data_loader = DataLoader(dataset, batch_size=1, num_workers=4, shuffle=False, collate_fn=collate_fn)
-        return data_loader, questions
-
-    def run_evaluate(self, result_file = None):
-        disable_torch_init()
-        dataloader, questions = self.create_dataloader()
-        result_file = os.path.expanduser(result_file)
-        os.makedirs(os.path.dirname(result_file), exist_ok=True)
-        res_file = open(result_file, "w")
-        for (input_ids, image_tensor, image_sizes), line in tqdm(zip(dataloader, questions), total=len(questions)):
-            idx = line["question_id"]
-            cur_prompt = line["text"]
-
-            input_ids = input_ids.to(device='cuda', non_blocking=True)
-
-            with torch.inference_mode():
-                output_ids = self.model.generate(
-                    input_ids,
-                    images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
-                    image_sizes=image_sizes,
-                    do_sample=True if self.temperature > 0 else False,
-                    temperature=self.temperature,
-                    top_p=self.top_p,
-                    num_beams=self.num_beams,
-                    max_new_tokens=self.max_new_tokens,
-                    use_cache=True)
-
-            outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
-
-            ans_id = shortuuid.uuid()
-            res_file.write(json.dumps({"question_id": idx,
-                                    "prompt": cur_prompt,
-                                    "text": outputs,
-                                    "answer_id": ans_id,
-                                    "model_id": self.model_name,
-                                    "metadata": {}}) + "\n")
-        res_file.close()
-
-    def prompt_processor(self, prompt):
-        if prompt.startswith('OCR tokens: '):
-            pattern = r"Question: (.*?) Short answer:"
-            match = re.search(pattern, prompt, re.DOTALL)
-            question = match.group(1)
-        elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
-            if prompt.startswith('Reference OCR token:'):
-                question = prompt.split('\n')[1]
-            else:
-                question = prompt.split('\n')[0]
-        elif len(prompt.split('\n')) == 2:
-            question = prompt.split('\n')[0]
-        else:
-            assert False
-
-        return question.lower()
-    
-    def calculate_accuracy(self, result_file = None):
-        experiment_name = os.path.splitext(os.path.basename(result_file))[0]
-        print(experiment_name)
-        annotations = json.load(open(self.annotation_file))['data']
-        annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
-        results = [json.loads(line) for line in open(result_file)]
-
-        pred_list = []
-        for result in results:
-            annotation = annotations[(result['question_id'], self.prompt_processor(result['prompt']))]
-            pred_list.append({
-                "pred_answer": result['text'],
-                "gt_answers": annotation['answers'],
-            })
-
-        evaluator = TextVQAAccuracyEvaluator()
-        print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
-
-
-
-# results
-
-
-
-# def eval_single(annotation_file, result_file):
-#     experiment_name = os.path.splitext(os.path.basename(result_file))[0]
-#     print(experiment_name)
-#     annotations = json.load(open(annotation_file))['data']
-#     annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
-#     results = [json.loads(line) for line in open(result_file)]
-
-#     pred_list = []
-#     for result in results:
-#         annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
-#         pred_list.append({
-#             "pred_answer": result['text'],
-#             "gt_answers": annotation['answers'],
-#         })
-
-#     evaluator = TextVQAAccuracyEvaluator()
-#     print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/requirements.txt
deleted file mode 100644
index 0a3d5a0a420..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/requirements.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-transformers
-torch
-tiktoken
-transformers_stream_generator
-peft
-sentencepiece
-einops
-accelerate
-datasets
-protobuf
-auto-gptq
-openpyxl
-wandb
-py-cpuinfo
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
deleted file mode 100644
index 991ee772610..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-set -x
-
-function main {
-
-  init_params "$@"
-  run_tuning
-
-}
-
-# init params
-function init_params {
-  for var in "$@"
-  do
-    case $var in
-      --model_name=*)
-          model_name=$(echo $var |cut -f2 -d=)
-      ;;
-      --image_folder=*)
-          image_folder=$(echo $var |cut -f2 -d=)
-      ;;
-      --question_file=*)
-          question_file=$(echo $var |cut -f2 -d=)
-      ;;
-      --output_dir=*)
-          output_dir=$(echo $var |cut -f2 -d=)
-      ;;
-      *)
-          echo "Error: No such parameter: ${var}"
-          exit 1
-      ;;
-    esac
-  done
-
-}
-
-# run_tuning
-function run_tuning {
-    python main.py \
-            --model_name ${model_name} \
-            --bits 4 \
-            --group_size 128 \
-            --iters 200 \
-            --seqlen 512 \
-            --quantize \
-            --image_folder ${image_folder} \
-            --question_file ${question_file} \
-            --output_dir ${output_dir}
-}
-
-
-main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
deleted file mode 100644
index c6c978e465d..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-set -x
-
-function main {
-
-  init_params "$@"
-  run_evaluation
-
-}
-
-# init params
-function init_params {
-  for var in "$@"
-  do
-    case $var in
-      --model_name=*)
-          model_name=$(echo $var |cut -f2 -d=)
-      ;;
-      --eval_question_file=*)
-          eval_question_file=$(echo $var |cut -f2 -d=)
-      ;;
-      --eval_image_folder=*)
-          eval_image_folder=$(echo $var |cut -f2 -d=)
-      ;;
-      --eval_annotation_file=*)
-          eval_annotation_file=$(echo $var |cut -f2 -d=)
-      ;; 
-      *)
-          echo "Error: No such parameter: ${var}"
-          exit 1
-      ;;
-    esac
-  done
-
-}
-
-# run_evaluation
-function run_evaluation {
-    python main.py \
-            --accuracy \
-            --model_name ${model_name} \
-            --eval_question_file ${eval_question_file} \
-            --eval_image_folder ${eval_image_folder} \
-            --eval_annotation_file ${eval_annotation_file}
-}
-
-main "$@"
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
deleted file mode 100644
index 930ec7963d7..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
+++ /dev/null
@@ -1,111 +0,0 @@
-Step-by-Step
-============
-This document describes the step-by-step instructions to run [VLM quantization for Phi3-Vision](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) using AutoRound Quantization.
-
-# Run Quantization on Phi-3-vision Models
-
-In this example, we introduce an straight-forward way to execute quantization on some popular multimodal models such as Phi-3-vision. 
-
-## Download the calibration data
-
-Our calibration process resembles the official visual instruction tuning process.
-
-Please download the annotation of the final mixture our instruction tuning data [llava_v1_5_mix665k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json), and download the images from constituting datasets:
-
-COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip the image folder to any directory you desire.
-
-
-## 2. Run Examples
-PyTorch 1.8 or higher version is needed
-
-Enter into the examples folder and install lm-eval to run the evaluation
-```bash
-pip install -r requirements.txt
-```
-
-- **Default Settings:**
-```bash
-CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name microsoft/Phi-3-vision-128k-instruct  --bits 4 --group_size 128  --quantize
-```
-
-
-## 3. Run Inference
-
-```python
-from PIL import Image
-import requests
-import io
-from transformers import AutoModelForCausalLM
-from transformers import AutoProcessor
-from neural_compressor.torch.quantization import load
-quantized_model_path = "./tmp_autoround"
-model = load(quantized_model_path, format='huggingface', \
-             trust_remote_code=True, device_map="auto", torch_dtype="auto", _attn_implementation='eager') # use _attn_implementation='flash_attention_2' to enable flash attention
-
-processor = AutoProcessor.from_pretrained("microsoft/Phi-3-vision-128k-instruct", trust_remote_code=True)
-
-messages = [ \
-    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"}, \
-    {"role": "assistant", "content": "The chart displays the percentage of respondents who agree with various statements about their preparedness for meetings. It shows five categories: 'Having clear and pre-defined goals for meetings', 'Knowing where to find the information I need for a meeting', 'Understanding my exact role and responsibilities when I'm invited', 'Having tools to manage admin tasks like note-taking or summarization', and 'Having more focus time to sufficiently prepare for meetings'. Each category has an associated bar indicating the level of agreement, measured on a scale from 0% to 100%."}, \
-    {"role": "user", "content": "Provide insightful questions to spark discussion."}]
-
-url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png" 
-# image = Image.open(requests.get(url, stream=True).raw)
-image = Image.open(io.BytesIO(requests.get(url, stream=True).content))
-
-prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0")
-
-generation_args = {
-    "max_new_tokens": 50,
-    "temperature": 0.0,
-    "do_sample": False,
-}
-
-generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args) 
-
-# remove input tokens 
-generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 
-
-print(response)
-# 1. How does the level of agreement on each statement reflect the overall preparedness of respondents for meetings?
-# 2. What are the most and least agreed-upon statements, and why might that be the case?
-# 3.
-```
-<!-- 
-
-## 4. Results
-Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and lm_eval dataset for evaluation. please follow the [recipe](./run_autoround.sh) and [evaluate script](./run_eval.sh). The results for Phi-3-vision-128k-instruct are as follows:
-| Metric         | bf16   | INT4   |
-|----------------|--------|--------|
-| avg            | 0.6014 | 0.5940 |
-| mmlu           | 0.6369 | 0.6310 |
-| lambada_openai | 0.6487 | 0.6406 |
-| hellaswag      | 0.5585 | 0.5483 |
-| winogrande     | 0.7395 | 0.7451 |
-| piqa           | 0.7954 | 0.7889 |
-| truthfulqa_mc1 | 0.3084 | 0.2987 |
-| openbookqa     | 0.3580 | 0.3600 |
-| boolq          | 0.8532 | 0.8557 |
-| arc_easy       | 0.8371 | 0.8346 |
-| arc_challenge  | 0.5572 | 0.5469 |
-| cmmlu          | 0.4074 | 0.3950 |
-| ceval          | 0.4027 | 0.4012 |
-| gsm8k          | 0.7157 | 0.6755 | -->
-
-<!-- ## 4. Known Issues
-* The Flashattention2 component that Phi3-Vision relies on is **not supported on cpu devices**. -->
-
-## Reference
-If you find SignRound useful for your research, please cite our paper:
-```bash
-@article{cheng2023optimize,
-  title={Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs},
-  author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao},
-  journal={arXiv preprint arXiv:2309.05516},
-  year={2023}
-}
-```
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/evaluation.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/evaluation.py
deleted file mode 100644
index 3d542aa594e..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/evaluation.py
+++ /dev/null
@@ -1,280 +0,0 @@
-import itertools
-import logging
-import random
-import time
-from collections import defaultdict
-from typing import TYPE_CHECKING, List, Optional, Union
-
-from packaging.version import Version
-import pkg_resources
-LM_EVAL_VERSION = Version(pkg_resources.get_distribution('lm_eval').version)
-
-import numpy as np
-import torch
-
-import lm_eval.api.metrics
-import lm_eval.api.registry
-import lm_eval.models
-from lm_eval.evaluator import evaluate
-from lm_eval.caching.cache import delete_cache
-from lm_eval.evaluator_utils import run_task_tests
-if LM_EVAL_VERSION == Version('0.4.2'):
-    from lm_eval.logging_utils import add_env_info, get_git_commit_hash
-else:
-    from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
-from lm_eval.tasks import TaskManager, get_task_dict
-from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
-
-if TYPE_CHECKING:
-    from lm_eval.api.model import LM
-    from lm_eval.tasks import Task
-
-
-@positional_deprecated
-def simple_evaluate(
-        model,
-        model_args: Optional[Union[str, dict]] = None,
-        tasks: Optional[List[Union[str, dict, object]]] = None,
-        num_fewshot: Optional[int] = None,
-        batch_size: Optional[int] = None,
-        max_batch_size: Optional[int] = None,
-        device: Optional[str] = None,
-        use_cache: Optional[str] = None,
-        cache_requests: bool = False,
-        rewrite_requests_cache: bool = False,
-        delete_requests_cache: bool = False,
-        limit: Optional[Union[int, float]] = None,
-        bootstrap_iters: int = 100000,
-        check_integrity: bool = False,
-        write_out: bool = False,
-        log_samples: bool = True,
-        gen_kwargs: Optional[str] = None,
-        task_manager: Optional[TaskManager] = None,
-        verbosity: str = "INFO",
-        predict_only: bool = False,
-        random_seed: int = 0,
-        numpy_random_seed: int = 1234,
-        torch_random_seed: int = 1234,
-        user_model = None, ##user model does not support tensor parallelism
-):
-    """Instantiate and evaluate a model on a list of tasks.
-
-    :param model: Union[str, LM]
-        Name of model or LM object, see lm_eval.models.get_model
-    :param model_args: Optional[str, dict]
-        String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object.
-        Ignored if `model` argument is a LM object.
-    :param tasks: list[Union[str, dict, Task]]
-        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME 
-        if defined and type(task).__name__ otherwise.
-    :param num_fewshot: int
-        Number of examples in few-shot context
-    :param batch_size: int or str, optional
-        Batch size for model
-    :param max_batch_size: int, optional
-        Maximal batch size to try with automatic batch size detection
-    :param device: str, optional
-        PyTorch device (e.g. "cpu" or "cuda:0") for running models
-    :param use_cache: str, optional
-        A path to a sqlite db file for caching model responses. `None` if not caching.
-    :param cache_requests: bool, optional
-        Speed up evaluation by caching the building of dataset requests. `None` if not caching.
-    :param rewrite_requests_cache: bool, optional
-        Rewrites all of the request cache if set to `True`. `None` if not desired.
-    :param delete_requests_cache: bool, optional
-        Deletes all of the request cache if set to `True`. `None` if not desired.
-    :param limit: int or float, optional
-        Limit the number of examples per task (only use this for testing), If <1, 
-        limit is a percentage of the total number of examples.
-    :param bootstrap_iters:
-        Number of iterations for bootstrap statistics
-    :param check_integrity: bool
-        Whether to run the relevant part of the test suite for the tasks
-    :param write_out: bool
-        If True, write out an example document and model input for checking task integrity
-    :param log_samples: bool
-        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
-    :param gen_kwargs: str
-        String arguments for model generation
-        Ignored for all tasks with loglikelihood output_type
-    :param predict_only: bool
-        If true only model outputs will be generated and returned. Metrics will not be evaluated
-    :param random_seed: int
-        Random seed for python's random module. If set to None, the seed will not be set.
-    :param numpy_random_seed: int
-        Random seed for numpy. If set to None, the seed will not be set.
-    :param torch_random_seed: int
-        Random seed for torch. If set to None, the seed will not be set.
-
-    :return
-        Dictionary of results
-    """
-    from auto_round.auto_quantizer import AutoHfQuantizer
-    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
-    start_date = time.time()
-
-    if delete_requests_cache:
-        eval_logger.info("Deleting requests cache...")
-        delete_cache()
-
-    seed_message = []
-    if random_seed is not None:
-        # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
-        seed_message.append(f"Setting random seed to {random_seed}")
-        random.seed(random_seed)
-
-    if numpy_random_seed is not None:
-        seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
-        np.random.seed(numpy_random_seed)
-
-    if torch_random_seed is not None:
-        seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
-        torch.manual_seed(torch_random_seed)
-
-    if seed_message:
-        eval_logger.info(" | ".join(seed_message))
-
-    if tasks is None:
-        tasks = []
-    if len(tasks) == 0:
-        raise ValueError(
-            "No tasks specified, or no tasks found. Please verify the task names."
-        )
-
-    if gen_kwargs is not None:
-        gen_kwargs = simple_parse_args_string(gen_kwargs)
-        eval_logger.warning(
-            "generation_kwargs specified through cli, these settings will update set parameters in yaml tasks. "
-            "Ensure 'do_sample=True' for non-greedy decoding!"
-        )
-        if gen_kwargs == "":
-            gen_kwargs = None
-
-    if isinstance(model, str):
-        if model_args is None:
-            model_args = ""
-
-        if isinstance(model_args, dict):
-            lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
-                model_args,
-                {
-                    "batch_size": batch_size,
-                    "max_batch_size": max_batch_size,
-                    "device": device,
-                },
-            )
-
-        else:
-            lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
-                model_args,
-                {
-                    "batch_size": batch_size,
-                    "max_batch_size": max_batch_size,
-                    "device": device,
-                },
-            )
-    else:
-        if not isinstance(model, lm_eval.api.model.LM):
-            raise TypeError
-        lm = model
-
-    if use_cache is not None:
-        eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
-        lm = lm_eval.api.model.CachingLM(
-            lm,
-            use_cache
-            # each rank receives a different cache db.
-            # necessary to avoid multiple writes to cache at once
-            + "_rank"
-            + str(lm.rank)
-            + ".db",
-        )
-    if user_model is not None:
-        lm._model = user_model
-
-    if task_manager is None:
-        task_manager = TaskManager(verbosity)
-
-    task_dict = get_task_dict(tasks, task_manager)
-    for task_name in task_dict.keys():
-        task_obj = task_dict[task_name]
-        if isinstance(task_obj, tuple):
-            _, task_obj = task_obj
-            if task_obj is None:
-                continue
-
-        if task_obj.get_config("output_type") == "generate_until":
-            if gen_kwargs is not None:
-                task_obj.set_config(
-                    key="generation_kwargs", value=gen_kwargs, update=True
-                )
-
-        if predict_only:
-            log_samples = True
-            eval_logger.info(
-                f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
-            )
-            # we have to change the class properties post-hoc. This is pretty hacky.
-            task_obj.override_metric(metric_name="bypass")
-
-        # override tasks' fewshot values to the provided num_fewshot arg value
-        # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
-        if num_fewshot is not None:
-            if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
-                eval_logger.info(
-                    f"num_fewshot has been set to 0 for {task_name} in its config."
-                    " Manual configuration will be ignored."
-                )
-            else:
-                eval_logger.warning(
-                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
-                )
-                task_obj.set_config(key="num_fewshot", value=num_fewshot)
-        else:
-            # if num_fewshot not provided, and the task does not define a default one, default to 0
-            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
-                task_obj.set_config(key="num_fewshot", value=0)
-
-    if check_integrity:
-        run_task_tests(task_list=tasks)
-
-    results = evaluate(
-        lm=lm,
-        task_dict=task_dict,
-        limit=limit,
-        cache_requests=cache_requests,
-        rewrite_requests_cache=rewrite_requests_cache,
-        bootstrap_iters=bootstrap_iters,
-        write_out=write_out,
-        log_samples=log_samples,
-        verbosity=verbosity,
-    )
-
-    if lm.rank == 0:
-        if isinstance(model, str):
-            model_name = model
-        elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
-            model_name = model.config._name_or_path
-        else:
-            model_name = type(model).__name__
-
-        # add info about the model and few shot config
-        results["config"] = {
-            "model": model_name,
-            "model_args": model_args,
-            "batch_size": batch_size,
-            "batch_sizes": (
-                list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
-            ),
-            "device": device,
-            "use_cache": use_cache,
-            "limit": limit,
-            "bootstrap_iters": bootstrap_iters,
-            "gen_kwargs": gen_kwargs,
-        }
-        results["git_hash"] = get_git_commit_hash()
-        results["date"] = start_date
-        add_env_info(results)  # additional environment info to results
-        return results
-    else:
-        return None
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
deleted file mode 100644
index b2e2c945ed2..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
+++ /dev/null
@@ -1,430 +0,0 @@
-import argparse
-# import sys
-# sys.path.insert(0, '../../..')
-parser = argparse.ArgumentParser()
-import torch
-import os
-# os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-# torch.use_deterministic_algorithms(True, warn_only=True)
-import copy
-from PIL import Image
-import json
-from torch.utils.data import Dataset, DataLoader
-import torch
-from typing import Dict, Optional, List, Union, Sequence
-import transformers
-from model.processing_phi3_v import Phi3VProcessor
-from dataclasses import dataclass, field
-from transformers import AutoModelForCausalLM, AutoConfig
-from transformers.trainer_pt_utils import LabelSmoother
-
-IGNORE_TOKEN_ID = LabelSmoother.ignore_index
-import subprocess
-LLaVA_IMAGE_TOKEN = "<image>"
-DEFAULT_IMAGE_TOKEN = "<|image_1|>"
-IMAGE_TOKEN_INDEX = -200
-IGNORE_INDEX = -100
-from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                    get_layer_names_in_block,
-                                                    detect_device,
-                                                    run_fn_for_vlm_autoround
-                                                    )
-from neural_compressor.torch.quantization import (AutoRoundConfig,
-                                                    prepare,
-                                                    convert,
-                                                    load)
-@dataclass
-class DataArguments:
-    data_path: str = field(
-        default=None, metadata={"help": "Path to the training data."}
-    )
-    lazy_preprocess: bool = True
-    is_multimodal: bool = True
-    image_folder: Optional[str] = field(default=None)
-    max_seq_length: int = field(
-        default=2048, metadata={"help": "Maximum sequence length."}
-    )
-    
-    
-def llava_to_openai(data):
-    role_mapping = {"human": "user", "gpt": "assistant"}
-
-    transformed_data = []
-    for entry in data:
-        transformed_entry = {
-            "role": role_mapping.get(entry["from"], entry["from"]),
-            "content": entry["value"].replace(LLaVA_IMAGE_TOKEN, DEFAULT_IMAGE_TOKEN),
-        }
-        transformed_data.append(transformed_entry)
-
-    return transformed_data
-
-
-class LazySupervisedDataset(Dataset):
-    """Dataset for supervised fine-tuning."""
-
-    def __init__(
-        self,
-        data_path: Union[int, str],
-        processor: transformers.PreTrainedTokenizer,
-        data_args: DataArguments,
-        padding=True,
-    ):
-        super(LazySupervisedDataset, self).__init__()
-        if isinstance(data_path, str):
-            list_data_dict = json.load(open(data_path, "r"))
-        else:
-            list_data_dict = data_path
-
-        print("Formatting inputs...Skip in lazy mode")
-        self.processor = processor
-        self.list_data_dict = list_data_dict
-        self.data_args = data_args
-        self.padding = padding
-        self.max_seq_length = data_args.max_seq_length
-
-    def __len__(self):
-        return len(self.list_data_dict)
-
-    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
-        sources = self.list_data_dict[i]
-        if isinstance(i, int):
-            sources = [sources]
-        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
-        processor = self.processor
-        if "image" in sources[0]:
-            image_file = os.path.basename(self.list_data_dict[i]["image"])
-            image_folder = self.data_args.image_folder
-            image_file = os.path.join(image_folder, image_file)
-            image = [Image.open(image_file).convert("RGB")]
-        else:
-            image = None
-        sources = copy.deepcopy([e["conversations"] for e in sources])
-        for i in range(len(sources)):
-            sources[i] = llava_to_openai(sources[i])
-
-        prompt = processor.tokenizer.apply_chat_template(
-            sources[0], tokenize=False, add_generation_prompt=True
-        )
-        data_dict = processor(prompt, image, return_tensors="pt")
-
-        if self.padding:
-            training_length = self.max_seq_length
-            if 'pixel_values' not in data_dict:
-                data_dict['pixel_values'] = torch.zeros([1, 17, 3, 336, 336], dtype=torch.bfloat16)
-                data_dict['image_sizes'] = torch.zeros([1, 2], dtype=torch.int64)
-            data_dict = dict(
-                input_ids=data_dict["input_ids"][0],
-                attention_mask=data_dict["attention_mask"][0],
-                pixel_values=data_dict["pixel_values"][0],
-                image_sizes=data_dict["image_sizes"][0],
-                labels=data_dict["labels"][0],
-            )
-        return data_dict
-
-
-@dataclass
-class DataCollatorForSupervisedDataset(object):
-    """Collate examples for supervised fine-tuning."""
-
-    tokenizer: transformers.PreTrainedTokenizer
-
-    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-        input_ids, labels = tuple([instance[key] for instance in instances]
-                                  for key in ("input_ids", "labels"))
-        input_ids = torch.nn.utils.rnn.pad_sequence(
-            input_ids,
-            batch_first=True,
-            padding_value=self.tokenizer.pad_token_id)
-        labels = torch.nn.utils.rnn.pad_sequence(
-            labels,
-            batch_first=True,
-            padding_value=IGNORE_INDEX)
-        input_ids = input_ids[:, :self.tokenizer.model_max_length]
-        labels = labels[:, :self.tokenizer.model_max_length]
-
-        pixel_values = [instance["pixel_values"] for instance in instances]
-        pixel_values = torch.stack(pixel_values, dim=0)
-        image_sizes = [instance["image_sizes"] for instance in instances]
-        image_sizes = torch.stack(image_sizes, dim=0)
-
-        batch = dict(
-            input_ids=input_ids,
-            labels=labels,
-            pixel_values=pixel_values,
-            image_sizes=image_sizes,
-            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
-        )
-        return batch
-
-
-def create_data_loader(dataset, batch_size=1, data_collator=None):
-    assert batch_size == 1, "batch_size must be 1"
-    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)
-    return data_loader
-
-
-if __name__ == '__main__':
-
-    parser.add_argument(
-        "--model_name", default="microsoft/Phi-3-vision-128k-instruct")
-    
-    parser.add_argument("--quantize", action="store_true")
-    
-    parser.add_argument("--accuracy", action="store_true")
-
-    parser.add_argument("--bits", default=4, type=int,
-                        help="number of  bits")
-
-    parser.add_argument("--group_size", default=128, type=int,
-                        help="group size")
-
-    parser.add_argument("--train_bs", default=1, type=int,
-                        help="train batch size")
-
-    parser.add_argument("--eval_bs", default=4, type=int,
-                        help="eval batch size")
-
-    parser.add_argument("--device", default="auto", type=str,
-                        help="The device to be used for tuning. The default is set to auto/None,"
-                             "allowing for automatic detection. Currently, device settings support CPU, GPU, and HPU.")
-
-    parser.add_argument("--sym", action='store_true',
-                        help=" sym quantization")
-
-    parser.add_argument("--iters", default=200, type=int,
-                        help=" iters")
-
-    parser.add_argument("--lr", default=None, type=float,
-                        help="learning rate, if None, it will be set to 1.0/iters automatically")
-
-    parser.add_argument("--minmax_lr", default=None, type=float,
-                        help="minmax learning rate, if None,it will beset to be the same with lr")
-
-    parser.add_argument("--seed", default=42, type=int,
-                        help="seed")
-
-    parser.add_argument("--eval_fp16_baseline", action='store_true',
-                        help="whether to eval FP16 baseline")
-
-    parser.add_argument("--adam", action='store_true',
-                        help="adam")
-
-    parser.add_argument("--seqlen", default=2048, type=int,
-                        help="sequence length")
-
-    parser.add_argument("--gradient_accumulate_steps", default=1, type=int, help="gradient accumulate steps")
-
-    parser.add_argument("--nblocks", default=1, type=int, help="num of blocks to tune together")
-
-    parser.add_argument("--nsamples", default=512, type=int,
-                        help="number of samples")
-
-    parser.add_argument("--low_gpu_mem_usage", action='store_true',
-                        help="low_gpu_mem_usage is deprecated")
-
-    parser.add_argument("--export_format", default='auto_round:gptq', type=str,
-                        help="targeted inference acceleration platform,The options are 'fake', 'cpu', 'gpu', 'xpu' and 'auto_round'."
-                             "default to 'fake', indicating that it only performs fake quantization and won't be exported to any device.")
-
-    parser.add_argument("--scale_dtype", default='fp16',
-                        help="which scale data type to use for quantization, 'fp16', 'fp32' or 'bf16'.")
-
-    parser.add_argument("--output_dir", default="./tmp_autoround", type=str,
-                        help="Where to store the final model.")
-
-    parser.add_argument("--disable_eval", action='store_true',
-                        help="Whether to do lmeval evaluation.")
-
-    parser.add_argument("--disable_amp", action='store_true',
-                        help="disable amp")
-
-    parser.add_argument("--disable_minmax_tuning", action='store_true',
-                        help="whether disable  enable weight minmax tuning")
-
-    parser.add_argument("--disable_trust_remote_code", action='store_true',
-                        help="Whether to disable trust_remote_code")
-
-    parser.add_argument("--disable_quanted_input", action='store_true',
-                        help="whether to disuse the output of quantized block to tune the next block")
-
-    parser.add_argument("--quant_lm_head", action='store_true',
-                        help="quant_lm_head")
-
-    parser.add_argument("--model_dtype", default=None, type=str,
-                        help="force to convert the dtype, some backends supports fp16 dtype better")
-    
-    parser.add_argument("--model_max_length", default=2048, type=int,
-                        help="")
-    
-    parser.add_argument("--act_bits", default=32, type=int,
-                    help="activation bits")
-    
-    parser.add_argument("--quant_vision", action='store_true',
-                        help="To determine whether the quantization should handle vision component.")
-    
-    parser.add_argument("--enable_safe_serialization", action='store_true',
-                        help="To determine whether the save_pretrained process should use safe_serialization.")
-    
-    # ========== Calibration Datasets ============= 
-    parser.add_argument("--image_folder", default="coco", type=str,
-                        help="The dataset for quantization training. It can be a custom one.")
-    
-    parser.add_argument("--question_file", default=None, type=str,
-                            help="The dataset for quantization training. It can be a custom one.")
-    
-    # ================= Evaluation Related =====================
-    parser.add_argument("--tasks", #wikitext
-                        default="lambada_openai,hellaswag,winogrande,piqa,mmlu,truthfulqa_mc1," \
-                                "truthfulqa_mc2,openbookqa,boolq,rte,arc_easy,arc_challenge",
-                        help="lm-eval tasks for lm_eval version 0.4")
-    
-    parser.add_argument("--eval-dataset", type=str, default="textvqa_val")
-
-    args = parser.parse_args()
-
-    if args.quantize:
-        tasks = args.tasks
-            
-        if args.act_bits <= 8:
-            print(
-                "Warning, activation quantization is an experiment feature")
-        
-        if args.act_bits <= 8 and args.export_format != "fake":
-            assert False, "only support fake mode for activation quantization currently"
-            
-        if "marlin" in args.export_format and args.sym == False:
-            assert False, "marlin backend only supports sym quantization, please set --sym"
-
-        model_name = args.model_name
-        if model_name[-1] == "/":
-            model_name = model_name[:-1]
-        print(model_name, flush=True)
-
-        device_str = detect_device(args.device)
-        torch_device = torch.device(device_str)
-        
-        torch.manual_seed(1234)
-        model_name = args.model_name
-        seqlen = args.seqlen
-        questions = json.load(open(args.question_file, "r"))
-        torch_dtype = "auto"
-        if "hpu" in device_str:
-            torch_dtype = torch.bfloat16
-        if args.model_dtype != None:
-            if args.model_dtype == "float16" or args.model_dtype == "fp16":
-                torch_dtype = torch.float16
-            if args.model_dtype == "bfloat16" or args.model_dtype == "bfp16":
-                torch_dtype = torch.bfloat16
-                
-        # config = AutoConfig.from_pretrained(
-        #     model_name,
-        #     trust_remote_code=not args.disable_trust_remote_code,
-        # )
-        # config.use_cache = False
-        
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch_dtype,
-            trust_remote_code=not args.disable_trust_remote_code,
-            _attn_implementation='eager' # _attn_implementation='flash_attention_2' to enable flash attention
-        )
-        seqlen = args.seqlen
-        processor = Phi3VProcessor.from_pretrained(model_name)
-        tokenizer = processor.tokenizer
-        data_args = DataArguments(
-            data_path=args.question_file,
-            is_multimodal=True,
-            image_folder=args.image_folder,
-            max_seq_length=seqlen,
-        )
-        dataset = LazySupervisedDataset(
-            data_path=args.question_file, processor=processor, data_args=data_args
-        )
-        data_collator = DataCollatorForSupervisedDataset(tokenizer=processor.tokenizer)
-        dataloader = create_data_loader(dataset, batch_size=args.train_bs, data_collator=data_collator)
-            
-        quant_block_list = get_multimodal_block_names(model, args.quant_vision)
-        
-        quant_config = AutoRoundConfig(bits=args.bits, use_sym=args.sym, batch_size=args.train_bs, group_size=args.group_size,
-                            seqlen=seqlen, nblocks=args.nblocks, iters=args.iters, lr=args.lr,
-                            minmax_lr=args.minmax_lr, enable_quanted_input=not args.disable_quanted_input,
-                            nsamples=args.nsamples, seed=args.seed, gradient_accumulate_steps=args.gradient_accumulate_steps,
-                            scale_dtype=args.scale_dtype, enable_minmax_tuning=not args.disable_minmax_tuning, act_bits=args.act_bits,
-                            quant_block_list=quant_block_list, export_format=args.export_format)
-        
-        all_block_list = get_multimodal_block_names(model, quant_vision=True)
-        all_block_set = set(tuple(block) for block in all_block_list)
-        quant_block_set = set(tuple(block) for block in quant_block_list)
-        set_to_full_prec = list(all_block_set - quant_block_set)
-        set_to_full_prec = get_layer_names_in_block(model, quant_block_list=set_to_full_prec)
-        for name in set_to_full_prec:
-            quant_config.set_local(name, AutoRoundConfig(dtype="fp32"))
-            
-        # skip special layers
-        quant_config.set_local("model.vision_embed_tokens.img_projection*", AutoRoundConfig(dtype="fp32"))
-            
-        for n, m in model.named_modules():
-            if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
-                if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                    quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
-                    print(
-                        f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
-        
-        lm_head_layer_name = "lm_head"
-        if args.quant_lm_head:
-            from transformers import AutoConfig
-            config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-            if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"):
-                tied_keys = model._tied_weights_keys
-                for item in tied_keys:
-                    if lm_head_layer_name in item:  ##TODO extend to encoder-decoder layer, seq classification model
-                        args.quant_lm_head = False
-                        print(
-                            f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
-                            f"supported currently")
-                        break
-                    
-        if not args.quant_lm_head:
-                quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
-                transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
-                if transformers_version[0] == 4 and transformers_version[1] < 38:
-                    error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
-                    raise EnvironmentError(error_message)
-        
-        run_args = (dataloader, seqlen, args.nsamples)
-        user_model = prepare(model=model, quant_config=quant_config)
-        run_fn_for_vlm_autoround(user_model, *run_args)
-        user_model = convert(user_model)
-
-        from neural_compressor.torch.utils import (LoadFormat,)
-        user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE, safe_serialization=False)
-        if tokenizer is not None:
-            tokenizer.save_pretrained(args.output_dir)
-        if processor is not None:
-            processor.save_pretrained(args.output_dir)
-
-    
-    # if args.accuracy:
-    #     from eval.evaluation import simple_evaluate
-    #     device_str = detect_device(args.device)
-    #     tasks = args.tasks
-    #     if isinstance(tasks, str):
-    #         tasks = tasks.split(',')
-    #     model_args = f"pretrained={args.model_name}"
-    #     model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}"
-    #     model_args += f",autogptq=True,gptq_use_triton=True"
-    #     user_model = load(args.model_name, format='huggingface', \
-    #                         trust_remote_code=not args.disable_trust_remote_code, _attn_implementation='eager')
-    #     if args.act_bits <= 8:
-    #         user_model = model.to(device_str)
-
-    #     res = simple_evaluate(model="hf", model_args=model_args,
-    #                         tasks=tasks,
-    #                         batch_size=args.eval_bs, user_model=user_model)
-    #     from lm_eval.utils import make_table
-    #     print(make_table(res))
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/__init__.py
deleted file mode 100644
index dbe21cebce8..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-if __name__ == "__main__":
-    import sys
-    sys.path.insert(0, './')
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/configuration_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/configuration_phi3_v.py
deleted file mode 100644
index 573bb8d39fb..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/configuration_phi3_v.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# coding=utf-8
-# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""" Phi-3-V model configuration"""
-
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "microsoft/Phi-3-vision-128k-instruct": "https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/resolve/main/config.json",
-}
-
-
-class Phi3VConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`Phi3VModel`]. It is used to instantiate a Phi-3
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the
-    [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct).
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 32064):
-            Vocabulary size of the Phi-3-V model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Phi3VModel`].
-        hidden_size (`int`, *optional*, defaults to 3072):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 8192):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-            `num_attention_heads`.
-        resid_pdrop (`float`, *optional*, defaults to 0.0):
-            Dropout probability for mlp outputs.
-        embd_pdrop (`int`, *optional*, defaults to 0.0):
-            The dropout ratio for the embeddings.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio after computing the attention scores.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 4096):
-            The maximum sequence length that this model might ever be used with.
-        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
-            The maximum sequence length that this model was trained with. This is used to determine the size of the
-            original RoPE embeddings when using long scaling.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
-            The epsilon value used for the RMSNorm.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether to tie weight embeddings
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        rope_scaling (`dict`, *optional*):
-            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
-            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
-            the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
-            divided by the number of attention heads divided by 2.
-        bos_token_id (`int`, *optional*, defaults to 1):
-            The id of the "beginning-of-sequence" token.
-        eos_token_id (`int`, *optional*, defaults to 32000):
-            The id of the "end-of-sequence" token.
-        pad_token_id (`int`, *optional*, defaults to 32000):
-            The id of the padding token.
-        sliding_window (`int`, *optional*):
-            Sliding window attention window size. If `None`, no sliding window is applied.
-        embd_layer (`str`, *optional*, defaults to `"default"`):
-            The embedding layer to use. Can be either `"default"` or `"image"`. "default" uses the standard embedding for text. 
-
-    Example:
-
-    ```python
-    >>> from transformers import Phi3VModel, Phi3VConfig
-
-    >>> # Initializing a Phi-3-V style configuration
-    >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-vision-128k-instruct")
-
-    >>> # Initializing a model from the configuration
-    >>> model = Phi3VModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "phi3_v"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=32064,
-        hidden_size=3072,
-        intermediate_size=8192,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=None,
-        resid_pdrop=0.0,
-        embd_pdrop=0.0,
-        attention_dropout=0.0,
-        hidden_act="silu",
-        max_position_embeddings=4096,
-        original_max_position_embeddings=4096,
-        initializer_range=0.02,
-        rms_norm_eps=1e-5,
-        use_cache=True,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        bos_token_id=1,
-        eos_token_id=32000,
-        pad_token_id=32000,
-        sliding_window=None,
-        embd_layer: str = "default",
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.resid_pdrop = resid_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.attention_dropout = attention_dropout
-        self.hidden_act = hidden_act
-        self.max_position_embeddings = max_position_embeddings
-        self.original_max_position_embeddings = original_max_position_embeddings
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self._rope_scaling_validation()
-        self.sliding_window = sliding_window
-        self.embd_layer = embd_layer
-
-
-        super().__init__(
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            pad_token_id=pad_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-    def _rope_scaling_validation(self):
-        """
-        Validate the `rope_scaling` configuration.
-        """
-        if self.rope_scaling is None:
-            return
-
-        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
-            raise ValueError(
-                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
-                f"got {self.rope_scaling}"
-            )
-        rope_scaling_type = self.rope_scaling.get("type", None)
-        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
-        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
-        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
-            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
-        if not (
-            isinstance(rope_scaling_short_factor, list)
-            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
-        ):
-            raise ValueError(
-                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
-            )
-        if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
-            raise ValueError(
-                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
-            )
-        if not (
-            isinstance(rope_scaling_long_factor, list)
-            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
-        ):
-            raise ValueError(
-                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
-            )
-        if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
-            raise ValueError(
-                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
-            )
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_embedding_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_embedding_phi3_v.py
deleted file mode 100644
index c2994c6ca05..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_embedding_phi3_v.py
+++ /dev/null
@@ -1,313 +0,0 @@
-# coding=utf-8
-# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import torch
-import torch.nn as nn
-from transformers import CLIPVisionModel, PretrainedConfig
-from transformers import CLIPVisionConfig 
-from transformers.utils import logging
-from datetime import datetime 
-
-logger = logging.get_logger(__name__)
-
-CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
-  attention_dropout=0.0,
-  dropout=0.0,
-  hidden_act="quick_gelu",
-  hidden_size=1024,
-  image_size=336,
-  initializer_factor=1.0,
-  initializer_range=0.02,
-  intermediate_size=4096,
-  layer_norm_eps=1e-05,
-  num_attention_heads=16,
-  num_channels=3,
-  num_hidden_layers=24,
-  patch_size=14,
-  projection_dim=768 
-)
-
-class Phi3ImageEmbedding(nn.Module):
-    """Phi3 Image embedding."""
-
-    def __init__(self, config: PretrainedConfig, wte=None, **kwargs) -> None:
-        super().__init__()
-
-        # n_embed or hidden_size
-        hidden_size = config.n_embd if hasattr(config, 'n_embd') else config.hidden_size
-        if hasattr(config, 'embd_pdrop') or hasattr(config, 'embed_pdrop'):
-            embd_drop = config.embd_pdrop if hasattr(config, 'embd_pdrop') else config.embed_pdrop
-            self.drop = nn.Dropout(embd_drop)
-        else:
-            self.drop = None
-
-        self.wte = wte
-
-        if isinstance(config.img_processor, dict) and config.img_processor.get('name', None) == 'clip_vision_model':
-            assert 'model_name' in config.img_processor, 'model_name must be provided for CLIPVisionModel'
-            assert 'image_dim_out' in config.img_processor, 'image_dim_out must be provided for CLIPVisionModel'
-            assert 'num_img_tokens' in config.img_processor, 'num_img_tokens must be provided for CLIPVisionModel'
-            assert config.img_processor['model_name'] == 'openai/clip-vit-large-patch14-336'
-            clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
-            self.img_processor = CLIPVisionModel(clip_config)
-            image_dim_out = config.img_processor['image_dim_out']
-            self.num_img_tokens = config.img_processor['num_img_tokens']
-        else:
-            raise NotImplementedError(f'img_processor = {config.img_processor}, not implemented')
-
-        self.image_dim_out = image_dim_out
-        self.img_sizes = None
-
-        # global_gn and sub_gn for hd transform, serves as line separator
-        self.use_hd_transform = kwargs.get('use_hd_transform', False)
-        self.with_learnable_separator = kwargs.get('with_learnable_separator', False)
-        self.hd_transform_order = kwargs.get('hd_transform_order', 'glb_sub')
-        # with_hd_transform and with_learnable_separator should have same value
-        assert self.use_hd_transform == self.with_learnable_separator, 'use_hd_transform and with_learnable_separator should have same value'
-        if self.with_learnable_separator:
-            assert self.use_hd_transform, 'learnable separator is only for hd transform'
-            # 1024 * 4, merge spatial to channel dimension
-            self.glb_GN = nn.Parameter(torch.zeros([1, 1, self.image_dim_out * 4]))
-            self.sub_GN = nn.Parameter(torch.zeros([1, 1, 1, self.image_dim_out * 4]))
-            logger.info(f'learnable separator enabled for hd transform, hd_transform_order = {self.hd_transform_order}')
-
-        projection_cls = kwargs.get('projection_cls', 'linear')
-        if projection_cls == 'linear':
-            self.img_projection = nn.Linear(image_dim_out, hidden_size)
-        elif projection_cls == 'mlp' and self.use_hd_transform:
-            dim_projection = hidden_size
-            depth = 2
-            layers = [nn.Linear(image_dim_out * 4, dim_projection)]
-            for _ in range(1, depth):
-                layers.extend([nn.GELU(),
-                                nn.Linear(dim_projection, dim_projection)])
-            self.img_projection = nn.Sequential(*layers)
-        elif projection_cls == 'mlp':
-            dim_projection = hidden_size
-            depth = 2
-            layers = [nn.Linear(image_dim_out, dim_projection)]
-            for _ in range(1, depth):
-                layers.extend([nn.GELU(),
-                                nn.Linear(dim_projection, dim_projection)])
-            self.img_projection = nn.Sequential(*layers)
-        else:
-            raise NotImplementedError(f'projection_cls = {projection_cls}, not implemented')
-
-        self.vocab_size = config.vocab_size
-        self.img_features = None
-
-        if isinstance(config.img_processor, dict):
-            self.layer_idx = config.img_processor.get('layer_idx', -2)
-            self.type_feature = config.img_processor.get('type_feature', 'patch')
-        else:
-            self.layer_idx = -2
-            self.type_feature = 'patch'
-
-
-    def set_img_features(self, img_features: torch.FloatTensor) -> None:
-        self.img_features = img_features
-
-    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
-        self.img_sizes = img_sizes
-
-    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
-        LAYER_IDX = self.layer_idx
-        TYPE_FEATURE = self.type_feature
-
-        img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)
-        img_feature = img_processor_output.hidden_states[LAYER_IDX]
-
-        if TYPE_FEATURE == "patch":
-            patch_feature = img_feature[:, 1:]
-            return patch_feature
-
-        if TYPE_FEATURE == "cls_patch":
-            return img_feature
-
-        raise NotImplementedError
-
-    def forward(self, input_ids: torch.LongTensor, pixel_values: torch.FloatTensor, image_sizes=None) -> torch.FloatTensor:
-
-        MAX_INPUT_ID = int(1e9)
-        img_embeds = pixel_values
-        img_sizes = image_sizes
-
-        if self.img_features is not None:
-            img_embeds = self.img_features.clone()
-            self.img_features = None
-
-        if self.img_sizes is not None:
-            img_sizes = self.img_sizes
-
-        input_shape = input_ids.size()
-        input_ids = input_ids.view(-1, input_shape[-1])
-
-        with torch.no_grad():
-            positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=False)
-        
-        select = False
-
-        if isinstance(self.img_projection, nn.Sequential):  
-            target_device = self.img_projection[0].bias.device  
-            target_dtype = self.img_projection[0].bias.dtype  
-        else:  # It's a single nn.Linear layer  
-            target_device = self.img_projection.bias.device  
-            target_dtype = self.img_projection.bias.dtype  
-
-        if len(positions.tolist()) > 0:
-            with torch.no_grad():
-                g_values = abs(input_ids[positions[:, 0], positions[:, 1]])
-
-            if self.use_hd_transform and img_sizes is not None and len(img_sizes):
-                hd_transform = True
-                assert img_embeds.ndim == 5, f'img_embeds size: {img_embeds.size()}, expect 5D tensor for hd transform'
-                # img_embeds: (num_images, max_num_crops, 3, H, W)
-                # img_sizes: (num_images, 2).view(1, -1)
-
-                start_time = datetime.now()
-                bs = img_embeds.shape[0]
-                # Nx(HW)xC
-                img_features = self.get_img_features(img_embeds.flatten(0, 1))
-                base_feat_height = base_feat_width = int(img_features.shape[1] ** 0.5)
-
-                assert base_feat_height == 24 and base_feat_width == 24, f'base_feat_height: {base_feat_height}, base_feat_width: {base_feat_width}, expect 24x24 features for hd transform'
-
-                # bs x max_num_crops x (24x24) x C
-                img_features = img_features.view(bs, -1, base_feat_height * base_feat_width, self.image_dim_out)
-                C = self.image_dim_out
-                H = base_feat_height
-
-                output_imgs = []
-                output_len = []
-                # training is tensor, inference is list
-                if isinstance(img_sizes, torch.Tensor):
-                    img_sizes = img_sizes.view(-1, 2)
-                num_pure_text = 0
-                for _bs in range(bs):
-                    h, w = img_sizes[_bs]
-                    h = h // 336 
-                    w = w // 336
-                    B_ = h * w
-                    if B_ == 0:
-                        num_pure_text += 1
-                        continue
-
-                    # 1 x (24x24) x 1024
-                    global_img_feature = img_features[_bs, :1]
-
-                    # 1 x 12 x 12 x 4096
-                    glb_img = global_img_feature.reshape(1,H,H,C).reshape(1,H//2,2,H//2,2,C).contiguous().permute(0,1,3,2,4,5).reshape(1,H//2,H//2,4*C).contiguous()
-                    temp_glb_GN = self.sub_GN.repeat(1, H//2, 1, 1)
-
-                    # 1 x 156 x 4096
-                    glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(1,-1,4*C)
-
-                    # (max_num_crops-1) x (12x12) x C
-                    sub_img = img_features[_bs, 1:]
-                    # 16x574x1024
-                    # get rid of padding sub_img
-                    sub_img = sub_img[:B_]
-
-                    # (num_crops, 12, 2, 12, 2, 1024) -> (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
-                    sub_img = sub_img.reshape(B_,H,H,C).reshape(B_,H//2,2,H//2,2,C).contiguous().permute(0,1,3,2,4,5).reshape(B_,-1,4*C).contiguous()
-                    sub_img = sub_img.reshape(1, h, w, 12, 12, -1).permute(0,1,3,2,4,5).reshape(1,h*12,w*12,4*C)
-                    temp_sub_GN = self.sub_GN.repeat(1, h*12, 1, 1)
-                    sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(1,-1,4*C)
-                    # (1, num_img_tokens, 1024*4)
-
-                    # glb + sub
-                    if self.hd_transform_order == 'glb_sub':
-                        output_imgs.append(torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
-                    elif self.hd_transform_order == 'sub_glb':
-                        output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
-                    else:
-                        raise NotImplementedError(f'hd_transform_order = {self.hd_transform_order}, not implemented')
-
-                    temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
-                    assert temp_len == output_imgs[-1].shape[1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}'
-                    output_len.append(temp_len)
-                
-                num_img_tokens = output_len
-                img_set_tensor = []
-                for _output_img in output_imgs:
-                    img_feature_proj = self.img_projection(_output_img.to(target_device).to(target_dtype))
-                    img_set_tensor.append(img_feature_proj)
-                # logger.info(f'img_embeds size: {img_embeds.size()}, image sizes: {img_sizes} loading time {datetime.now() - start_time}')
-            elif img_embeds.ndim == 4:
-                selected_g_values = g_values[::self.num_img_tokens]
-                assert len(img_embeds) == len(selected_g_values), f'img_embeds size: {img_embeds.size()}, selected_g_values size: {len(selected_g_values)}, selected_g_value {selected_g_values}'
-                start_time = datetime.now()
-                tt = (
-                    self.get_img_features(img_embeds)
-                    .to(target_device)
-                    .to(target_dtype)
-                    .reshape(-1, self.image_dim_out)
-                )
-                # logger.info(f'img_embeds size: {img_embeds.size()}, loading time {datetime.now() - start_time}')
-                img_set_tensor = self.img_projection(tt)  # adapted visual features.
-            elif img_embeds.ndim == 3:
-                selected_g_values = g_values[::self.num_img_tokens]
-                assert len(img_embeds) == len(selected_g_values), f'img_embeds size: {img_embeds.size()}, selected_g_values size: {len(selected_g_values)}, selected_g_value {selected_g_values}'
-                tt = (
-                    img_embeds
-                    .to(target_device)
-                    .to(target_dtype)
-                    .view(-1, self.image_dim_out)
-                )
-                img_set_tensor = self.img_projection(tt)  # adapted visual features.
-            else:
-                raise NotImplementedError
-            select = True
-        # It's a hacky way to walkaround the hang-out problem when deepspeed `zero3` is used
-        # and the training batch is a mixture of pure text and vision-language data.
-        else:
-            num_pure_text = input_ids.shape[0]
-            self.get_img_features(img_embeds.flatten(0, 1))
-        for _ in range(num_pure_text):
-            self.img_projection(torch.zeros(1, 1921, 4096, device=self.img_processor.device, dtype=self.img_processor.dtype))
-        with torch.no_grad():
-            input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
-        
-        hidden_states = self.wte(input_ids)
-
-        if select:
-            if hd_transform:
-                idx = 0
-                for i, cnt in enumerate(num_img_tokens):
-                    # see https://github.com/GaiZhenbiao/Phi3V-Finetuning/pull/5
-                    hidden_states = hidden_states.clone()
-                    hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = (
-                        img_set_tensor[i]
-                        .to(hidden_states.dtype)
-                        .to(hidden_states.device)
-                    )
-                    idx += cnt
-            else:
-                idx = 0
-                assert len(selected_g_values) * self.num_img_tokens == len(img_set_tensor), f'len(selected_g_values) * self.num_img_tokens = {len(selected_g_values) * self.num_img_tokens}, len(img_set_tensor) = {len(img_set_tensor)}'
-                for i, g in enumerate(selected_g_values):
-                    cnt = self.num_img_tokens
-                    hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = (
-                        img_set_tensor[i * cnt : (i + 1) * cnt]
-                        .to(hidden_states.dtype)
-                        .to(hidden_states.device)
-                        )
-                    idx += cnt
-
-        if self.drop is not None:
-            hidden_states = self.drop(hidden_states)
-
-        return hidden_states
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_processing_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_processing_phi3_v.py
deleted file mode 100644
index 821096527f7..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_processing_phi3_v.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# coding=utf-8
-# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Image processor class for Phi3-V."""
-
-from typing import List, Optional, Union
-
-import numpy as np
-from auto_round.utils import transformers
-from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
-from transformers.image_transforms import (
-    convert_to_rgb,
-)
-from transformers.image_utils import (
-    OPENAI_CLIP_MEAN,
-    OPENAI_CLIP_STD,
-    ImageInput,
-    make_list_of_images,
-    valid_images,
-)
-from transformers.utils import TensorType, is_vision_available, logging
-
-from transformers import AutoImageProcessor
-
-logger = logging.get_logger(__name__)
-
-
-if is_vision_available():
-    from PIL import Image
-
-import torch
-import torchvision
-
-def padding_336(b):
-    width, height = b.size
-    tar = int(np.ceil(height / 336) * 336)
-    top_padding = int((tar - height)/2)
-    bottom_padding = tar - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255])
-
-    return b
-
-def calc_padded_size(width, height, padding_unit=336):  
-    target_height = int(np.ceil(height / padding_unit) * padding_unit)  
-    top_padding = int((target_height - height) / 2)  
-    bottom_padding = target_height - height - top_padding  
-    left_padding = 0  
-    right_padding = 0  
-    padded_width = width + left_padding + right_padding  
-    padded_height = height + top_padding + bottom_padding  
-    return padded_width, padded_height  
-
-def HD_transform(img, hd_num=16):
-    width, height = img.size
-    trans = False
-    if width < height:
-        img = img.transpose(Image.TRANSPOSE)
-        trans = True
-        width, height = img.size
-    ratio = (width/ height)
-    scale = 1
-    while scale*np.ceil(scale/ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-    new_w = int(scale * 336)
-    new_h = int(new_w / ratio)
-
-    img = torchvision.transforms.functional.resize(img, [new_h, new_w],)
-    img = padding_336(img)
-    width, height = img.size
-    if trans:
-        img = img.transpose(Image.TRANSPOSE)
-
-    return img
-
-def calc_hd_transform_size(width, height, hd_num=16):  
-    transposed = False  
-    if width < height:  
-        width, height = height, width  
-        transposed = True  
-  
-    ratio = width / height  
-    scale = 1  
-    while scale * np.ceil(scale / ratio) <= hd_num:  
-        scale += 1  
-    scale -= 1  
-  
-    new_width = int(scale * 336)  
-    new_height = int(new_width / ratio)  
-  
-    padded_width, padded_height = calc_padded_size(new_width, new_height)  
-      
-    if transposed:  
-        padded_width, padded_height = padded_height, padded_width  
-  
-    return padded_width, padded_height  
-
-def pad_to_max_num_crops_tensor(images, max_crops=5):
-    """
-    images: B x 3 x H x W, B<=max_crops
-    """
-    B, _, H, W = images.shape
-    if B < max_crops:
-        pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
-        images = torch.cat([images, pad], dim=0)
-    return images
-
-
-class Phi3VImageProcessor(BaseImageProcessor):
-    r"""
-    Constructs a Phi3 image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques
-    for processing high resolution images as explained in the [InternLM-XComposer2-4KHD](https://arxiv.org/abs/2401.16420)
-
-    Args:
-        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
-            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
-            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
-        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
-            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
-            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
-            Can be overridden by the `image_std` parameter in the `preprocess` method.
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
-            Whether to convert the image to RGB.
-    """
-
-    model_input_names = ["pixel_values"]
-
-    def __init__(
-        self,
-        num_crops: int = 1,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = True,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-        self.num_crops = num_crops
-        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
-        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
-        self.do_convert_rgb = do_convert_rgb
-    
-    def calc_num_image_tokens(
-            self, 
-            images: ImageInput 
-    ):
-        """ Calculate the number of image tokens for each image.
-        Args:
-            images (`ImageInput`):
-                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
-                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
-        """
-        images = make_list_of_images(images)
-
-        if not valid_images(images):
-            raise ValueError(
-                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                "torch.Tensor, tf.Tensor or jax.ndarray."
-            )
-
-        images = [image.convert('RGB') for image in images]
-        # (H, W, C)
-        elems = [HD_transform(im, hd_num = self.num_crops) for im in images] 
-        shapes = [[im.size[1], im.size[0]] for im in elems]
-        num_img_tokens = [int((h//336*w//336+1)*144 + 1 + (h//336+1)*12) for h, w in shapes]
-        return num_img_tokens
-
-    def calc_num_image_tokens_from_image_size(self, width, height):
-        """
-        Calculate the number of image tokens for a given image size.
-        Args:
-            width (`int`): Width of the image.
-            height (`int`): Height of the image.
-        """
-        new_width, new_height = calc_hd_transform_size(width, height, hd_num=self.num_crops)  
-        num_img_tokens = int((new_height // 336 * new_width // 336 + 1) * 144 + 1 + (new_height // 336 + 1) * 12)  
-        return num_img_tokens
-
-    def preprocess(
-        self,
-        images: ImageInput,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-    ):
-        """
-        Args:
-            images (`ImageInput`):
-                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
-                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
-                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
-                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
-                `True`.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            return_tensors (`str` or `TensorType`, *optional*):
-                The type of tensors to return. Can be one of:
-                - Unset: Return a list of `np.ndarray`.
-                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
-                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
-                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
-                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
-        """
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-
-        images = make_list_of_images(images)
-
-        if not valid_images(images):
-            raise ValueError(
-                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                "torch.Tensor, tf.Tensor or jax.ndarray."
-            )
-
-        if do_convert_rgb:
-            images = [convert_to_rgb(image) for image in images]
-
-        image_sizes = []
-        img_processor = torchvision.transforms.Compose([
-            torchvision.transforms.ToTensor(),
-            torchvision.transforms.Normalize(image_mean, image_std)
-        ])
-
-        # PIL images
-        # HD_transform pad images to size of multiiply of 336, 336
-        # convert to RGB first
-        images = [image.convert('RGB') for image in images]
-        elems = [HD_transform(im, hd_num = self.num_crops) for im in images] 
-        # tensor transform and normalize
-        hd_images = [img_processor(im) for im in elems]
-        # create global image 
-        global_image = [torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(336, 336), mode='bicubic',).to(im.dtype) for im in hd_images]
-
-        # [(3, h, w)], where h, w is multiple of 336
-        shapes = [[im.size(1), im.size(2)] for im in hd_images]
-        num_img_tokens = [int((h//336*w//336+1)*144 + 1 + (h//336+1)*12) for h, w in shapes]
-        # reshape to channel dimension -> (num_images, num_crops, 3, 336, 336)
-        # (1, 3, h//336, 336, w//336, 336) -> (1, h//336, w//336, 3, 336, 336) -> (h//336*w//336, 3, 336, 336)
-        hd_images_reshape = [im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336).contiguous() for im, (h, w) in zip(hd_images, shapes)]
-        # concat global image and local image
-        hd_images_reshape = [torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)]
-
-        # pad to max_num_crops
-        image_transformed = [pad_to_max_num_crops_tensor(im, self.num_crops+1) for im in hd_images_reshape]
-        image_transformed = torch.stack(image_transformed, dim=0)
-        image_sizes = [torch.LongTensor(_shapes) for _shapes in shapes]
-        padded_images = image_transformed
-        image_sizes = shapes
-
-        data = {"pixel_values": padded_images, 
-                "image_sizes": image_sizes,
-                "num_img_tokens": num_img_tokens
-                }
-
-        return BatchFeature(data=data, tensor_type=return_tensors)
-
-AutoImageProcessor.register("Phi3VImageProcessor", Phi3VImageProcessor)
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/modeling_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/modeling_phi3_v.py
deleted file mode 100644
index 0b5357b1655..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/modeling_phi3_v.py
+++ /dev/null
@@ -1,1634 +0,0 @@
-# coding=utf-8
-# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""" PyTorch Phi-3-V model."""
-
-import inspect
-import math
-import warnings
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from .image_embedding_phi3_v import Phi3ImageEmbedding
-import transformers
-from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
-    TokenClassifierOutput,
-)
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_flash_attn_2_available,
-    is_flash_attn_greater_or_equal_2_10,
-    logging,
-    replace_return_docstrings,
-)
-from .configuration_phi3_v import Phi3VConfig
-
-
-
-logger = logging.get_logger(__name__)
-
-if is_flash_attn_2_available():
-    from flash_attn import flash_attn_func, flash_attn_varlen_func
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-
-    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
-
-_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-vision-128k-instruct"
-_CONFIG_FOR_DOC = "Phi3VConfig"
-
-PHI3V_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "microsoft/Phi-3-vision-128k-instruct",
-    # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3
-]
-
-
-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
-class Phi3RMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        Phi3RMSNorm is equivalent to T5LayerNorm
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-
-# Copied from transformers.models.llama.modeling_llama._get_unpad_data
-def _get_unpad_data(attention_mask):
-    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
-    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
-    return (
-        indices,
-        cu_seqlens,
-        max_seqlen_in_batch,
-    )
-
-
-# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
-class Phi3RotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
-        super().__init__()
-
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        self.register_buffer("inv_freq", None, persistent=False)
-
-    @torch.no_grad()
-    def forward(self, x, position_ids, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if self.inv_freq is None:
-            self.inv_freq = 1.0 / (
-                self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
-            )
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
-        position_ids_expanded = position_ids[:, None, :].float()
-        # Force float32 since bfloat16 loses precision on long contexts
-        # See https://github.com/huggingface/transformers/pull/29285
-        device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos()
-            sin = emb.sin()
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-
-
-class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
-    def __init__(self, dim, config, device=None):
-        super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
-
-        self.short_factor = config.rope_scaling["short_factor"]
-        self.long_factor = config.rope_scaling["long_factor"]
-        self.original_max_position_embeddings = config.original_max_position_embeddings
-
-    @torch.no_grad()
-    def forward(self, x, position_ids, seq_len=None):
-        seq_len = torch.max(position_ids) + 1
-        if seq_len > self.original_max_position_embeddings:
-            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
-        else:
-            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
-
-        inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
-        self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
-
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
-        position_ids_expanded = position_ids[:, None, :].float()
-
-        # Force float32 since bfloat16 loses precision on long contexts
-        # See https://github.com/huggingface/transformers/pull/29285
-        device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-
-            scale = self.max_position_embeddings / self.original_max_position_embeddings
-            if scale <= 1.0:
-                scaling_factor = 1.0
-            else:
-                scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
-
-            cos = emb.cos() * scaling_factor
-            sin = emb.sin() * scaling_factor
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-
-
-class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
-    def __init__(self, dim, config, device=None):
-        super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
-
-        self.short_factor = config.rope_scaling["short_factor"]
-        self.long_factor = config.rope_scaling["long_factor"]
-        self.original_max_position_embeddings = config.original_max_position_embeddings
-
-    @torch.no_grad()
-    def forward(self, x, position_ids, seq_len=None):
-        seq_len = torch.max(position_ids) + 1
-        if seq_len > self.original_max_position_embeddings:
-            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
-        else:
-            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
-
-        inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
-        self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
-
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
-        position_ids_expanded = position_ids[:, None, :].float()
-
-        # Force float32 since bfloat16 loses precision on long contexts
-        # See https://github.com/huggingface/transformers/pull/29285
-        device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-
-            scale = self.max_position_embeddings / self.original_max_position_embeddings
-            if scale <= 1.0:
-                scaling_factor = 1.0
-            else:
-                scaling_factor = 0.1 * math.log(scale) + 1.0
-
-            cos = emb.cos() * scaling_factor
-            sin = emb.sin() * scaling_factor
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-
-
-# Copied from transformers.models.llama.modeling_llama.rotate_half
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`, *optional*):
-            Deprecated and unused.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-class Phi3MLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-
-        self.config = config
-        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
-
-        self.activation_fn = ACT2FN[config.hidden_act]
-
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
-        up_states = self.gate_up_proj(hidden_states)
-
-        gate, up_states = up_states.chunk(2, dim=-1)
-        up_states = up_states * self.activation_fn(gate)
-
-        return self.down_proj(up_states)
-
-
-# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-class Phi3Attention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(self, config: Phi3VConfig, layer_idx: Optional[int] = None):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        if layer_idx is None:
-            logger.warning_once(
-                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
-                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
-                "when creating this class."
-            )
-
-        self.attention_dropout = config.attention_dropout
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_dim = self.hidden_size // self.num_heads
-        self.num_key_value_heads = config.num_key_value_heads
-        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
-        self.max_position_embeddings = config.max_position_embeddings
-        self.original_max_position_embeddings = config.original_max_position_embeddings
-        self.rope_theta = config.rope_theta
-        self.rope_scaling = config.rope_scaling
-        self.is_causal = True
-
-        if (self.head_dim * self.num_heads) != self.hidden_size:
-            raise ValueError(
-                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
-                f" and `num_heads`: {self.num_heads})."
-            )
-
-        op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
-        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
-        self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
-        self._init_rope()
-
-    def _init_rope(self):
-        if self.rope_scaling is None:
-            self.rotary_emb = Phi3RotaryEmbedding(
-                self.head_dim,
-                max_position_embeddings=self.max_position_embeddings,
-                base=self.rope_theta,
-            )
-        else:
-            scaling_type = self.config.rope_scaling["type"]
-            if scaling_type == "su":
-                self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config)
-            elif scaling_type == "yarn":
-                self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config)
-            else:
-                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.")
-
-        bsz, q_len, _ = hidden_states.size()
-
-        qkv = self.qkv_proj(hidden_states)
-        query_pos = self.num_heads * self.head_dim
-        query_states = qkv[..., :query_pos]
-        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
-        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            if self.layer_idx is None:
-                raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                    "with a layer index."
-                )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
-
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-        if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
-
-        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-            attn_weights = attn_weights + attention_mask
-
-        # upcast attention to fp32
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
-        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
-
-        attn_output = torch.matmul(attn_weights, value_states)
-
-        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-
-class Phi3FlashAttention2(Phi3Attention):
-    """
-    Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stays
-    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
-    flash attention and deal with padding tokens in case the input contains any of them.
-    """
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
-        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
-        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # Phi3FlashAttention2 attention does not support output_attentions
-
-        if not _flash_supports_window_size:
-            logger.warning_once(
-                "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library."
-            )
-            raise ValueError("The current flash attention version does not support sliding window attention.")
-
-        output_attentions = False
-
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-
-            # overwrite attention_mask with padding_mask
-            attention_mask = kwargs.pop("padding_mask")
-
-        bsz, q_len, _ = hidden_states.size()
-
-        qkv = self.qkv_proj(hidden_states)
-        query_pos = self.num_heads * self.head_dim
-        query_states = qkv[..., :query_pos]
-        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
-        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
-
-        # Flash attention requires the input to have the shape
-        # batch_size x seq_length x head_dim x hidden_dim
-        # therefore we just need to keep the original shape
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            if self.layer_idx is None:
-                raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                    "with a layer index."
-                )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-
-        # Because the input can be padded, the absolute sequence length depends on the max position id.
-        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
-        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len)
-
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-        use_sliding_windows = (
-            _flash_supports_window_size
-            and getattr(self.config, "sliding_window", None) is not None
-            and kv_seq_len > self.config.sliding_window
-        )
-
-        if past_key_value is not None:
-            # Activate slicing cache only if the config has a value `sliding_windows` attribute
-            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
-            if (
-                getattr(self.config, "sliding_window", None) is not None
-                and kv_seq_len > self.config.sliding_window
-                and cache_has_contents
-            ):
-                slicing_tokens = 1 - self.config.sliding_window
-
-                past_key = past_key_value[self.layer_idx][0]
-                past_value = past_key_value[self.layer_idx][1]
-
-                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
-                past_value = past_value[:, :, slicing_tokens:, :].contiguous()
-
-                if past_key.shape[-2] != self.config.sliding_window - 1:
-                    raise ValueError(
-                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
-                        f" {past_key.shape}"
-                    )
-
-                if attention_mask is not None:
-                    attention_mask = attention_mask[:, slicing_tokens:]
-                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
-
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        attn_dropout = self.attention_dropout if self.training else 0.0
-
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in the correct dtype just to be sure everything works as expected.
-        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32.
-
-        if query_states.dtype == torch.float32:
-            if torch.is_autocast_enabled():
-                target_dtype = torch.get_autocast_gpu_dtype()
-            # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.qkv_proj.weight.dtype
-
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
-
-        # Reashape to the expected shape for Flash Attention
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-
-        attn_output = self._flash_attention_forward(
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            q_len,
-            dropout=attn_dropout,
-            use_sliding_windows=use_sliding_windows,
-        )
-
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward
-    def _flash_attention_forward(
-        self,
-        query_states,
-        key_states,
-        value_states,
-        attention_mask,
-        query_length,
-        dropout=0.0,
-        softmax_scale=None,
-        use_sliding_windows=False,
-    ):
-        """
-        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
-        first unpad the input, then computes the attention scores and pad the final attention scores.
-
-        Args:
-            query_states (`torch.Tensor`):
-                Input query states to be passed to Flash Attention API
-            key_states (`torch.Tensor`):
-                Input key states to be passed to Flash Attention API
-            value_states (`torch.Tensor`):
-                Input value states to be passed to Flash Attention API
-            attention_mask (`torch.Tensor`):
-                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
-                position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`float`):
-                Attention dropout
-            softmax_scale (`float`, *optional*):
-                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
-            use_sliding_windows (`bool`, *optional*):
-                Whether to activate sliding window attention.
-        """
-        if not self._flash_attn_uses_top_left_mask:
-            causal = self.is_causal
-        else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
-            causal = self.is_causal and query_length != 1
-
-        # Contains at least one padding token in the sequence
-        if attention_mask is not None:
-            batch_size = query_states.shape[0]
-            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
-                query_states, key_states, value_states, attention_mask, query_length
-            )
-
-            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-
-            if not use_sliding_windows:
-                attn_output_unpad = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_q=max_seqlen_in_batch_q,
-                    max_seqlen_k=max_seqlen_in_batch_k,
-                    dropout_p=dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                )
-            else:
-                attn_output_unpad = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_q=max_seqlen_in_batch_q,
-                    max_seqlen_k=max_seqlen_in_batch_k,
-                    dropout_p=dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                    window_size=(self.config.sliding_window, self.config.sliding_window),
-                )
-
-            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
-        else:
-            if not use_sliding_windows:
-                attn_output = flash_attn_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                )
-            else:
-                attn_output = flash_attn_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                    window_size=(self.config.sliding_window, self.config.sliding_window),
-                )
-
-        return attn_output
-
-    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
-    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
-        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
-
-        # On the first iteration we need to properly re-create the padding mask
-        # by slicing it on the proper place
-        if kv_seq_len != attention_mask.shape[-1]:
-            attention_mask_num_tokens = attention_mask.shape[-1]
-            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
-
-        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
-
-        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
-        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
-
-        if query_length == kv_seq_len:
-            query_layer = index_first_axis(
-                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
-            )
-            cu_seqlens_q = cu_seqlens_k
-            max_seqlen_in_batch_q = max_seqlen_in_batch_k
-            indices_q = indices_k
-        elif query_length == 1:
-            max_seqlen_in_batch_q = 1
-            cu_seqlens_q = torch.arange(
-                batch_size + 1, dtype=torch.int32, device=query_layer.device
-            )  # There is a memcpy here, that is very bad.
-            indices_q = cu_seqlens_q[:-1]
-            query_layer = query_layer.squeeze(1)
-        else:
-            # The -q_len: slice assumes left padding.
-            attention_mask = attention_mask[:, -query_length:]
-            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
-
-        return (
-            query_layer,
-            key_layer,
-            value_layer,
-            indices_q,
-            (cu_seqlens_q, cu_seqlens_k),
-            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
-        )
-
-
-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3
-# TODO @Arthur no longer copied from LLama after static cache
-class Phi3SdpaAttention(Phi3Attention):
-    """
-    Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `Phi3Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
-    SDPA API.
-    """
-
-    # Adapted from Phi3Attention.forward
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if output_attentions:
-            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
-            logger.warning_once(
-                "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
-                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-            )
-            return super().forward(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-            )
-
-        bsz, q_len, _ = hidden_states.size()
-
-        qkv = self.qkv_proj(hidden_states)
-        query_pos = self.num_heads * self.head_dim
-        query_states = qkv[..., :query_pos]
-        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
-        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
-
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-        if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-
-        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
-        # Reference: https://github.com/pytorch/pytorch/issues/112577.
-        if query_states.device.type == "cuda" and attention_mask is not None:
-            query_states = query_states.contiguous()
-            key_states = key_states.contiguous()
-            value_states = value_states.contiguous()
-
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            dropout_p=self.attention_dropout if self.training else 0.0,
-            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
-            is_causal=self.is_causal and attention_mask is None and q_len > 1,
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
-
-        attn_output = self.o_proj(attn_output)
-
-        return attn_output, None, past_key_value
-
-
-PHI3_ATTENTION_CLASSES = {
-    "eager": Phi3Attention,
-    "flash_attention_2": Phi3FlashAttention2,
-    "sdpa": Phi3SdpaAttention,
-}
-
-
-class Phi3DecoderLayer(nn.Module):
-    def __init__(self, config: Phi3VConfig, layer_idx: int):
-        super().__init__()
-
-        self.config = config
-        self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
-
-        self.mlp = Phi3MLP(config)
-        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
-        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
-        self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        **kwargs,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`):
-                input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
-                `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-        """
-
-        residual = hidden_states
-
-        hidden_states = self.input_layernorm(hidden_states)
-
-        # Self Attention
-        attn_outputs, self_attn_weights, present_key_value = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-        )
-
-        hidden_states = residual + self.resid_attn_dropout(attn_outputs)
-
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + self.resid_mlp_dropout(hidden_states)
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
-
-
-PHI3V_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`Phi3VConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare Phi-3-V model outputting raw hidden-states without any specific head on top.",
-    PHI3V_START_DOCSTRING,
-)
-class Phi3VPreTrainedModel(PreTrainedModel):
-    config_class = Phi3VConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Phi3DecoderLayer"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_sdpa = False
-    _supports_cache_class = True
-
-    _version = "0.0.5"
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-
-PHI3V_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
-            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
-            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
-            Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
-            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
-            cache format.
-
-            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
-            legacy cache format will be returned.
-
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
-            The tensors corresponding to the input images. Pixel values can be obtained using [`AutoImageProcessor`]. 
-            See [`Phi3ImageProcessor.__call__`] for details. 
-        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`, *optional*):
-            The sizes of the images in the batch, being (height, width) for each image.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare Phi-3-V model outputting raw hidden-states without any specific head on top.",
-    PHI3V_START_DOCSTRING,
-)
-class Phi3VModel(Phi3VPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
-
-    Args:
-        config: Phi3Config
-    """
-
-    def __init__(self, config: Phi3VConfig):
-        super().__init__(config)
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.embed_dropout = nn.Dropout(config.embd_pdrop)
-
-        self.vision_embed_tokens = None
-        if isinstance(config.embd_layer, dict):
-            # vision embedding layer
-            embedding_config = {
-                'embedding_cls': config.embd_layer['embedding_cls'],
-                **config.embd_layer
-            }
-            self.vision_embed_tokens = Phi3ImageEmbedding(config, wte=self.embed_tokens, **embedding_config)
-            # # set wte the same for vision embedding 
-            # self.vision_embed_tokens.wte.weight = self.embed_tokens.weight
-
-        self.layers = nn.ModuleList(
-            [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
-        )
-        self._attn_implementation = config._attn_implementation
-        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-
-    @add_start_docstrings_to_model_forward(PHI3V_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_sizes: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape[:2]
-        elif inputs_embeds is not None:
-            batch_size, seq_length = inputs_embeds.shape[:2]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        past_key_values_length = 0
-
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
-        if use_cache:
-            use_legacy_cache = not isinstance(past_key_values, Cache)
-            if use_legacy_cache:
-                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            past_key_values_length = past_key_values.get_usable_length(seq_length)
-
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
-            )
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-        else:
-            position_ids = position_ids.view(-1, seq_length).long()
-
-        if inputs_embeds is None:
-            if pixel_values is not None and image_sizes is not None:
-                assert self.vision_embed_tokens is not None, "Vision embedding layer is not defined"
-                inputs_embeds = self.vision_embed_tokens(input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
-            else:
-                inputs_embeds = self.embed_tokens(input_ids)
-
-        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
-            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
-            if is_padding_right:
-                raise ValueError(
-                    "You are attempting to perform batched generation with padding_side='right'"
-                    " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to "
-                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
-                )
-
-        if self._attn_implementation == "flash_attention_2":
-            # 2d mask is passed through the layers
-            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        else:
-            # 4d mask is passed through the layers
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-                sliding_window=self.config.sliding_window,
-            )
-
-        hidden_states = inputs_embeds
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        next_decoder_cache = None
-
-        for decoder_layer in self.layers:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            if self.gradient_checkpointing and self.training:
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    decoder_layer.__call__,
-                    hidden_states,
-                    attention_mask,
-                    position_ids,
-                    past_key_values,
-                    output_attentions,
-                    use_cache,
-                    use_reentrant=False
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_values,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                )
-
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
-        if not return_dict:
-            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-
-class Phi3VForCausalLM(Phi3VPreTrainedModel):
-    _tied_weights_keys = ["lm_head.weight"]
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
-    def __init__(self, config):
-        super().__init__(config)
-        self.model = Phi3VModel(config)
-        self.vocab_size = config.vocab_size
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
-    def set_input_embeddings(self, value):
-        self.model.embed_tokens = value
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
-    def set_decoder(self, decoder):
-        self.model = decoder
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
-    def get_decoder(self):
-        return self.model
-
-    # Ignore copy
-    @add_start_docstrings_to_model_forward(PHI3V_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_sizes: Optional[torch.LongTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, Phi3ForCausalLM
-
-        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
-        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
-
-        >>> prompt = "This is an example script ."
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
-        ```"""
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_sizes=image_sizes,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = outputs[0]
-        logits = self.lm_head(hidden_states)
-        logits = logits.float()
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
-
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-    # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation
-    def prepare_inputs_for_generation(
-        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, pixel_values=None, image_sizes=None, **kwargs
-    ):
-        if past_key_values is not None:
-            if isinstance(past_key_values, Cache):
-                cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
-            else:
-                cache_length = past_length = past_key_values[0][0].shape[2]
-                max_cache_length = None
-
-            # Keep only the unprocessed tokens:
-            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
-            # input)
-            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
-            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
-            # input_ids based on the past_length.
-            elif past_length < input_ids.shape[1]:
-                input_ids = input_ids[:, past_length:]
-            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-
-            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-            if (
-                max_cache_length is not None
-                and attention_mask is not None
-                and cache_length + input_ids.shape[1] > max_cache_length
-            ):
-                attention_mask = attention_mask[:, -max_cache_length:]
-
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-                "pixel_values": pixel_values,
-                "image_sizes": image_sizes,
-            }
-        )
-        return model_inputs
-
-    @staticmethod
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
-    def _reorder_cache(past_key_values, beam_idx):
-        reordered_past = ()
-        for layer_past in past_key_values:
-            reordered_past += (
-                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
-            )
-        return reordered_past
-
-
-@add_start_docstrings(
-    """
-    The [`Phi3VModel`] with a sequence classification head on top (linear layer).
-
-    [`Phi3VForSequenceClassification`] uses the last token in order to do the classification, as other causal models
-    (e.g. GPT-2) do.
-
-    Since it does classification on the last token, it requires to know the position of the last token. If a
-    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
-    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
-    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
-    each row of the batch).
-    """,
-    PHI3V_START_DOCSTRING,
-)
-# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
-class Phi3VForSequenceClassification(Phi3VPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.model = Phi3VModel(config)
-        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.embed_tokens = value
-
-    @add_start_docstrings_to_model_forward(PHI3V_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_sizes: Optional[torch.LongTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        model_outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_sizes=image_sizes,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = model_outputs[0]
-        logits = self.score(hidden_states)
-
-        if input_ids is not None:
-            batch_size = input_ids.shape[0]
-        else:
-            batch_size = inputs_embeds.shape[0]
-
-        if self.config.pad_token_id is None and batch_size != 1:
-            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
-        if self.config.pad_token_id is None:
-            sequence_lengths = -1
-        else:
-            if input_ids is not None:
-                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
-                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
-                sequence_lengths = sequence_lengths % input_ids.shape[-1]
-                sequence_lengths = sequence_lengths.to(logits.device)
-            else:
-                sequence_lengths = -1
-
-        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
-
-        loss = None
-        if labels is not None:
-            labels = labels.to(logits.device)
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(pooled_logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(pooled_logits, labels)
-        if not return_dict:
-            output = (pooled_logits,) + model_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return SequenceClassifierOutputWithPast(
-            loss=loss,
-            logits=pooled_logits,
-            past_key_values=model_outputs.past_key_values,
-            hidden_states=model_outputs.hidden_states,
-            attentions=model_outputs.attentions,
-        )
-
-
-@add_start_docstrings(
-    """
-    [`Phi3VModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
-    Named-Entity-Recognition (NER) tasks.
-    """,
-    PHI3V_START_DOCSTRING,
-)
-# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
-class Phi3VForTokenClassification(Phi3VPreTrainedModel):
-    def __init__(self, config: Phi3VConfig):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.model = Phi3VModel(config)
-        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
-            classifier_dropout = config.classifier_dropout
-        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
-            classifier_dropout = config.hidden_dropout
-        else:
-            classifier_dropout = 0.1
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    @add_start_docstrings_to_model_forward(PHI3V_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=TokenClassifierOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_sizes: Optional[torch.LongTensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        **deprecated_arguments,
-    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        model_outputs = self.model(
-            input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            inputs_embeds=inputs_embeds,
-            pixel_values=pixel_values,
-            image_sizes=image_sizes,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = model_outputs[0]
-        hidden_states = self.dropout(hidden_states)
-        logits = self.classifier(hidden_states)
-
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(logits.device)
-            batch_size, seq_length = labels.shape
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(
-                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
-            )
-
-        if not return_dict:
-            output = (logits,) + model_outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return TokenClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=model_outputs.hidden_states,
-            attentions=model_outputs.attentions,
-        )
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/processing_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/processing_phi3_v.py
deleted file mode 100644
index 1263845d1bc..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/processing_phi3_v.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# coding=utf-8
-# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Modified by Github@GaiZhenbiao
-
-"""
-Processor class for Phi3-V.
-"""
-import re
-from typing import List, Optional, Union
-
-import torch
-from .image_processing_phi3_v import Phi3VImageProcessor
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput
-from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
-from transformers.utils import TensorType
-import transformers
-transformers.Phi3VImageProcessor = Phi3VImageProcessor
-
-class Phi3VProcessor(ProcessorMixin):
-    r"""
-    Constructs a Phi3-V processor which wraps a Phi3-V image processor and a LLaMa tokenizer into a single processor.
-
-    [`Phi3VProcessor`] offers all the functionalities of [`Phi3VImageProcessor`] and [`LlamaTokenizerFast`]. See the
-    [`~Phi3VProcessor.__call__`] and [`~Phi3VProcessor.decode`] for more information.
-
-    Args:
-        image_processor ([`Phi3VImageProcessor`], *optional*):
-            The image processor is a required input.
-        tokenizer ([`LlamaTokenizerFast`], *optional*):
-            The tokenizer is a required input.
-    """
-
-    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "Phi3VImageProcessor"
-    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
-    special_image_token = "<|image|>"
-
-    def __init__(self, image_processor, tokenizer):
-        """Store the wrapped components and precompute the image placeholder tags.
-
-        Args:
-            image_processor: Image processor; its `num_img_tokens` attribute is copied onto this object.
-            tokenizer: Tokenizer used to encode the text part of a prompt.
-        """
-        self.image_processor = image_processor
-        self.tokenizer = tokenizer
-        self.num_img_tokens = image_processor.num_img_tokens
-        # NOTE: eagerly builds one million tag strings, "<|image_1|>" through "<|image_1000000|>".
-        self.img_tokens = [f"<|image_{tag_index}|>" for tag_index in range(1, 1000001)]
-
-    def __call__(
-        self,
-        text: Union[TextInput, List[TextInput]],
-        images: ImageInput = None,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length=None,
-        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
-    ) -> BatchFeature:
-        """
-        Prepare model inputs from a prompt and, optionally, one or more images.
-
-        When `images` is given it is run through the image processor; the result (or an empty dict when there
-        are no images) is handed, together with `text` and the tokenizer options, to
-        `_convert_images_texts_to_inputs`, whose return value is returned unchanged.
-
-        Args:
-            text (`str`):
-                The prompt to encode. NOTE(review): the annotation also allows a list of prompts, but the
-                conversion step applies string operations to this value, so a single string looks like the
-                only supported case -- confirm before passing a batch.
-            images (`ImageInput`, *optional*):
-                Image or images to preprocess. `None` selects the text-only path.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Padding strategy, forwarded to the conversion step.
-            truncation (`bool`, `str` or [`~utils.TruncationStrategy`], *optional*):
-                Truncation strategy, forwarded to the conversion step.
-            max_length (`int`, *optional*):
-                Maximum length, forwarded to the conversion step.
-            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `TensorType.PYTORCH`):
-                Tensor type passed to the image processor and to the conversion step.
-
-        Returns:
-            [`BatchFeature`]: The features built by `_convert_images_texts_to_inputs`.
-        """
-        image_inputs = {} if images is None else self.image_processor(images, return_tensors=return_tensors)
-        return self._convert_images_texts_to_inputs(
-            image_inputs,
-            text,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            return_tensors=return_tensors,
-        )
-
-    def calc_num_image_tokens(self, images: ImageInput):
-        """Return the image processor's image-token count for `images`.
-
-        Args:
-            images (`ImageInput`):
-                Image or batch of images, passed through unchanged to
-                `image_processor.calc_num_image_tokens`.
-        """
-        processor = self.image_processor
-        return processor.calc_num_image_tokens(images)
-
-    def calc_num_image_tokens_from_image_size(self, width, height):
-        """Return the image processor's image-token count for an image of the given size.
-
-        Args:
-            width (`int`):
-                Width of the image.
-            height (`int`):
-                Height of the image.
-        """
-        processor = self.image_processor
-        return processor.calc_num_image_tokens_from_image_size(width, height)
-
-
-    @property
-    def special_image_token_id(self):
-        """Id that the tokenizer assigns to `special_image_token`."""
-        token = self.special_image_token
-        return self.tokenizer.convert_tokens_to_ids(token)
-
-    def get_special_image_token_id(self):
-        """Method form of the `special_image_token_id` lookup; performs the same tokenizer query."""
-        token = self.special_image_token
-        return self.tokenizer.convert_tokens_to_ids(token)
-
-    def _convert_images_texts_to_inputs(self, images, texts, padding=False, truncation=None, max_length=None, return_tensors=None):
-        """Build `input_ids`, `labels` and related fields from a chat prompt and processed images.
-
-        The prompt is split on the role markers (`<|user|>`, `<|assistant|>` and `<|end|>`, each followed by
-        a newline) and on the `<|image_1|>` tag. Tokens of assistant content, and of the `<|end|>` marker
-        seen while the current role is assistant, are copied into `labels`; every other position is
-        labelled -100.
-
-        Args:
-            images: Image processor output with `pixel_values`, `image_sizes` and either `num_img_tokens`
-                or `num_crops`; an empty mapping selects the text-only path.
-            texts (`str`): The prompt. It is searched with `str.find` and `re.findall`, so it must be a
-                single string.
-            padding, truncation, max_length, return_tensors: Passed to the tokenizer on the text-only path.
-                They are not used when images are present.
-
-        Returns:
-            [`BatchFeature`]: On the text-only path, the tokenizer output plus `labels`. With images,
-            `input_ids`, `attention_mask`, `pixel_values`, `image_sizes` and `labels`, where each image tag
-            is replaced in `input_ids` by a run of the negated image id.
-
-        Raises:
-            AssertionError: If label and input lengths differ (text-only path), if neither `num_img_tokens`
-                nor `num_crops` is provided, if image ids are not the contiguous range 1..n, or if the
-                number of distinct image tags differs from the number of images.
-        """
-
-        def split_with_separators(s, separators):
-            """Split `s` on every occurrence of any separator, keeping the separators as parts."""
-            parts = []
-            start = 0
-            while start < len(s):
-                # (position, separator) for each separator that still occurs at or after `start`.
-                hits = [(s.find(sep, start), sep) for sep in separators]
-                hits = [hit for hit in hits if hit[0] != -1]
-                if not hits:
-                    # No separator left: keep the trailing text instead of calling min() on an empty
-                    # sequence, which raises ValueError.
-                    parts.append(s[start:])
-                    break
-                position, separator = min(hits)
-                if s[start:position]:
-                    parts.append(s[start:position])
-                parts.append(separator)
-                start = position + len(separator)
-            return parts
-
-        def split_with_roles(input_text):
-            """Tag each part with the current role and a type (0 text/image, 1 user, 2 assistant, 3 end)."""
-            role_markers = ["<|user|>\n", "<|assistant|>\n", "<|end|>\n"]
-            parts = split_with_separators(input_text, ["<|user|>\n", "<|end|>\n", "<|assistant|>\n", "<|image_1|>"])
-            new_parts = []
-            current_role = None
-            for p in parts:
-                if p in role_markers:
-                    # `<|end|>` keeps the role of the turn it closes.
-                    if p == "<|user|>\n":
-                        current_role = "user"
-                    elif p == "<|assistant|>\n":
-                        current_role = "assistant"
-                    _type = role_markers.index(p) + 1
-                    new_parts.append({"role": current_role, "content": p, "type": _type})
-                else:
-                    new_parts.append({"role": current_role, "content": p, "type": 0})
-            return new_parts
-
-        if not len(images):
-            model_inputs = self.tokenizer(texts, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length)
-            label_prompt_chunks = []
-            # the behavior of the tokenizer is very very weird, what I observed is concluded by the following:
-            # 1. "<|user|>\n" is encoded as 3 tokens, while "<|assistant|>\n" is encoded as 1 tokens
-            # 2. tokenizing "I am here" and "\nI am here", the tokens of "I" in these two cases are different ("I" can be any word and is used as an example here)
-            # 3. when tokenizing "<|user|>\nI am here", the tokens of "I" follow the tokenization of "I" in "\nI am here"
-            # 4. when tokenizing "<|assistant|>\nI am here", the tokens of "I" follow the tokenization of "I" in "I am here"
-            # [Edited by zhenwei - 2024-06-01 22:25]
-            for chunk in split_with_roles(texts):
-                if chunk["role"] == "assistant" and chunk['type'] in [0, 3]:
-                    tmp_input_ids = self.tokenizer(chunk["content"], add_special_tokens=False).input_ids
-                    label_prompt_chunks.append(tmp_input_ids)
-                else:
-                    # Tokenize with a leading newline and drop the first two ids (see the notes above).
-                    tmp_input_ids = self.tokenizer('\n' + chunk["content"], add_special_tokens=False).input_ids[2:]
-                    label_prompt_chunks.append([-100 for _ in range(len(tmp_input_ids))])
-
-            # One leading -100 is added before the chunk labels; the assert below checks the total length.
-            labels = [-100]
-            for chunk in label_prompt_chunks:
-                labels.extend(chunk)
-
-            labels = torch.tensor(labels, dtype=torch.long).unsqueeze(0)
-            assert labels.shape[1] == model_inputs['input_ids'].shape[1], f"labels length: {labels.shape[1]}, input_ids length: {model_inputs['input_ids'].shape[1]}"
-            return BatchFeature(data={**model_inputs, "labels": labels})
-
-
-        if 'num_img_tokens' in images:
-            num_img_tokens = images['num_img_tokens']
-        else:
-            assert 'num_crops' in images, 'num_crops must be provided in images if num_img_tokens is not provided'
-            num_crops = images['num_crops']
-            num_img_tokens = [_num_crops * self.num_img_tokens for _num_crops in num_crops]
-
-        images, image_sizes = images['pixel_values'], images['image_sizes']
-
-        pattern = r"<\|image_\d+\|>"
-        # image_tags needs to start from 1 to n
-        image_tags = re.findall(pattern, texts)
-        image_ids = [int(s.split("|")[1].split("_")[-1]) for s in image_tags]
-        unique_image_ids = sorted(list(set(image_ids)))
-        # image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be [1, 4, 5]
-        # check the condition
-        assert unique_image_ids == list(range(1, len(unique_image_ids)+1)), f"image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be {unique_image_ids}"
-        # total images must be the same as the number of image tags
-        assert len(unique_image_ids) == len(images), f"total images must be the same as the number of image tags, got {len(unique_image_ids)} image tags and {len(images)} images"
-
-        # One run of the negated image id per tag occurrence, sized by that image's token count.
-        image_ids_pad = [[-iid]*num_img_tokens[iid-1] for iid in image_ids]
-
-        prompt_chunks = []
-        label_prompt_chunks = []
-        for chunk in split_with_roles(texts):
-            if chunk["role"] == "assistant" and chunk['type'] in [0, 3]:
-                tmp_input_ids = self.tokenizer(chunk["content"], add_special_tokens=False).input_ids
-                prompt_chunks.append(tmp_input_ids)
-                label_prompt_chunks.append(tmp_input_ids)
-            else:
-                if chunk["content"] == "<|image_1|>":
-                    tmp_input_ids = image_ids_pad.pop(0)
-                else:
-                    tmp_input_ids = self.tokenizer('\n' + chunk["content"], add_special_tokens=False).input_ids[2:]
-                prompt_chunks.append(tmp_input_ids)
-                label_prompt_chunks.append([-100 for _ in range(len(tmp_input_ids))])
-
-        # The sequence starts with a literal id 1, labelled -100.
-        input_ids = [1]
-        labels = [-100]
-        for chunk in prompt_chunks:
-            input_ids.extend(chunk)
-        for chunk in label_prompt_chunks:
-            labels.extend(chunk)
-        input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)
-        labels = torch.tensor(labels, dtype=torch.long).unsqueeze(0)
-        # The threshold is far below any negated image id above, so the image placeholders are kept too.
-        attention_mask = (input_ids > -1000000).to(torch.long)
-
-        return BatchFeature(data={"input_ids": input_ids,
-                                  "attention_mask": attention_mask,
-                                  "pixel_values": images,
-                                  "image_sizes": image_sizes,
-                                  "labels": labels})
-
-
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
-    def batch_decode(self, *args, **kwargs):
-        """
-        Delegate to the wrapped tokenizer's `batch_decode`, passing every positional and keyword argument
-        through unchanged. See that method's docstring for the accepted arguments.
-        """
-        tokenizer = self.tokenizer
-        return tokenizer.batch_decode(*args, **kwargs)
-
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
-    def decode(self, *args, **kwargs):
-        """
-        Delegate to the wrapped tokenizer's `decode`, passing every positional and keyword argument through
-        unchanged. See that method's docstring for the accepted arguments.
-        """
-        tokenizer = self.tokenizer
-        return tokenizer.decode(*args, **kwargs)
-
-    @property
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
-    def model_input_names(self):
-        """Tokenizer input names followed by image processor input names, duplicates removed in order."""
-        combined = self.tokenizer.model_input_names + self.image_processor.model_input_names
-        unique_names = []
-        seen = set()
-        for input_name in combined:
-            if input_name not in seen:
-                seen.add(input_name)
-                unique_names.append(input_name)
-        return unique_names
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt
deleted file mode 100644
index 6bba92c0b02..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-transformers==4.41.0
-torch
-tiktoken
-transformers_stream_generator
-peft
-sentencepiece
-einops
-accelerate
-datasets
-protobuf
-auto-gptq
-openpyxl
-wandb
-py-cpuinfo
-Pillow
-torchvision
-# lm-eval==0.4.4
-setuptools==70.0.0
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
deleted file mode 100644
index 9fe47669fb1..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-set -x
-
-# Entry point: parse the command-line flags, then launch quantization.
-function main {
-  init_params "$@"   # fills model_name, image_folder, question_file, output_dir
-  run_tuning
-}
-
-# init params
-function init_params {
-  # Parse --key=value arguments into the globals read by run_tuning.
-  # ${var#*=} removes everything up to and including the first '=', so values
-  # that contain '=', spaces or glob characters are kept intact.
-  for var in "$@"
-  do
-    case $var in
-      --model_name=*)
-          model_name=${var#*=}
-      ;;
-      --image_folder=*)
-          image_folder=${var#*=}
-      ;;
-      --question_file=*)
-          question_file=${var#*=}
-      ;;
-      --output_dir=*)
-          output_dir=${var#*=}
-      ;;
-      *)
-          # Unknown flags abort the script.
-          echo "Error: No such parameter: ${var}"
-          exit 1
-      ;;
-    esac
-  done
-
-}
-
-# run_tuning
-function run_tuning {
-    # Launch quantization with the values parsed by init_params.
-    # The expansions are quoted so paths containing spaces or glob characters
-    # reach main.py as single arguments.
-    python main.py \
-            --model_name "${model_name}" \
-            --nsamples 512 \
-            --quantize \
-            --image_folder "${image_folder}" \
-            --question_file "${question_file}" \
-            --output_dir "${output_dir}"
-}
-
-main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md
deleted file mode 100644
index 00b9f0ee330..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md
+++ /dev/null
@@ -1,265 +0,0 @@
-Step-by-Step
-============
-This document describes the step-by-step instructions to run [VLM quantization for Qwen-VL](https://huggingface.co/Qwen/Qwen-VL) using AutoRound Quantization.
-
-# Run Quantization on Qwen-VL Models
-
-In this example, we introduce a straightforward way to execute quantization on some popular multimodal models such as Qwen-VL.
-
-## Download the calibration data
-
-Our calibration process resembles the official visual instruction tuning process.
-
-Please download the annotation file for the final mixture of our instruction tuning data, [llava_v1_5_mix665k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json), and download the images from the constituent datasets:
-
-COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip the image folder to any directory you desire.
-
-You can also refer to the official Qwen-VL finetuning requirements to create a [custom dataset](https://github.com/QwenLM/Qwen-VL/blob/master/README.md#data-preparation)
-
-## Download the evaluation data
-
-Please refer to [Qwen-VL evaluation](https://github.com/cognitedata/Qwen-VL-finetune/blob/master/eval_mm/EVALUATION.md)
-<details>
-<summary>TextVQA Data Preparation</summary>
-
-```bash
-mkdir -p data/textvqa && cd data/textvqa
-
-# download images
-wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip && unzip train_val_images.zip
-
-# download annotations and questions
-wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_train.json
-wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json
-
-# download converted files
-wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_annotations.json
-wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_questions.json
-wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train.jsonl
-wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_annotations.json
-wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_questions.json
-wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val.jsonl
-
-cd ../..
-
-```
-</details>
-
-<br />
-
-<details>
-<summary>ScienceQA Data Preparation</summary>
-
-```bash
-mkdir -p data/scienceqa/images && cd data/scienceqa/images
-
-# download images
-wget https://scienceqa.s3.us-west-1.amazonaws.com/images/test.zip && unzip test.zip
-
-cd ..
-
-# download original questions
-wget https://github.com/lupantech/ScienceQA/raw/main/data/scienceqa/problems.json
-
-# download converted files
-wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/scienceqa/scienceqa_test_img.jsonl
-
-cd ../..
-
-```
-</details>
-<br />
-
-## 2. Run Examples
-Enter into the examples folder and install requirements
-```bash
-pip install -r requirements.txt
-```
-
-- **Default Settings:**
-```bash
-sh run_autoround.sh
-```
-
-
-## 3. Run Inference
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers.generation import GenerationConfig
-import torch
-from neural_compressor.torch.quantization import load
-from transformers import set_seed
-set_seed(1234)
-
-quantized_model_path = "./tmp_autoround"
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, trust_remote_code=True)
-model = load(quantized_model_path, format='huggingface', device_map="auto", trust_remote_code=True).eval()
-query = tokenizer.from_list_format([{'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, \
-    {'text': 'Generate the caption in English with grounding:'}, \
-])
-inputs = tokenizer(query, return_tensors='pt')
-inputs = inputs.to(model.device)
-with torch.cuda.amp.autocast(): 
-    pred = model.generate(**inputs)
-    
-response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
-print(response)
-# <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(731,806)</box> and<ref> her dog</ref><box>(219,424),(576,896)</box> playing on the beach<|endoftext|>
-image = tokenizer.draw_bbox_on_latest_picture(response)
-if image:
-    image.save('2.jpg')
-else:
-    print("no box")
-
-```
-
-
-
-- Qwen2-VL-7B-Instruct inference
-
-```python
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
-from neural_compressor.torch.quantization import load
-quantized_model_path="./tmp_autoround"
-model = load(quantized_model_path, format='huggingface', device_map="auto",
-             trust_remote_code=True, model_class=Qwen2VLForConditionalGeneration)
-processor = AutoProcessor.from_pretrained(quantized_model_path)
-messages = [{
-    "role": "user",
-    "content": [
-        {
-            "type": "image",
-            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
-        },
-        {"type": "text", "text": "Describe this image."},]
-}]
-# Preparation for inference
-text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-image_inputs, video_inputs = process_vision_info(messages)
-inputs = processor(
-    text=[text],
-    images=image_inputs,
-    videos=video_inputs,
-    padding=True,
-    return_tensors="pt",
-)
-inputs = inputs.to(model.device)
- 
-# Inference: Generation of the output
-generated_ids = model.generate(**inputs, max_new_tokens=50)
-generated_ids_trimmed = [
-    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
-    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text)
-# The image depicts a serene beach scene at sunset. A woman is sitting on the sand, facing a large dog that appears to be a Labrador Retriever. The dog is wearing a harness and is extending its paw towards the woman's hand, possibly
-
-# messages = [{
-#     "role": "user",
-#     "content": [
-#         {
-#             "type": "image",
-#             "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
-#         },
-#         {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},]
-# }]
-
-# The label 15 represents an ash cloud. In the context of a volcano, an ash cloud is formed when volcanic ash is ejected into the atmosphere during an eruption. Therefore, the correct answer is:\n\n(4) ash cloud
-
-```
-
-
-- Llama-3.2-11B-Vision-Instruct inference
-
-```python
-import requests
-import torch
-from PIL import Image
-from transformers import MllamaForConditionalGeneration, AutoProcessor
-from neural_compressor.torch.quantization import load
-quantized_model_path="./tmp_autoround"
-model = load(quantized_model_path, format='huggingface', device_map="auto", torch_dtype=torch.bfloat16,
-             trust_remote_code=True, model_class=MllamaForConditionalGeneration)
-processor = AutoProcessor.from_pretrained(quantized_model_path)
-
-url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
-image = Image.open(requests.get(url, stream=True).raw)
-
-prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
-inputs = processor(image, prompt, return_tensors="pt", truncation=True).to(model.device)
-
-output = model.generate(**inputs, max_new_tokens=30)
-print(processor.decode(output[0]))
-
-# <|begin_of_text|><|image|><|begin_of_text|>If I had to write a haiku for this one, it would be:
-
-# Rabbit in a coat
-# Dressed up in style for the day
-# Country charm abounds
-
-# The image depicts a rabbit
-```
-
-
-## 4. Results
-We use the [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and the TextVQA dataset for evaluation. Please follow the [recipe](./run_autoround.sh) and the [evaluation script](./run_eval.sh). The results for Qwen-VL are as follows:
-| Metric         | bf16   | INT4   |
-|:----------------|:--------|:--------|
-| avg            | 0.5628 | 0.5589 |
-| paper-avg      | 0.5603 | 0.5611 |
-| mmlu           | 0.4828 | 0.4639 |
-| lambada_openai | 0.6782 | 0.6664 |
-| hellaswag      | 0.5593 | 0.5487 |
-| winogrande     | 0.6827 | 0.6875 |
-| piqa           | 0.7786 | 0.7748 |
-| truthfulqa_mc1 | 0.2876 | 0.2901 |
-| openbookqa     | 0.2880 | 0.2940 |
-| boolq          | 0.7012 | 0.7318 |
-| arc_easy       | 0.7201 | 0.7327 |
-| arc_challenge  | 0.4249 | 0.4206 |
-| cmmlu          | 0.4798 | 0.4618 |
-| ceval          | 0.4814 | 0.4569 |
-| textVQA        | 0.6402 | 0.6379 |
-| scienceVQA     | 0.6748 | 0.6574 |
-
-
-## 5. Known Issues
-* 'QWenTokenizer' object has no attribute 'IMAGE_ST'
-
-    If you encounter the above error during evaluation or inference with a quantized model, it is because Qwen-VL is incompatible with newer versions of transformers. You can refer to this issue and manually comment out lines 227-228 in the 'tokenization_qwen.py' file.
-
-
-* No such file or directory: 'PATH/modeling_qwen.py'
-
-    Due to the particularities of Qwen-VL, even when setting trust_remote_code=True while loading the model, the above error may still occur. Please manually copy the modeling_qwen.py, visual.py, and qwen_generation_utils.py files from the original model path to resolve the issue.
-
-
-## 6. Environment
-
-PyTorch 1.8 or higher is required.
-
-
-## Reference
-If you find SignRound useful for your research, please cite our paper:
-```bibtex
-@article{cheng2023optimize,
-  title={Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs},
-  author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao},
-  journal={arXiv preprint arXiv:2309.05516},
-  year={2023}
-}
-```
-
-
-
-
-
-
-
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
deleted file mode 100644
index 91a5b948539..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
+++ /dev/null
@@ -1,587 +0,0 @@
-import argparse
-parser = argparse.ArgumentParser()
-import torch
-import os
-import transformers
-# os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-# torch.use_deterministic_algorithms(True, warn_only=True)
-from transformers import set_seed
-import json
-from torch.utils.data import Dataset, DataLoader
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from auto_round.utils import convert_dtype_torch2str
-from typing import Dict, Optional, List
-from transformers.trainer_utils import RemoveColumnsCollator
-from transformers.data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
-from PIL import Image
-from transformers.trainer_pt_utils import LabelSmoother
-IGNORE_TOKEN_ID = LabelSmoother.ignore_index
-import inspect
-OLD_IMAGE_TOKEN = '<image>'
-DEFAULT_IM_START_TOKEN = '<img>'
-DEFAULT_IM_END_TOKEN = '</img>'
-from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                    get_layer_names_in_block,
-                                                    detect_device,
-                                                    run_fn_for_vlm_autoround
-                                                    )
-from neural_compressor.torch.quantization import (AutoRoundConfig,
-                                                    prepare,
-                                                    convert,
-                                                    load)
-
-
-def DataFormating(raw_data, image_folder=None, model_type='qwen'):
-    for source in raw_data:
-        source_inputs = source['conversations']
-        for sentence in source_inputs:
-            sentence['from'] = sentence['from'].replace('human', 'user')
-            sentence['from'] = sentence['from'].replace('gpt', 'assistant')
-            if OLD_IMAGE_TOKEN in sentence['value']:
-                sentence['value'] = sentence['value'].replace(OLD_IMAGE_TOKEN, '').strip()
-                sentence['value'] = OLD_IMAGE_TOKEN + sentence['value']
-                sentence['value'] = sentence['value'].strip()
-                if 'qwen2' in model_type: # for Qwen2-vl
-                    replace_token = '<|vision_start|><|image_pad|><|vision_end|>'
-                elif 'mllama' in model_type:
-                    replace_token = '<|image|>'
-                else:
-                    replace_img = os.path.join(image_folder, os.path.basename(source["image"]))
-                    replace_token = DEFAULT_IM_START_TOKEN + replace_img + DEFAULT_IM_END_TOKEN + '\n'
-                sentence["value"] = sentence["value"].replace(OLD_IMAGE_TOKEN, replace_token)
-    return raw_data
-
-
-def common_preprocess(
-    sources,
-    tokenizer: transformers.PreTrainedTokenizer,
-    max_len: int,
-    system_message: str = "You are a helpful assistant.",
-    model_type='qwen2'
-) -> Dict:
-    if 'mllama' in model_type:
-        roles = {"user": "<|start_header_id|>user<|end_header_id|>\n", "assistant": "<|start_header_id|>assistant<|end_header_id|>\n"}
-        im_start = "<|start_header_id|>"
-        im_end = "<|end_header_id|>\n"
-        im_dot = '<|eot_id|>'
-        text_start = '<|begin_of_text|>'
-    else :
-        roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
-        im_start = "<|im_start|>"
-        im_end = "<|im_end|>"
-    nl_tokens = '\n'
-    _system = 'system'
-
-    # Apply prompt templates
-    inputs, targets = [], []
-    for i, source in enumerate(sources):
-        if roles[source[0]["from"]] != roles["user"]:
-            source = source[1:]
-
-        text, target = "", None
-        if 'mllama' in model_type:
-            system = text_start + im_start + _system + im_end + nl_tokens + system_message + im_dot
-        else:
-            system = im_start + _system + nl_tokens + system_message + im_end + nl_tokens
-        text += system
-        for j, sentence in enumerate(source):
-            role = roles[sentence["from"]]
-            if 'mllama' in model_type:
-                _text = role + nl_tokens + \
-                    sentence["value"] + im_dot
-            else:
-                _text = role + nl_tokens + \
-                    sentence["value"] + im_end + nl_tokens
-            text += _text
-        token_length = len(tokenizer(text).input_ids)
-        if token_length < max_len:
-            text += tokenizer.pad_token * (max_len - token_length)
-        else:
-            text = tokenizer.decode(tokenizer(text).input_ids[:max_len])
-            pass
-        inputs.append(text)
-
-    return inputs
-
-
-def preprocess(
-    sources,
-    tokenizer: transformers.PreTrainedTokenizer,
-    max_len: int,
-    system_message: str = "You are a helpful assistant."
-) -> Dict:
-    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
-    if 'qwen2' not in model_type:
-        im_start = tokenizer.im_start_id
-        im_end = tokenizer.im_end_id
-    else:
-        im_start = tokenizer('<|im_start|>')
-        im_end = tokenizer('<|im_end|>')
-    nl_tokens = tokenizer('\n').input_ids
-    _system = tokenizer('system').input_ids + nl_tokens
-    _user = tokenizer('user').input_ids + nl_tokens
-    _assistant = tokenizer('assistant').input_ids + nl_tokens
-
-    # Apply prompt templates
-    input_ids, targets = [], []
-    for i, source in enumerate(sources):
-        if roles[source[0]["from"]] != roles["user"]:
-            source = source[1:]
-
-        input_id, target = [], []
-        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
-        input_id += system
-        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
-        assert len(input_id) == len(target)
-        for j, sentence in enumerate(source):
-            role = roles[sentence["from"]]
-            _input_id = tokenizer(role).input_ids + nl_tokens + \
-                tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
-            input_id += _input_id
-            if role == '<|im_start|>user':
-                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
-            elif role == '<|im_start|>assistant':
-                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
-                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
-            else:
-                raise NotImplementedError
-            target += _target
-        assert len(input_id) == len(target)
-        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
-        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
-        input_ids.append(input_id[:max_len])
-        targets.append(target[:max_len])
-    input_ids = torch.tensor(input_ids, dtype=torch.int)
-    targets = torch.tensor(targets, dtype=torch.int)
-    
-    return dict(
-        input_ids=input_ids,
-        labels=targets,
-        attention_mask=input_ids.ne(tokenizer.pad_token_id),
-    )
-
-
-
-class LazySupervisedDataset(Dataset):
-    """Dataset for supervised fine-tuning."""
-
-    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer,
-                 max_len: int, image_folder=None, model_type='qwen_vl'):
-        super(LazySupervisedDataset, self).__init__()
-        self.tokenizer = tokenizer
-        self.max_len = max_len
-        self.image_folder = image_folder
-        print("Formatting inputs...Skip in lazy mode")
-        self.raw_data = raw_data
-        self.cached_data_dict = {}
-
-    def __len__(self):
-        return len(self.raw_data)
-
-    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
-        if i in self.cached_data_dict:
-            return self.cached_data_dict[i]
-
-        if 'qwen' == model_type: # for Qwen-VL
-            ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
-            ret = dict(
-                input_ids=ret["input_ids"][0],
-                labels=ret["labels"][0],
-                attention_mask=ret["attention_mask"][0],
-            )
-        else: # Qwen2-VL and Llama-3.2 
-            texts = common_preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len, model_type=model_type)
-            if 'qwen2' in model_type:
-                image_path = os.path.join(f"file://{self.image_folder}", os.path.basename(self.raw_data[i]["image"]))
-                image = fetch_image({'image':image_path})
-            else:
-                image = Image.open(os.path.join(self.image_folder, os.path.basename(self.raw_data[i]["image"]))) #.convert('RGB')
-            ret = self.tokenizer.processor(
-                text=texts,
-                images=image,
-                padding=True,
-                truncation=True,
-                return_tensors="pt",
-                # videos=None,
-            )
-            if 'qwen2' in model_type:
-                ret = dict(
-                    input_ids=ret["input_ids"][0],
-                    # labels=ret["labels"][0],
-                    attention_mask=ret["attention_mask"][0],
-                    image_grid_thw=ret["image_grid_thw"][0],
-                    pixel_values=ret["pixel_values"],
-                )
-            else:
-                ret = dict(
-                    input_ids=ret["input_ids"][0],
-                    attention_mask=ret["attention_mask"][0],
-                    aspect_ratio_ids=ret["aspect_ratio_ids"][0],
-                    aspect_ratio_mask=ret["aspect_ratio_mask"][0],
-                    cross_attention_mask=ret["cross_attention_mask"][0],
-                    pixel_values=ret["pixel_values"][0],
-                )
-        self.cached_data_dict[i] = ret
-        return ret
-
-
-def set_signature_columns_if_needed(model):
-    # Inspect model forward signature to keep only the arguments it accepts.
-    model_to_inspect = model
-    signature = inspect.signature(model_to_inspect.forward)
-    signature_columns = list(signature.parameters.keys())
-    # Labels may be named label or label_ids, the default data collator handles that.
-    signature_columns += list(set(["label", "label_ids", 'labels']))
-    return signature_columns
-    
-def get_collator_with_removed_columns(model, data_collator: Callable, description: Optional[str] = None
-    ) -> Callable:
-        """Wrap the data collator in a callable removing unused columns."""
-        signature_columns = set_signature_columns_if_needed(model)
-
-        remove_columns_collator = RemoveColumnsCollator(
-            data_collator=data_collator,
-            signature_columns=signature_columns,
-            description=description,
-            model_name=model.__class__.__name__,
-        )
-        return remove_columns_collator
-
-
-def get_train_dataloader(train_dataset, model, data_collator=default_data_collator,
-                         train_batch_size=1, num_workers=0) -> DataLoader:
-    """
-    Returns the training [`~torch.utils.data.DataLoader`].
-
-    Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
-    training if necessary) otherwise.
-
-    Subclass and override this method if you want to inject some custom behavior.
-    """
-    if train_dataset is None:
-        raise ValueError("Trainer: training requires a train_dataset.")
-    
-    if data_collator != default_data_collator:
-        data_collator = get_collator_with_removed_columns(model, data_collator, description="training")
-
-    dataloader_params = {
-        "batch_size": train_batch_size,
-        "collate_fn": data_collator,
-        "num_workers": num_workers,
-    }
-
-    return DataLoader(train_dataset, **dataloader_params)
-
-if __name__ == '__main__':
-
-    parser.add_argument(
-        "--model_name", default="Qwen/Qwen-VL"
-    )
-
-    parser.add_argument("--quantize", action="store_true")
-    
-    parser.add_argument("--accuracy", action="store_true")
-    
-    parser.add_argument("--bits", default=4, type=int,
-                        help="number of  bits")
-
-    parser.add_argument("--group_size", default=128, type=int,
-                        help="group size")
-
-    parser.add_argument("--train_bs", default=1, type=int,
-                        help="train batch size")
-
-    parser.add_argument("--eval_bs", default=4, type=int,
-                        help="eval batch size")
-
-    parser.add_argument("--device", default="auto", type=str,
-                        help="The device to be used for tuning. The default is set to auto/None,"
-                             "allowing for automatic detection. Currently, device settings support CPU, GPU, and HPU.")
-
-    parser.add_argument("--sym", action='store_true',
-                        help=" sym quantization")
-
-    parser.add_argument("--iters", default=200, type=int,
-                        help=" iters")
-
-    parser.add_argument("--lr", default=None, type=float,
-                        help="learning rate, if None, it will be set to 1.0/iters automatically")
-
-    parser.add_argument("--minmax_lr", default=None, type=float,
-                        help="minmax learning rate, if None,it will beset to be the same with lr")
-
-    parser.add_argument("--seed", default=42, type=int,
-                        help="seed")
-
-    parser.add_argument("--adam", action='store_true',
-                        help="adam")
-
-    parser.add_argument("--seqlen", default=512, type=int,
-                        help="sequence length")
-
-    parser.add_argument("--gradient_accumulate_steps", default=8, type=int, help="gradient accumulate steps")
-
-    parser.add_argument("--nblocks", default=1, type=int, help="num of blocks to tune together")
-
-    parser.add_argument("--nsamples", default=512, type=int,
-                        help="number of samples")
-
-    parser.add_argument("--export_format", default='auto_round:gptq', type=str,
-                        help="targeted inference acceleration platform,The options are 'fake', 'cpu', 'gpu', 'xpu' and 'auto_round'."
-                             "default to 'fake', indicating that it only performs fake quantization and won't be exported to any device.")
-
-    parser.add_argument("--scale_dtype", default='fp16',
-                        help="which scale data type to use for quantization, 'fp16', 'fp32' or 'bf16'.")
-
-    parser.add_argument("--output_dir", default="./tmp_autoround", type=str,
-                        help="Where to store the final model.")
-
-    parser.add_argument("--disable_eval", action='store_true',
-                        help="Whether to do lmeval evaluation.")
-
-    parser.add_argument("--disable_amp", action='store_true',
-                        help="disable amp")
-
-    parser.add_argument("--disable_minmax_tuning", action='store_true',
-                        help="whether disable  enable weight minmax tuning")
-
-    parser.add_argument("--disable_trust_remote_code", action='store_true',
-                        help="Whether to disable trust_remote_code")
-
-    parser.add_argument("--disable_quanted_input", action='store_true',
-                        help="whether to disuse the output of quantized block to tune the next block")
-
-    parser.add_argument("--quant_lm_head", action='store_true',
-                        help="quant_lm_head")
-
-    parser.add_argument("--model_dtype", default=None, type=str,
-                        help="force to convert the dtype, some backends supports fp16 dtype better")
-    
-    parser.add_argument("--model_max_length", default=2048, type=int,
-                        help="")
-    
-    parser.add_argument("--act_bits", default=32, type=int,
-                    help="activation bits")
-    
-    parser.add_argument("--quant_vision", action='store_true',
-                        help="To determine whether the quantization should handle vision component.")
-    
-    # ========== Calibration Datasets ============= 
-    parser.add_argument("--image_folder", default="coco", type=str,
-                        help="The dataset for quantization training. It can be a custom one.")
-    
-    parser.add_argument("--question_file", default=None, type=str,
-                            help="The dataset for quantization training. It can be a custom one.")
-    
-    # ================= Evaluation Related =====================
-    # parser.add_argument("--eval-path", type=str, default=None)
-    
-    parser.add_argument("--eval_dataset", type=str, default="textvqa_val,scienceqa_test_img")
-
-    args = parser.parse_args()
-
-    set_seed(args.seed)
-    
-
-    if args.quantize:
-        if args.act_bits <= 8:
-            print(
-                "Warning, activation quantization is an experiment feature")
-        
-        if args.act_bits <= 8 and args.export_format != "fake":
-            assert False, "only support fake mode for activation quantization currently"
-            
-        if "marlin" in args.export_format and args.sym == False:
-            assert False, "marlin backend only supports sym quantization, please set --sym"
-            
-        model_name = args.model_name
-        if model_name[-1] == "/":
-            model_name = model_name[:-1]
-        print(model_name, flush=True)
-
-        device_str = detect_device(args.device)
-        torch_device = torch.device(device_str)
-        
-        model_name = args.model_name
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code,
-                                                padding_side="right", use_fast=False)
-        seqlen = args.seqlen
-        if hasattr(tokenizer, "model_max_length"):
-            if tokenizer.model_max_length < seqlen:
-                print(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length",
-                    flush=True)
-                seqlen = min(seqlen, tokenizer.model_max_length)
-                args.seqlen = seqlen
-                
-        torch_dtype = "auto"
-        if "hpu" in device_str:
-            torch_dtype = torch.bfloat16 ## TODO test on hpu
-        if args.model_dtype != None:
-            if args.model_dtype == "float16" or args.model_dtype == "fp16":
-                torch_dtype = torch.float16
-            if args.model_dtype == "bfloat16" or args.model_dtype == "bf16":
-                torch_dtype = torch.bfloat16
-                
-        dtype_str = convert_dtype_torch2str(torch_dtype)
-        questions = json.load(open(args.question_file, "r"))
-        config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-        model_type = config.model_type
-        processor = None
-        if "mllama" in model_type: #for Llama-3.2-11B-Vision-Instruct
-            transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
-            if transformers_version[0] == 4 and transformers_version[1] < 45:
-                error_message = "Please upgrade transformers to version >= 4.45 or the newest source code to support Qwen2-VL quantization."
-                raise EnvironmentError(error_message)
-            from transformers import MllamaForConditionalGeneration, AutoProcessor
-            model = MllamaForConditionalGeneration.from_pretrained(args.model_name, attn_implementation="eager",
-                                                                trust_remote_code=not args.disable_trust_remote_code) # torch_dtype=torch.bfloat16
-            processor = AutoProcessor.from_pretrained(args.model_name)
-            tokenizer.processor = processor
-            default_collator = default_data_collator
-        elif 'qwen2' in model_type: # for Qwen2-VL-instruct
-            transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
-            if transformers_version[0] == 4 and transformers_version[1] < 45:
-                error_message = "Please upgrade transformers to version >= 4.45 or the newest source code to support Qwen2-VL quantization."
-                raise EnvironmentError(error_message)
-            from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
-            from qwen_vl_utils import process_vision_info, fetch_image
-            model = Qwen2VLForConditionalGeneration.from_pretrained(args.model_name, torch_dtype=torch_dtype)
-            processor = AutoProcessor.from_pretrained(args.model_name)
-            tokenizer.processor = processor
-            default_collator = default_data_collator
-        else: # for Qwen-VL/Qwen-VL-Chat
-            tokenizer.pad_token_id = tokenizer.eod_id
-            config.use_cache = False
-            if dtype_str == "bf16":
-                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code, bf16=True).eval()
-            elif dtype_str == "fp16":
-                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code, fp16=True).eval()
-            else:
-                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code).eval()
-            # raw_data = DataFormating(questions, args.image_folder)
-            default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
-            
-        raw_data = DataFormating(questions, args.image_folder, model_type=model_type)
-        dataset = LazySupervisedDataset(raw_data, tokenizer,
-                                        max_len=min(args.seqlen, tokenizer.model_max_length), image_folder=args.image_folder)
-        dataloader = get_train_dataloader(dataset, model, data_collator=default_collator, train_batch_size=args.train_bs)
-        
-        
-        model = model.eval()
-        seqlen = args.seqlen
-                    
-        lm_head_layer_name = "lm_head"
-        # for n, _ in model.named_modules():
-        #     lm_head_layer_name = n
-        if args.quant_lm_head:
-            from transformers import AutoConfig
-            config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-            if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"):
-                tied_keys = model._tied_weights_keys
-                for item in tied_keys:
-                    if lm_head_layer_name in item:  ##TODO extend to encoder-decoder layer, seq classification model
-                        args.quant_lm_head = False
-                        print(
-                            f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
-                            f"supported currently")
-                        break
-            
-        quant_block_list = get_multimodal_block_names(model, args.quant_vision)
-        
-        # dataset=dataloader, layer_config=, amp, 
-        quant_config = AutoRoundConfig(bits=args.bits, use_sym=args.sym, batch_size=args.train_bs, group_size=args.group_size,
-                        seqlen=seqlen, nblocks=args.nblocks, iters=args.iters, lr=args.lr,
-                        minmax_lr=args.minmax_lr, enable_quanted_input=not args.disable_quanted_input,
-                        nsamples=args.nsamples, seed=args.seed, gradient_accumulate_steps=args.gradient_accumulate_steps,
-                        scale_dtype=args.scale_dtype, enable_minmax_tuning=not args.disable_minmax_tuning, act_bits=args.act_bits,
-                        quant_block_list=quant_block_list, export_format=args.export_format)
-        
-        all_block_list = get_multimodal_block_names(model, quant_vision=True)
-        all_block_set = set(tuple(block) for block in all_block_list)
-        quant_block_set = set(tuple(block) for block in quant_block_list)
-        set_to_full_prec = list(all_block_set - quant_block_set)
-        set_to_full_prec = get_layer_names_in_block(model, quant_block_list=set_to_full_prec)
-        for name in set_to_full_prec:
-            quant_config.set_local(name, AutoRoundConfig(dtype="fp32"))
-        
-        for n, m in model.named_modules():
-            if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
-                if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                    quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
-                    print(
-                        f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
-                    
-        # skip special layers
-        quant_config.set_local("transformer.visual.attn_pool.*_proj", AutoRoundConfig(dtype="fp32"))
-        quant_config.set_local("multi_modal_projector", AutoRoundConfig(dtype="fp32"))
-        quant_config.set_local("visual.merger", AutoRoundConfig(dtype="fp32"))
-        
-
-        if not args.quant_lm_head:
-            quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
-            transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
-            if transformers_version[0] == 4 and transformers_version[1] < 38:
-                error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
-                raise EnvironmentError(error_message)
-            
-        run_args = (dataloader, seqlen, args.nsamples)
-        user_model = prepare(model=model, quant_config=quant_config)
-        run_fn_for_vlm_autoround(user_model, *run_args)
-        user_model = convert(user_model)
-        
-        from neural_compressor.torch.utils import (LoadFormat,)
-        user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE)
-        if tokenizer is not None:
-            tokenizer.save_pretrained(args.output_dir)
-        if processor is not None:
-            processor.save_pretrained(args.output_dir)
-
-    if args.accuracy:
-        torch_dtype = "auto"
-        model_name = args.model_name
-        device_str = detect_device(args.device)
-        torch_device = torch.device(device_str)
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code,
-                                                  padding_side="right", use_fast=False)
-        config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-        model_type = config.model_type
-        model_cls = None
-        if 'mllama' in model_type or 'qwen2' in model_type:
-            print(f"{model_type} quantized model evaluation is not supported yet.")
-            exit()
-        if 'qwen2' in model_type: ## TODO test the eval ability
-            from transformers import Qwen2VLForConditionalGeneration
-            model_cls = Qwen2VLForConditionalGeneration
-        elif 'mllama' in model_type:
-            from transformers import MllamaForConditionalGeneration, AutoProcessor
-            model_cls = MllamaForConditionalGeneration
-        model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code, model_class=model_cls)
-        model = model.to(torch_device)
-        torch_dtype = model.dtype
-        datasets=args.eval_dataset.split(',')
-        for dataset in datasets:
-            if 'vqa' in dataset:
-                from mm_evaluation.evaluate_vqa import textVQA_evaluation
-                with torch.amp.autocast(device_type=device_str.split(":")[0], dtype=torch_dtype):
-                    evaluator = textVQA_evaluation(
-                        model,
-                        dataset_name=dataset,
-                        tokenizer=tokenizer,
-                        batch_size=args.eval_bs,
-                        device=str(torch_device)
-                    )
-            elif 'scienceqa' in dataset:
-                from mm_evaluation.evaluate_multiple_choice import scienceQA_evaluation
-                with torch.amp.autocast(device_type=device_str.split(":")[0], dtype=torch_dtype):
-                    evaluator = scienceQA_evaluation(
-                        model,
-                        dataset_name=dataset,
-                        tokenizer=tokenizer,
-                        batch_size=args.eval_bs,
-                        device=str(torch_device)
-                    )
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/__init__.py
deleted file mode 100644
index 01913bdfeb9..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# if __name__ == "__main__":
-#     import sys
-#     sys.path.insert(0, './')
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py
deleted file mode 100644
index 9d802ebff13..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import argparse
-import itertools
-import json
-import os
-from functools import partial
-
-import torch
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-multiple_choices = ['A', 'B', 'C', 'D', 'E']
-
-ds_collections = {
-    'scienceqa_test_img': {
-        'test': 'data/scienceqa/scienceqa_test_img.jsonl',
-    }
-}
-
-
-def collate_fn(batches, pad_token_id):
-
-    input_tokens = [_['input_tokens'] for _ in batches]
-    target_lengths = [_['target_lengths'] for _ in batches]
-    answers = [_['answer'] for _ in batches]
-
-    chunk_sizes = [len(_) for _ in input_tokens]
-
-    input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]
-
-    max_lengths = max([len(_) for _ in input_tokens])
-    input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
-                    for _ in input_tokens]
-    input_tokens = torch.LongTensor(input_tokens)
-
-    attention_mask = 1 - input_tokens.eq(pad_token_id).float()
-
-    return input_tokens, attention_mask, target_lengths, answers, chunk_sizes
-
-
-class MultipleChoiceDataste(torch.utils.data.Dataset):
-
-    def __init__(self, test, prompt, tokenizer):
-        self.data = open(test).readlines()
-        self.prompt = prompt
-        self.tokenizer = tokenizer
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, idx):
-
-        data = json.loads(self.data[idx].strip())
-        image = data['image']
-        hint = data['hint'] if data['hint'] else 'N/A'
-        question = data['question']
-
-        choices = data['choices']
-        choice_list = []
-        for i, c in enumerate(choices):
-            choice_list.append('{}. {}'.format(multiple_choices[i], c))
-        choice_txt = '\n'.join(choice_list)
-
-        prompt = self.prompt.format(image, hint, question, choice_txt)
-
-        prompt_tokens = self.tokenizer(prompt).input_ids
-        target_tokens = [
-            self.tokenizer(' ' + _).input_ids
-            for _ in multiple_choices[:len(choices)]
-        ]
-
-        return {
-            'input_tokens': [prompt_tokens + _ for _ in target_tokens],
-            'target_lengths': [len(_) for _ in target_tokens],
-            'answer': data['answer'],
-        }
-
-
-class InferenceSampler(torch.utils.data.sampler.Sampler):
-
-    def __init__(self, size):
-        self._size = int(size)
-        assert size > 0
-        self._rank = torch.distributed.get_rank()
-        self._world_size = torch.distributed.get_world_size()
-        self._local_indices = self._get_local_indices(size, self._world_size,
-                                                      self._rank)
-
-    @staticmethod
-    def _get_local_indices(total_size, world_size, rank):
-        shard_size = total_size // world_size
-        left = total_size % world_size
-        shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
-
-        begin = sum(shard_sizes[:rank])
-        end = min(sum(shard_sizes[:rank + 1]), total_size)
-        return range(begin, end)
-
-    def __iter__(self):
-        yield from self._local_indices
-
-    def __len__(self):
-        return len(self._local_indices)
-
-
-def scienceQA_evaluation(model_name, dataset_name, dataset_path=None, tokenizer=None,
-                       batch_size=1, few_shot=0, seed=0, trust_remote_code=True, device="cuda:0"):
-    # torch.distributed.init_process_group(
-    #     backend='nccl',
-    #     world_size=int(os.getenv('WORLD_SIZE', '1')),
-    #     rank=int(os.getenv('RANK', '0')),
-    # )
-    if "cuda" in device:
-        torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
-    if isinstance(model_name, str):
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
-        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=trust_remote_code).eval()
-        model = model.to(torch.device(device))
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code, use_fast=False)
-    else:
-        assert tokenizer is not None, "Two types of parameter passing are supported:model_path or model with tokenizer."
-        model = model_name
-
-    prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:'
-
-    dataset = MultipleChoiceDataste(test=ds_collections[dataset_name]['test'],
-                                    prompt=prompt,
-                                    tokenizer=tokenizer)
-    dataloader = torch.utils.data.DataLoader(
-        dataset=dataset,
-        # sampler=InferenceSampler(len(dataset)),
-        batch_size=batch_size,
-        # num_workers=0,
-        pin_memory=True,
-        drop_last=False,
-        collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
-    )
-
-    results = []
-    with torch.no_grad():
-        for _, (input_tokens, attention_mask, target_lengths, answer,
-                chunk_sizes) in tqdm(enumerate(dataloader)):
-
-            outputs = model(
-                input_ids=input_tokens[:, :-1].to(device),
-                attention_mask=attention_mask[:, :-1].to(device),
-                return_dict=True,
-            )
-            losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
-                0, 2, 1),
-                                                       input_tokens[:,
-                                                                    1:].to(device),
-                                                       reduction='none')
-
-            losses = losses.split(chunk_sizes, dim=0)
-
-            for loss, target_length, answer in zip(losses, target_lengths,
-                                                   answer):
-
-                target_loss = loss.mean(-1)
-                for _ in range(len(target_length)):
-                    target_loss[_] = loss[_, -target_length[_]:].mean()
-                pred = target_loss.argmin().item()
-                if pred == answer:
-                    results.append(1)
-                else:
-                    results.append(0)
-
-    # torch.distributed.barrier()
-
-    # world_size = torch.distributed.get_world_size()
-    # merged_results = [None for _ in range(world_size)]
-    # torch.distributed.all_gather_object(merged_results, results)
-    merged_results = [json.dumps(results)]
-    merged_results = [json.loads(_) for _ in merged_results]
-    merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]
-
-    # if torch.distributed.get_rank() == 0:
-    print(f"Evaluating {dataset_name} ...")
-    print(f'Acc@1: {sum(merged_results) / len(merged_results)}')
-
-    # torch.distributed.barrier()
-
-
-
-
-if __name__ == "__main__":
-    import sys
-    import time
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_name", default="Qwen/Qwen-VL"
-    )
-    parser.add_argument(
-        "--dataset_name", default="scienceqa_test_img"
-    )
-    parser.add_argument(
-        "--eval_bs", default=4,
-    )
-    parser.add_argument(
-        "--trust_remote_code", action='store_true',
-        help="Whether to enable trust_remote_code"
-    )
-    args = parser.parse_args()
-    s = time.time()
-    evaluator = scienceQA_evaluation(
-        args.model_name,
-        dataset_name=args.dataset_name,
-        # dataset_path=args.eval_path,
-        batch_size=args.eval_bs,
-        trust_remote_code=args.trust_remote_code
-    )
-    print("cost time: ", time.time() - s)
-
-    
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py
deleted file mode 100644
index e055e71c63b..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py
+++ /dev/null
@@ -1,465 +0,0 @@
-import argparse
-import itertools
-import json
-import os
-import random
-import time
-from functools import partial
-from typing import Optional
-
-import torch
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
-from .vqa import VQA
-from .vqa_eval import VQAEval
-
-# This code is much refer to https://github.com/cognitedata/Qwen-VL-finetune/blob/master/eval_mm/evaluate_vqa.py
-
-ds_collections = {
-    'vqav2_val': {
-        'train': 'data/vqav2/vqav2_train.jsonl',
-        'test': 'data/vqav2/vqav2_val.jsonl',
-        'question': 'data/vqav2/v2_OpenEnded_mscoco_val2014_questions.json',
-        'annotation': 'data/vqav2/v2_mscoco_val2014_annotations.json',
-        'metric': 'vqa_score',
-        'max_new_tokens': 10,
-    },
-    'vqav2_testdev': {
-        'train': 'data/vqav2/vqav2_train.jsonl',
-        'test': 'data/vqav2/vqav2_testdev.jsonl',
-        'metric': None,
-        'max_new_tokens': 10,
-    },
-    'okvqa_val': {
-        'train': 'data/okvqa/okvqa_train.jsonl',
-        'test': 'data/okvqa/okvqa_val.jsonl',
-        'question': 'data/okvqa/OpenEnded_mscoco_val2014_questions.json',
-        'annotation': 'data/okvqa/mscoco_val2014_annotations.json',
-        'metric': 'vqa_score',
-        'max_new_tokens': 10,
-    },
-    'textvqa_val': {
-        'train': 'data/textvqa/textvqa_train.jsonl',
-        'test': 'data/textvqa/textvqa_val.jsonl',
-        'question': 'data/textvqa/textvqa_val_questions.json',
-        'annotation': 'data/textvqa/textvqa_val_annotations.json',
-        'metric': 'vqa_score',
-        'max_new_tokens': 10,
-    },
-    'vizwiz_val': {
-        'train': 'data/vizwiz/vizwiz_train.jsonl',
-        'test': 'data/vizwiz/vizwiz_val.jsonl',
-        'question': 'data/vizwiz/vizwiz_val_questions.json',
-        'annotation': 'data/vizwiz/vizwiz_val_annotations.json',
-        'metric': 'vqa_score',
-        'max_new_tokens': 10,
-    },
-    'vizwiz_test': {
-        'train': 'data/vizwiz/vizwiz_train.jsonl',
-        'test': 'data/vizwiz/vizwiz_test.jsonl',
-        'metric': None,
-        'max_new_tokens': 10,
-    },
-    'docvqa_val': {
-        'train': 'data/docvqa/train.jsonl',
-        'test': 'data/docvqa/val.jsonl',
-        'annotation': 'data/docvqa/val/val_v1.0.json',
-        'metric': 'anls',
-        'max_new_tokens': 100,
-    },
-    'docvqa_test': {
-        'train': 'data/docvqa/train.jsonl',
-        'test': 'data/docvqa/test.jsonl',
-        'metric': None,
-        'max_new_tokens': 100,
-    },
-    'chartqa_test_human': {
-        'train': 'data/chartqa/train_human.jsonl',
-        'test': 'data/chartqa/test_human.jsonl',
-        'metric': 'relaxed_accuracy',
-        'max_new_tokens': 100,
-    },
-    'chartqa_test_augmented': {
-        'train': 'data/chartqa/train_augmented.jsonl',
-        'test': 'data/chartqa/test_augmented.jsonl',
-        'metric': 'relaxed_accuracy',
-        'max_new_tokens': 100,
-    },
-    'gqa_testdev': {
-        'train': 'data/gqa/train.jsonl',
-        'test': 'data/gqa/testdev_balanced.jsonl',
-        'metric': 'accuracy',
-        'max_new_tokens': 10,
-    },
-    'ocrvqa_val': {
-        'train': 'data/ocrvqa/ocrvqa_train.jsonl',
-        'test': 'data/ocrvqa/ocrvqa_val.jsonl',
-        'metric': 'accuracy',
-        'max_new_tokens': 100,
-    },
-    'ocrvqa_test': {
-        'train': 'data/ocrvqa/ocrvqa_train.jsonl',
-        'test': 'data/ocrvqa/ocrvqa_test.jsonl',
-        'metric': 'accuracy',
-        'max_new_tokens': 100,
-    },
-    'ai2diagram_test': {
-        'train': 'data/ai2diagram/train.jsonl',
-        'test': 'data/ai2diagram/test.jsonl',
-        'metric': 'accuracy',
-        'max_new_tokens': 10,
-    }
-}
-
-# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
-def relaxed_correctness(target: str,
-                        prediction: str,
-                        max_relative_change: float = 0.05) -> bool:
-    """Calculates relaxed correctness.
-
-    The correctness tolerates certain error ratio defined by max_relative_change.
-    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
-    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
-    numeric answers to allow a minor inaccuracy that may result from the automatic
-    data extraction process. We consider an answer to be correct if it is within
-    5% of the gold answer. For non-numeric answers, we still need an exact match
-    to consider an answer to be correct.”
-
-    Args:
-      target: Target string.
-      prediction: Predicted string.
-      max_relative_change: Maximum relative change.
-
-    Returns:
-      Whether the prediction was correct given the specified tolerance.
-    """
-
-    def _to_float(text: str) -> Optional[float]:
-        try:
-            if text.endswith('%'):
-                # Convert percentages to floats.
-                return float(text.rstrip('%')) / 100.0
-            else:
-                return float(text)
-        except ValueError:
-            return None
-
-    prediction_float = _to_float(prediction)
-    target_float = _to_float(target)
-    if prediction_float is not None and target_float:
-        relative_change = abs(prediction_float -
-                              target_float) / abs(target_float)
-        return relative_change <= max_relative_change
-    else:
-        return prediction.lower() == target.lower()
-
-
-def evaluate_relaxed_accuracy(entries):
-    scores = []
-    for elem in entries:
-        if isinstance(elem['annotation'], str):
-            elem['annotation'] = [elem['annotation']]
-        score = max([
-            relaxed_correctness(elem['answer'].strip(), ann)
-            for ann in elem['annotation']
-        ])
-        scores.append(score)
-    return sum(scores) / len(scores)
-
-
-def evaluate_exact_match_accuracy(entries):
-    scores = []
-    for elem in entries:
-        if isinstance(elem['annotation'], str):
-            elem['annotation'] = [elem['annotation']]
-        score = max([
-            (1.0 if
-             (elem['answer'].strip().lower() == ann.strip().lower()) else 0.0)
-            for ann in elem['annotation']
-        ])
-        scores.append(score)
-    return sum(scores) / len(scores)
-
-
-def collate_fn(batches, tokenizer):
-
-    questions = [_['question'] for _ in batches]
-    question_ids = [_['question_id'] for _ in batches]
-    annotations = [_['annotation'] for _ in batches]
-
-    input_ids = tokenizer(questions, return_tensors='pt', padding='longest')
-
-    return question_ids, input_ids.input_ids, input_ids.attention_mask, annotations
-
-
-class VQADataset(torch.utils.data.Dataset):
-
-    def __init__(self, train, test, prompt, few_shot):
-        self.test = open(test).readlines()
-        self.prompt = prompt
-
-        self.few_shot = few_shot
-        if few_shot > 0:
-            self.train = open(train).readlines()
-
-    def __len__(self):
-        return len(self.test)
-
-    def __getitem__(self, idx):
-        data = json.loads(self.test[idx].strip())
-        image, question, question_id, annotation = data['image'], data[
-            'question'], data['question_id'], data.get('answer', None)
-
-        few_shot_prompt = ''
-        if self.few_shot > 0:
-            few_shot_samples = random.sample(self.train, self.few_shot)
-            for sample in few_shot_samples:
-                sample = json.loads(sample.strip())
-                few_shot_prompt += self.prompt.format(
-                    sample['image'],
-                    sample['question']) + f" {sample['answer']}"
-
-        return {
-            'question': few_shot_prompt + self.prompt.format(image, question),
-            'question_id': question_id,
-            'annotation': annotation
-        }
-
-
-class InferenceSampler(torch.utils.data.sampler.Sampler):
-
-    def __init__(self, size):
-        self._size = int(size)
-        assert size > 0
-        self._rank = torch.distributed.get_rank()
-        self._world_size = torch.distributed.get_world_size()
-        self._local_indices = self._get_local_indices(size, self._world_size,
-                                                      self._rank)
-
-    @staticmethod
-    def _get_local_indices(total_size, world_size, rank):
-        shard_size = total_size // world_size
-        left = total_size % world_size
-        shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
-
-        begin = sum(shard_sizes[:rank])
-        end = min(sum(shard_sizes[:rank + 1]), total_size)
-        return range(begin, end)
-
-    def __iter__(self):
-        yield from self._local_indices
-
-    def __len__(self):
-        return len(self._local_indices)
-
-
-def textVQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", dataset_path=None, tokenizer=None,
-                       batch_size=1, few_shot=0, seed=0, trust_remote_code=True, device="cuda:0"):
-    # torch.distributed.init_process_group(
-    #     backend='nccl',
-    #     world_size=int(os.getenv('WORLD_SIZE', '1')),
-    #     rank=int(os.getenv('RANK', '0')),
-    # )
-    if "cuda" in device:
-        torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
-    if isinstance(model_name, str):
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
-        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=trust_remote_code).eval()
-        model = model.to(torch.device(device))
-        tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=trust_remote_code,
-                                              padding_side="right", use_fast=False)
-    else:
-        assert tokenizer is not None, "Two types of parameter passing are supported:model_path or model with tokenizer."
-        model = model_name
-    
-    tokenizer.padding_side = 'left'
-    tokenizer.pad_token_id = tokenizer.eod_id
-
-    prompt = '<img>{}</img>{} Answer:'
-    if dataset_path is not None:
-        for key in ds_collections[dataset_name].keys():
-            if isinstance(ds_collections[dataset_name][key], str) and "json" in ds_collections[dataset_name][key]:
-                ds_collections[dataset_name][key] = os.path.join(dataset_path,ds_collections[dataset_name][key])
-            
-    random.seed(seed)
-    dataset = VQADataset(
-        train=ds_collections[dataset_name]['train'],
-        test=ds_collections[dataset_name]['test'],
-        prompt=prompt,
-        few_shot=few_shot,
-    )
-
-    dataloader = torch.utils.data.DataLoader(
-        dataset=dataset,
-        # sampler=InferenceSampler(len(dataset)),
-        batch_size=batch_size,
-        # num_workers=0,
-        pin_memory=True,
-        drop_last=False,
-        collate_fn=partial(collate_fn, tokenizer=tokenizer),
-    )
-
-    outputs = []
-    for _, (question_ids, input_ids, attention_mask,
-            annotations) in tqdm(enumerate(dataloader)):
-        pred = model.generate(
-            input_ids=input_ids.to(device),
-            attention_mask=attention_mask.to(device),
-            do_sample=False,
-            num_beams=1,
-            max_new_tokens=ds_collections[dataset_name]['max_new_tokens'],
-            min_new_tokens=1,
-            length_penalty=1,
-            num_return_sequences=1,
-            output_hidden_states=True,
-            use_cache=True,
-            pad_token_id=tokenizer.eod_id,
-            eos_token_id=tokenizer.eod_id,
-        )
-        answers = [
-            tokenizer.decode(_[input_ids.size(1):].cpu(),
-                             skip_special_tokens=True).strip() for _ in pred
-        ]
-
-        for question_id, answer, annotation in zip(question_ids, answers,
-                                                   annotations):
-            if dataset_name in ['vqav2_val', 'vqav2_testdev', 'okvqa_val', 'textvqa_val', 'vizwiz_val']:
-                outputs.append({
-                    'question_id': question_id,
-                    'answer': answer,
-                })
-            elif dataset_name in ['docvqa_val', 'infographicsvqa', 'gqa_testdev', 'ocrvqa_val', 'ocrvqa_test']:
-                outputs.append({
-                    'questionId': question_id,
-                    'answer': answer,
-                    'annotation': annotation,
-                })
-            elif dataset_name in ['ai2diagram_test']:
-                outputs.append({
-                    'image': question_id,
-                    'answer': answer,
-                    'annotation': annotation,
-                })
-            elif dataset in ['chartqa_test_human', 'chartqa_test_augmented']:
-                outputs.append({
-                    'answer': answer,
-                    'annotation': annotation,
-                })
-            elif dataset_name in ['docvqa_test']:
-                outputs.append({
-                    'questionId': question_id,
-                    'answer': answer,
-                })
-            elif dataset_name in ['vizwiz_test']:
-                outputs.append({
-                    'image': question_id,
-                    'answer': answer,
-                })
-            else:
-                raise NotImplementedError
-
-    # torch.distributed.barrier()
-
-    # world_size = torch.distributed.get_world_size()
-    # merged_outputs = [None for _ in range(world_size)]
-    # torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
-
-    merged_outputs = [json.dumps(outputs)]
-    merged_outputs = [json.loads(_) for _ in merged_outputs]
-    merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
-
-    # if torch.distributed.get_rank() == 0:
-    print(f"Evaluating {dataset_name} ...")
-    time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
-    results_file = f'{dataset}_{time_prefix}_fs{few_shot}_s{seed}.json'
-    json.dump(merged_outputs, open(results_file, 'w'), ensure_ascii=False)
-
-    if ds_collections[dataset_name]['metric'] == 'vqa_score':
-        vqa = VQA(ds_collections[dataset_name]['annotation'],
-                    ds_collections[dataset_name]['question'])
-        results = vqa.loadRes(
-            resFile=results_file,
-            quesFile=ds_collections[dataset_name]['question'])
-        vqa_scorer = VQAEval(vqa, results, n=2)
-        vqa_scorer.evaluate()
-
-        print(vqa_scorer.accuracy)
-
-    elif ds_collections[dataset_name]['metric'] == 'anls':
-        json.dump(merged_outputs,
-                    open(results_file, 'w'),
-                    ensure_ascii=False)
-        print('python infographicsvqa_eval.py -g ' +
-                ds_collections[dataset_name]['annotation'] + ' -s ' +
-                results_file)
-        os.system('python infographicsvqa_eval.py -g ' +
-                    ds_collections[dataset_name]['annotation'] + ' -s ' +
-                    results_file)
-    elif ds_collections[dataset_name]['metric'] == 'relaxed_accuracy':
-        print({
-            'relaxed_accuracy': evaluate_relaxed_accuracy(merged_outputs)
-        })
-    elif ds_collections[dataset_name]['metric'] == 'accuracy':
-        if 'gqa' in dataset:
-            for entry in merged_outputs:
-                response = entry['answer']
-                response = response.strip().split('.')[0].split(
-                    ',')[0].split('!')[0].lower()
-                if 'is ' in response:
-                    response = response.split('is ')[1]
-                if 'are ' in response:
-                    response = response.split('are ')[1]
-                if 'a ' in response:
-                    response = response.split('a ')[1]
-                if 'an ' in response:
-                    response = response.split('an ')[1]
-                if 'the ' in response:
-                    response = response.split('the ')[1]
-                if ' of' in response:
-                    response = response.split(' of')[0]
-                response = response.strip()
-                entry['answer'] = response
-        print({'accuracy': evaluate_exact_match_accuracy(merged_outputs)})
-
-    # torch.distributed.barrier()
-    
-    
-    
-
-if __name__ == "__main__":
-    import sys
-
-    import time
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_name", default="Qwen/Qwen-VL"
-    )
-    parser.add_argument(
-        "--base_model", default="Qwen/Qwen-VL"
-    )
-    parser.add_argument(
-        "--dataset_name", default="textvqa_val"
-    )
-    parser.add_argument(
-        "--eval_bs", default=4,
-    )
-    parser.add_argument(
-        "--trust_remote_code", action='store_true',
-        help="Whether to enable trust_remote_code"
-    )
-    args = parser.parse_args()
-    s = time.time()
-    evaluator = textVQA_evaluation(
-        args.model_name,
-        base_model=args.base_model,
-        dataset_name=args.dataset_name,
-        # dataset_path=args.eval_path,
-        batch_size=args.eval_bs,
-        trust_remote_code=args.trust_remote_code
-    )
-    print("cost time: ", time.time() - s)
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
deleted file mode 100644
index bbd6344921c..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
+++ /dev/null
@@ -1,102 +0,0 @@
-
-if __name__ == "__main__":
-
-    import sys
-
-    sys.path.insert(0, '../../../')
-    import time
-    import torch
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_name", default="/models/opt-125m/"
-    )
-    parser.add_argument(
-        "--eval_bs", default=4, type=int,
-    )
-    parser.add_argument(
-        "--trust_remote_code", action='store_true',
-        help="Whether to enable trust_remote_code"
-    )
-    parser.add_argument(
-        "--device", default="cpu",
-        help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation."
-    )
-    parser.add_argument(
-        "--base_model", default="Qwen/Qwen-VL"
-    )
-    parser.add_argument(
-        "--model_dtype", default=None, type=str,
-        help="force to convert the dtype, some backends supports fp16 dtype better"
-    )
-    parser.add_argument(
-        "--tasks",
-        default="textvqa_val,scienceqa_test_img",
-        help="lm-eval tasks for lm_eval version 0.4.2"
-    )
-
-    args = parser.parse_args()
-    s = time.time()
-    from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
-    from auto_round.utils import convert_dtype_torch2str
-
-    config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code)
-
-    if hasattr(config, "quantization_config"):
-        quantization_config = config.quantization_config
-        if "quant_method" in quantization_config and "auto-round" in quantization_config["quant_method"]:
-            from auto_round.auto_quantizer import AutoHfQuantizer
-        elif "quant_method" in quantization_config and quantization_config["quant_method"] == "gptq":
-            if args.device == "hpu":
-                from auto_round.auto_quantizer import AutoHfQuantizer
-    model_name = args.model_name
-    torch_dtype = torch.float
-    if args.model_dtype != None:
-        if args.model_dtype == "float16" or args.model_dtype == "fp16":
-            torch_dtype = torch.float16
-        if args.model_dtype == "bfloat16" or args.model_dtype == "bfp16":
-            torch_dtype = torch.bfloat16
-    dtype_str = convert_dtype_torch2str(torch_dtype)
-    if dtype_str == "bf16":
-        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=args.trust_remote_code, device_map=args.device, bf16=True).eval()
-    elif dtype_str == "fp16":
-        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=args.trust_remote_code, device_map=args.device, fp16=True).eval()
-    else:
-        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=args.trust_remote_code, device_map=args.device).eval()
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=args.trust_remote_code, padding_side="right", use_fast=False)
-    tokenizer.pad_token_id = tokenizer.eod_id
-    test_tasks = args.tasks
-    if isinstance(test_tasks, str):
-        test_tasks = test_tasks.split(',')
-    device = args.device
-    for dataset in test_tasks:
-        if 'vqa' in dataset:
-            from evaluate_vqa import textVQA_evaluation
-            with torch.amp.autocast(device_type=device.split(":")[0], dtype=torch_dtype):
-                evaluator = textVQA_evaluation(
-                    model,
-                    dataset_name=dataset,
-                    # dataset_path=args.eval_path,
-                    tokenizer=tokenizer,
-                    batch_size=args.eval_bs,
-                    trust_remote_code=args.trust_remote_code,
-                    device=str(device)
-                )
-        elif 'scienceqa' in dataset:
-            from evaluate_multiple_choice import scienceQA_evaluation
-            with torch.amp.autocast(device_type=device.split(":")[0], dtype=torch_dtype):
-                evaluator = scienceQA_evaluation(
-                    model,
-                    dataset_name=dataset,
-                    # dataset_path=args.eval_path,
-                    tokenizer=tokenizer,
-                    batch_size=args.eval_bs,
-                    trust_remote_code=args.trust_remote_code,
-                    device=str(device)
-                )
-
-    print("cost time: ", time.time() - s)
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa.py
deleted file mode 100644
index d3b17d00903..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa.py
+++ /dev/null
@@ -1,206 +0,0 @@
-"""Copyright (c) 2022, salesforce.com, inc.
-
-All rights reserved.
-SPDX-License-Identifier: BSD-3-Clause
-For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
-"""
-
-__author__ = 'aagrawal'
-__version__ = '0.9'
-
-# Interface for accessing the VQA dataset.
-
-# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
-# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
-
-# The following functions are defined:
-#  VQA        - VQA class that loads VQA annotation file and prepares data structures.
-#  getQuesIds - Get question ids that satisfy given filter conditions.
-#  getImgIds  - Get image ids that satisfy given filter conditions.
-#  loadQA     - Load questions and answers with the specified question ids.
-#  showQA     - Display the specified questions and answers.
-#  loadRes    - Load result file and create result object.
-
-# Help on each function can be accessed by: "help(COCO.function)"
-
-import copy
-import datetime
-import json
-
-
-class VQA:
-
-    def __init__(self, annotation_file=None, question_file=None):
-        """Constructor of VQA helper class for reading and visualizing
-        questions and answers.
-
-        :param annotation_file (str): location of VQA annotation file
-        :return:
-        """
-        # load dataset
-        self.dataset = {}
-        self.questions = {}
-        self.qa = {}
-        self.qqa = {}
-        self.imgToQA = {}
-        if not annotation_file == None and not question_file == None:
-            print('loading VQA annotations and questions into memory...')
-            time_t = datetime.datetime.utcnow()
-            dataset = json.load(open(annotation_file, 'r'))
-            questions = json.load(open(question_file, 'r'))
-            self.dataset = dataset
-            self.questions = questions
-            self.createIndex()
-
-    def createIndex(self):
-        # create index
-        print('creating index...')
-        imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
-        qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
-        qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
-        for ann in self.dataset['annotations']:
-            imgToQA[ann['image_id']] += [ann]
-            qa[ann['question_id']] = ann
-        for quest in self.questions['questions']:
-            qqa[quest['question_id']] = quest
-        print('index created!')
-
-        # create class members
-        self.qa = qa
-        self.qqa = qqa
-        self.imgToQA = imgToQA
-
-    def info(self):
-        """Print information about the VQA annotation file.
-
-        :return:
-        """
-        for key, value in self.dataset['info'].items():
-            print('%s: %s' % (key, value))
-
-    def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
-        """Get question ids that satisfy given filter conditions. default skips
-        that filter.
-
-        :param  imgIds    (int array)   : get question ids for given imgs
-                        quesTypes (str array)   : get question ids for given question types
-                        ansTypes  (str array)   : get question ids for given answer types
-        :return:    ids   (int array)   : integer array of question ids
-        """
-        imgIds = imgIds if type(imgIds) == list else [imgIds]
-        quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
-        ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
-
-        if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
-            anns = self.dataset['annotations']
-        else:
-            if not len(imgIds) == 0:
-                anns = sum(
-                    [
-                        self.imgToQA[imgId]
-                        for imgId in imgIds if imgId in self.imgToQA
-                    ],
-                    [],
-                )
-            else:
-                anns = self.dataset['annotations']
-            anns = (anns if len(quesTypes) == 0 else
-                    [ann for ann in anns if ann['question_type'] in quesTypes])
-            anns = (anns if len(ansTypes) == 0 else
-                    [ann for ann in anns if ann['answer_type'] in ansTypes])
-        ids = [ann['question_id'] for ann in anns]
-        return ids
-
-    def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
-        """Get image ids that satisfy given filter conditions. default skips
-        that filter.
-
-         :param quesIds   (int array)   : get image ids for given question ids
-        quesTypes (str array)   : get image ids for given question types
-        ansTypes  (str array)   : get image ids for given answer types
-         :return: ids     (int array)   : integer array of image ids
-        """
-        quesIds = quesIds if type(quesIds) == list else [quesIds]
-        quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
-        ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
-
-        if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
-            anns = self.dataset['annotations']
-        else:
-            if not len(quesIds) == 0:
-                anns = sum([
-                    self.qa[quesId] for quesId in quesIds if quesId in self.qa
-                ], [])
-            else:
-                anns = self.dataset['annotations']
-            anns = (anns if len(quesTypes) == 0 else
-                    [ann for ann in anns if ann['question_type'] in quesTypes])
-            anns = (anns if len(ansTypes) == 0 else
-                    [ann for ann in anns if ann['answer_type'] in ansTypes])
-        ids = [ann['image_id'] for ann in anns]
-        return ids
-
-    def loadQA(self, ids=[]):
-        """Load questions and answers with the specified question ids.
-
-        :param ids (int array)       : integer ids specifying question ids
-        :return: qa (object array)   : loaded qa objects
-        """
-        if type(ids) == list:
-            return [self.qa[id] for id in ids]
-        elif type(ids) == int:
-            return [self.qa[ids]]
-
-    def showQA(self, anns):
-        """Display the specified annotations.
-
-        :param anns (array of object): annotations to display
-        :return: None
-        """
-        if len(anns) == 0:
-            return 0
-        for ann in anns:
-            quesId = ann['question_id']
-            print('Question: %s' % (self.qqa[quesId]['question']))
-            for ann in ann['answers']:
-                print('Answer %d: %s' % (ann['answer_id'], ann['answer']))
-
-    def loadRes(self, resFile, quesFile):
-        """Load result file and return a result object.
-
-        :param   resFile (str)     : file name of result file
-        :return: res (obj)         : result api object
-        """
-        res = VQA()
-        res.questions = json.load(open(quesFile))
-        res.dataset['info'] = copy.deepcopy(self.questions['info'])
-        res.dataset['task_type'] = copy.deepcopy(self.questions['task_type'])
-        res.dataset['data_type'] = copy.deepcopy(self.questions['data_type'])
-        res.dataset['data_subtype'] = copy.deepcopy(
-            self.questions['data_subtype'])
-        res.dataset['license'] = copy.deepcopy(self.questions['license'])
-
-        print('Loading and preparing results...     ')
-        time_t = datetime.datetime.utcnow()
-        anns = json.load(open(resFile))
-        assert type(anns) == list, 'results is not an array of objects'
-        annsQuesIds = [ann['question_id'] for ann in anns]
-        assert set(annsQuesIds) == set(
-            self.getQuesIds()
-        ), 'Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is at least one question id that does not belong to the question ids in the annotation file.'
-        for ann in anns:
-            quesId = ann['question_id']
-            if res.dataset['task_type'] == 'Multiple Choice':
-                assert (
-                    ann['answer'] in self.qqa[quesId]['multiple_choices']
-                ), 'predicted answer is not one of the multiple choices'
-            qaAnn = self.qa[quesId]
-            ann['image_id'] = qaAnn['image_id']
-            ann['question_type'] = qaAnn['question_type']
-            ann['answer_type'] = qaAnn['answer_type']
-        print('DONE (t=%0.2fs)' %
-              ((datetime.datetime.utcnow() - time_t).total_seconds()))
-
-        res.dataset['annotations'] = anns
-        res.createIndex()
-        return res
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa_eval.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa_eval.py
deleted file mode 100644
index 218719e3126..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa_eval.py
+++ /dev/null
@@ -1,330 +0,0 @@
-"""Copyright (c) 2022, salesforce.com, inc.
-
-All rights reserved.
-SPDX-License-Identifier: BSD-3-Clause
-For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
-"""
-
-# coding=utf-8
-
-__author__ = 'aagrawal'
-
-import re
-# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
-# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
-import sys
-
-
-class VQAEval:
-
-    def __init__(self, vqa=None, vqaRes=None, n=2):
-        self.n = n
-        self.accuracy = {}
-        self.evalQA = {}
-        self.evalQuesType = {}
-        self.evalAnsType = {}
-        self.vqa = vqa
-        self.vqaRes = vqaRes
-        if vqa is not None:
-            self.params = {'question_id': vqa.getQuesIds()}
-        self.contractions = {
-            'aint': "ain't",
-            'arent': "aren't",
-            'cant': "can't",
-            'couldve': "could've",
-            'couldnt': "couldn't",
-            "couldn'tve": "couldn't've",
-            "couldnt've": "couldn't've",
-            'didnt': "didn't",
-            'doesnt': "doesn't",
-            'dont': "don't",
-            'hadnt': "hadn't",
-            "hadnt've": "hadn't've",
-            "hadn'tve": "hadn't've",
-            'hasnt': "hasn't",
-            'havent': "haven't",
-            'hed': "he'd",
-            "hed've": "he'd've",
-            "he'dve": "he'd've",
-            'hes': "he's",
-            'howd': "how'd",
-            'howll': "how'll",
-            'hows': "how's",
-            "Id've": "I'd've",
-            "I'dve": "I'd've",
-            'Im': "I'm",
-            'Ive': "I've",
-            'isnt': "isn't",
-            'itd': "it'd",
-            "itd've": "it'd've",
-            "it'dve": "it'd've",
-            'itll': "it'll",
-            "let's": "let's",
-            'maam': "ma'am",
-            'mightnt': "mightn't",
-            "mightnt've": "mightn't've",
-            "mightn'tve": "mightn't've",
-            'mightve': "might've",
-            'mustnt': "mustn't",
-            'mustve': "must've",
-            'neednt': "needn't",
-            'notve': "not've",
-            'oclock': "o'clock",
-            'oughtnt': "oughtn't",
-            "ow's'at": "'ow's'at",
-            "'ows'at": "'ow's'at",
-            "'ow'sat": "'ow's'at",
-            'shant': "shan't",
-            "shed've": "she'd've",
-            "she'dve": "she'd've",
-            "she's": "she's",
-            'shouldve': "should've",
-            'shouldnt': "shouldn't",
-            "shouldnt've": "shouldn't've",
-            "shouldn'tve": "shouldn't've",
-            "somebody'd": 'somebodyd',
-            "somebodyd've": "somebody'd've",
-            "somebody'dve": "somebody'd've",
-            'somebodyll': "somebody'll",
-            'somebodys': "somebody's",
-            'someoned': "someone'd",
-            "someoned've": "someone'd've",
-            "someone'dve": "someone'd've",
-            'someonell': "someone'll",
-            'someones': "someone's",
-            'somethingd': "something'd",
-            "somethingd've": "something'd've",
-            "something'dve": "something'd've",
-            'somethingll': "something'll",
-            'thats': "that's",
-            'thered': "there'd",
-            "thered've": "there'd've",
-            "there'dve": "there'd've",
-            'therere': "there're",
-            'theres': "there's",
-            'theyd': "they'd",
-            "theyd've": "they'd've",
-            "they'dve": "they'd've",
-            'theyll': "they'll",
-            'theyre': "they're",
-            'theyve': "they've",
-            'twas': "'twas",
-            'wasnt': "wasn't",
-            "wed've": "we'd've",
-            "we'dve": "we'd've",
-            'weve': "we've",
-            'werent': "weren't",
-            'whatll': "what'll",
-            'whatre': "what're",
-            'whats': "what's",
-            'whatve': "what've",
-            'whens': "when's",
-            'whered': "where'd",
-            'wheres': "where's",
-            'whereve': "where've",
-            'whod': "who'd",
-            "whod've": "who'd've",
-            "who'dve": "who'd've",
-            'wholl': "who'll",
-            'whos': "who's",
-            'whove': "who've",
-            'whyll': "why'll",
-            'whyre': "why're",
-            'whys': "why's",
-            'wont': "won't",
-            'wouldve': "would've",
-            'wouldnt': "wouldn't",
-            "wouldnt've": "wouldn't've",
-            "wouldn'tve": "wouldn't've",
-            'yall': "y'all",
-            "yall'll": "y'all'll",
-            "y'allll": "y'all'll",
-            "yall'd've": "y'all'd've",
-            "y'alld've": "y'all'd've",
-            "y'all'dve": "y'all'd've",
-            'youd': "you'd",
-            "youd've": "you'd've",
-            "you'dve": "you'd've",
-            'youll': "you'll",
-            'youre': "you're",
-            'youve': "you've",
-        }
-        self.manualMap = {
-            'none': '0',
-            'zero': '0',
-            'one': '1',
-            'two': '2',
-            'three': '3',
-            'four': '4',
-            'five': '5',
-            'six': '6',
-            'seven': '7',
-            'eight': '8',
-            'nine': '9',
-            'ten': '10',
-        }
-        self.articles = ['a', 'an', 'the']
-
-        self.periodStrip = re.compile('(?!<=\d)(\.)(?!\d)')
-        self.commaStrip = re.compile('(\d)(,)(\d)')
-        self.punct = [
-            ';',
-            r'/',
-            '[',
-            ']',
-            '"',
-            '{',
-            '}',
-            '(',
-            ')',
-            '=',
-            '+',
-            '\\',
-            '_',
-            '-',
-            '>',
-            '<',
-            '@',
-            '`',
-            ',',
-            '?',
-            '!',
-        ]
-
-    def evaluate(self, quesIds=None):
-        if quesIds == None:
-            quesIds = [quesId for quesId in self.params['question_id']]
-        gts = {}
-        res = {}
-        for quesId in quesIds:
-            gts[quesId] = self.vqa.qa[quesId]
-            res[quesId] = self.vqaRes.qa[quesId]
-
-        # =================================================
-        # Compute accuracy
-        # =================================================
-        accQA = []
-        accQuesType = {}
-        accAnsType = {}
-        print('computing accuracy')
-        step = 0
-        for quesId in quesIds:
-            resAns = res[quesId]['answer']
-            resAns = resAns.replace('\n', ' ')
-            resAns = resAns.replace('\t', ' ')
-            resAns = resAns.strip()
-            resAns = self.processPunctuation(resAns)
-            resAns = self.processDigitArticle(resAns)
-            gtAcc = []
-            gtAnswers = [ann['answer'] for ann in gts[quesId]['answers']]
-            if len(set(gtAnswers)) > 1:
-                for ansDic in gts[quesId]['answers']:
-                    ansDic['answer'] = self.processPunctuation(
-                        ansDic['answer'])
-            for gtAnsDatum in gts[quesId]['answers']:
-                otherGTAns = [
-                    item for item in gts[quesId]['answers']
-                    if item != gtAnsDatum
-                ]
-                matchingAns = [
-                    item for item in otherGTAns if item['answer'] == resAns
-                ]
-                acc = min(1, float(len(matchingAns)) / 3)
-                gtAcc.append(acc)
-            quesType = gts[quesId]['question_type']
-            ansType = gts[quesId]['answer_type']
-            avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
-            accQA.append(avgGTAcc)
-            if quesType not in accQuesType:
-                accQuesType[quesType] = []
-            accQuesType[quesType].append(avgGTAcc)
-            if ansType not in accAnsType:
-                accAnsType[ansType] = []
-            accAnsType[ansType].append(avgGTAcc)
-            self.setEvalQA(quesId, avgGTAcc)
-            self.setEvalQuesType(quesId, quesType, avgGTAcc)
-            self.setEvalAnsType(quesId, ansType, avgGTAcc)
-            if step % 100 == 0:
-                self.updateProgress(step / float(len(quesIds)))
-            step = step + 1
-
-        self.setAccuracy(accQA, accQuesType, accAnsType)
-        print('Done computing accuracy')
-
-    def processPunctuation(self, inText):
-        outText = inText
-        for p in self.punct:
-            if (p + ' ' in inText or ' ' + p
-                    in inText) or (re.search(self.commaStrip, inText) != None):
-                outText = outText.replace(p, '')
-            else:
-                outText = outText.replace(p, ' ')
-        outText = self.periodStrip.sub('', outText, re.UNICODE)
-        return outText
-
-    def processDigitArticle(self, inText):
-        outText = []
-        tempText = inText.lower().split()
-        for word in tempText:
-            word = self.manualMap.setdefault(word, word)
-            if word not in self.articles:
-                outText.append(word)
-            else:
-                pass
-        for wordId, word in enumerate(outText):
-            if word in self.contractions:
-                outText[wordId] = self.contractions[word]
-        outText = ' '.join(outText)
-        return outText
-
-    def setAccuracy(self, accQA, accQuesType, accAnsType):
-        self.accuracy['overall'] = round(100 * float(sum(accQA)) / len(accQA),
-                                         self.n)
-        self.accuracy['perQuestionType'] = {
-            quesType: round(
-                100 * float(sum(accQuesType[quesType])) /
-                len(accQuesType[quesType]),
-                self.n,
-            )
-            for quesType in accQuesType
-        }
-        self.accuracy['perAnswerType'] = {
-            ansType: round(
-                100 * float(sum(accAnsType[ansType])) /
-                len(accAnsType[ansType]), self.n)
-            for ansType in accAnsType
-        }
-
-    def setEvalQA(self, quesId, acc):
-        self.evalQA[quesId] = round(100 * acc, self.n)
-
-    def setEvalQuesType(self, quesId, quesType, acc):
-        if quesType not in self.evalQuesType:
-            self.evalQuesType[quesType] = {}
-        self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
-
-    def setEvalAnsType(self, quesId, ansType, acc):
-        if ansType not in self.evalAnsType:
-            self.evalAnsType[ansType] = {}
-        self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
-
-    def updateProgress(self, progress):
-        barLength = 20
-        status = ''
-        if isinstance(progress, int):
-            progress = float(progress)
-        if not isinstance(progress, float):
-            progress = 0
-            status = 'error: progress var must be float\r\n'
-        if progress < 0:
-            progress = 0
-            status = 'Halt...\r\n'
-        if progress >= 1:
-            progress = 1
-            status = 'Done...\r\n'
-        block = int(round(barLength * progress))
-        text = '\rFinshed Percent: [{0}] {1}% {2}'.format(
-            '#' * block + '-' * (barLength - block), int(progress * 100),
-            status)
-        sys.stdout.write(text)
-        sys.stdout.flush()
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/requirements.txt
deleted file mode 100644
index a76b24c8e9e..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/requirements.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-transformers==4.45.2
-torch
-tiktoken
-transformers_stream_generator
-peft
-sentencepiece
-einops
-accelerate
-datasets
-protobuf
-auto-gptq
-openpyxl
-wandb
-py-cpuinfo
-# for Qwen2-VL
-Pillow
-qwen_vl_utils
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_autoround.sh
deleted file mode 100644
index 34cd76065b9..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_autoround.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-set -x
-
-function main {
-
-  init_params "$@"
-  run_tuning
-
-}
-
-# init params
-function init_params {
-  for var in "$@"
-  do
-    case $var in
-      --model_name=*)
-          model_name=$(echo $var |cut -f2 -d=)
-      ;;
-      --image_folder=*)
-          image_folder=$(echo $var |cut -f2 -d=)
-      ;;
-      --question_file=*)
-          question_file=$(echo $var |cut -f2 -d=)
-      ;;
-      --output_dir=*)
-          output_dir=$(echo $var |cut -f2 -d=)
-      ;;
-      *)
-          echo "Error: No such parameter: ${var}"
-          exit 1
-      ;;
-    esac
-  done
-
-}
-
-# run_tuning
-function run_tuning {
-    python main.py \
-            --model_name ${model_name} \
-            --bits 4 \
-            --group_size 128 \
-            --iters 200 \
-            --seqlen 512 \
-            --disable_quanted_input \
-            --quantize \
-            --image_folder ${image_folder} \
-            --question_file ${question_file} \
-            --output_dir ${output_dir}
-}
-
-main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh
deleted file mode 100644
index 7bc295f46c7..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-set -x
-
-function main {
-
-  init_params "$@"
-  run_evaluation
-
-}
-
-# init params
-function init_params {
-  for var in "$@"
-  do
-    case $var in
-      --model_name=*)
-          model_name=$(echo $var |cut -f2 -d=)
-      ;;
-      *)
-          echo "Error: No such parameter: ${var}"
-          exit 1
-      ;;
-    esac
-  done
-
-}
-
-# run_evaluation
-function run_evaluation {
-    python main.py \
-        --accuracy \
-        --model_name ${model_name} \
-        --eval_bs 4
-}
-
-main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py
new file mode 100644
index 00000000000..e42e6f90226
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py
@@ -0,0 +1,569 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import argparse
+import json
+
+import torch
+import transformers
+
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+torch.use_deterministic_algorithms(True, warn_only=True)
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoProcessor
+
+from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
+                                                    get_layer_names_in_block,
+                                                    detect_device,
+                                                    find_matching_blocks,
+                                                    to_device,
+                                                    to_dtype
+                                                    )
+from neural_compressor.torch.quantization import (AutoRoundConfig,
+                                                    prepare,
+                                                    convert,
+                                                    load)
+
+def set_nontext_module_config(model, to_quant_block_names, quant_config):
+    """Give every layer of the blocks outside ``to_quant_block_names`` an fp32 local config."""
+    every_block = {tuple(block) for block in get_multimodal_block_names(model, quant_vision=True)}
+    wanted = {tuple(block) for block in to_quant_block_names}
+    skipped_blocks = list(every_block - wanted)
+    for layer_name in get_layer_names_in_block(model, to_quant_block_names=skipped_blocks):
+        quant_config.set_local(layer_name, AutoRoundConfig(dtype="fp32"))
+
+    # skip layers not in blocks: these module names get the same fp32 local config
+    for module_name in ("model.vision_embed_tokens.img_projection*",
+                        "transformer.visual.attn_pool.*_proj",
+                        "model.mm_projector*",
+                        "multi_modal_projector",
+                        "visual.merger"):
+        quant_config.set_local(module_name, AutoRoundConfig(dtype="fp32"))
+
+
+@torch.no_grad()
+def run_fn(model, dataloader, **kargs):
+    """Run ``model`` on every batch of ``dataloader`` with gradients disabled."""
+    for batch in dataloader:
+        if isinstance(batch, dict):
+            model(**batch)  # mapping batch: expand into keyword arguments
+            continue
+        # tuple/list batches are expanded positionally; anything else is passed whole
+        model(*(batch if isinstance(batch, (tuple, list)) else (batch,)))
+
+
+class BasicArgumentParser(argparse.ArgumentParser):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.add_argument("--model", "--model_name", "--model_name_or_path",
+                          default="Qwen/Qwen2-VL-2B-Instruct",
+                          help="model name or path")
+
+        self.add_argument('--eval', action='store_true',
+                          help="whether to use eval only mode.")
+
+        self.add_argument("--bits", default=4, type=int,
+                          help="weight bits")
+        
+        self.add_argument("--quantize", action="store_true")
+
+        self.add_argument("--eval_bs", default=None, type=int,
+                          help="batch size in evaluation")
+
+        self.add_argument("--device", "--devices", default="auto", type=str,
+                          help="the device to be used for tuning. The default is set to auto,"
+                               "allowing for automatic detection."
+                               "Currently, device settings support CPU, GPU, and HPU.")
+
+        self.add_argument("--asym", action='store_true',
+                          help="whether to use asym quantization")
+
+        self.add_argument("--dataset", type=str, default=None,
+                          help="the dataset for quantization training."
+                               " current support NeelNanda/pile-10k,llava_conv_58k,llava_instruct_80k "
+                               "It can be a custom one. Default is NeelNanda/pile-10k")
+
+        self.add_argument("--lr", default=None, type=float,
+                          help="learning rate, if None, it will be set to 1.0/iters automatically")
+
+        self.add_argument("--minmax_lr", default=None, type=float,
+                          help="minmax learning rate, if None,it will beset to be the same with lr")
+
+        self.add_argument("--seed", default=42, type=int,
+                          help="random seed")
+
+        self.add_argument("--adam", action='store_true',
+                          help="whether to use adam optimizer instead of SignSGD")
+
+        self.add_argument("--gradient_accumulate_steps", default=1, type=int,
+                          help="gradient accumulate steps")
+
+        self.add_argument("--nblocks", default=1, type=int,
+                          help="how many blocks to tune together")
+
+        self.add_argument("--low_gpu_mem_usage", action='store_true',
+                          help="offload intermediate features to cpu")
+
+        self.add_argument("--export_format", default="auto_round:gptq", type=str,
+                          help="the format to save the model"
+                          )
+
+        self.add_argument("--data_type", "--dtype", default='int',
+                          help="data type for tuning, 'int', 'mx_fp' and etc")
+
+        self.add_argument("--scale_dtype", default='fp16', choices=["fp16", "float16",
+                                                                    "bf16", "bfloat16", "fp32", "float32"],
+                          help="scale data type to use for quantization")
+
+        self.add_argument("--output_dir", default="./tmp_autoround", type=str,
+                          help="the directory to save quantized model")
+
+        self.add_argument("--disable_amp", action='store_true',
+                          help="disable amp")
+
+        self.add_argument("--disable_minmax_tuning", action='store_true',
+                          help="whether disable enable weight minmax tuning")
+
+        self.add_argument("--enable_norm_bias_tuning", action='store_true',
+                          help="whether enable norm bias tuning")
+
+        self.add_argument("--disable_trust_remote_code", action='store_true',
+                          help="whether to disable trust_remote_code")
+
+        self.add_argument("--disable_quanted_input", action='store_true',
+                          help="whether to disuse the output of quantized block to tune the next block")
+
+        self.add_argument("--quant_lm_head", action='store_true',
+                          help="whether to quant lm_head")
+
+        self.add_argument("--low_cpu_mem_mode", default=0, type=int, choices=[0, 1, 2],
+                          help="choose which low cpu memory mode to use. "
+                               "Can significantly reduce cpu memory footprint but cost more time."
+                               "1 means choose block-wise mode, load the weights of each block"
+                               " from disk when tuning and release the memory of the block after tuning."
+                               "2 means choose layer-wise mode, load the weights of each layer from disk when tuning,"
+                               " minimum memory consumption and also slowest running speed."
+                               "others means not use low cpu memory. Default to 0, not use low cpu memory.")
+
+        self.add_argument("--low_cpu_mem_tmp_dir", default=None, type=str,
+                          help="temporary work space to store the temporary files "
+                               "when using low cpu memory mode. Will remove after tuning.")
+
+        self.add_argument("--model_dtype", default=None, type=str, choices=["fp16", "float16",
+                                                                            "bf16", "bfloat16", "fp32", "float32"],
+                          help="force to convert the dtype, some backends supports fp16 dtype better")
+
+        self.add_argument("--act_bits", default=32, type=int,
+                          help="activation bits")
+
+        self.add_argument("--fp_layers", default="", type=str,
+                          help="layers to maintain original data type")
+
+        self.add_argument("--not_use_best_mse", action='store_true',
+                          help="whether to use the iter of best mes loss in the tuning phase")
+
+        self.add_argument("--enable_torch_compile", default=None, type=bool,
+                          help="whether to enable torch compile")
+
+        ## ======================= VLM =======================
+        self.add_argument("--quant_nontext_module", action='store_true',
+                          help="whether to quantize non-text module, e.g. vision component")
+
+        self.add_argument("--extra_data_dir", default=None, type=str,
+                          help="dataset dir for storing images/audio/videos. "
+                               "Can be a dir path or multiple dir path with format as "
+                               "'image=path_to_image,video=path_to_video,audio=path_to_audio'"
+                               "By default, it will search in the relative path, "
+                               "and if not find, will automatic download.")
+
+        self.add_argument("--template", default=None, type=str,
+                          help="the template for building training dataset. It can be a custom one.")
+
+        self.add_argument("--truncation", action="store_true",
+                          help="whether to truncate sequences at the maximum length."
+                               " Default True for pile and False for llava dataset.")
+
+        self.add_argument("--to_quant_block_names", default=None, type=str,
+                          help="Names of quantitative blocks, please use commas to separate them.")
+
+
+def setup_parser():
+    """Build the quantization parser and parse the command line.
+
+    Adds the tuning-size options on top of whatever ``BasicArgumentParser`` declares.
+
+    Returns:
+        The namespace produced by ``parse_args()``.
+    """
+    cli = BasicArgumentParser()
+
+    # Registration order is kept unchanged; it is the order shown by ``--help``.
+    cli.add_argument("--group_size", default=128, type=int, help="group size")
+    cli.add_argument("--batch_size", "--train_bs", "--bs", default=8, type=int, help="train batch size")
+    cli.add_argument("--iters", "--iter", default=200, type=int, help=" iters")
+    cli.add_argument(
+        "--seqlen", "--seq_len", default=None, type=int,
+        help="sequence length, default 2048 for text-only, 512 for liuhaotian/llava")
+    cli.add_argument("--nsamples", default=128, type=int, help="number of samples")
+
+    return cli.parse_args()
+
+
+def tune(args):
+    """Quantize a multimodal model with AutoRound and save it to ``args.output_dir``.
+
+    Args:
+        args: Parsed command-line namespace, e.g. the result of ``setup_parser()``.
+
+    Raises:
+        ValueError: Bad ``--device`` index, or ``--quant_lm_head`` with a non-auto_round export format.
+        RuntimeError: If casting the model to ``--model_dtype`` fails.
+    """
+    model_name = args.model
+    if model_name[-1] == "/":
+        model_name = model_name[:-1]
+    print(f"start to quantize {model_name}")
+
+    devices = args.device.replace(" ", "").split(',')
+    use_auto_mapping = False
+
+    if all(s.isdigit() for s in devices):
+        if "CUDA_VISIBLE_DEVICES" in os.environ:
+            current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
+            indices = [int(device) for device in devices]
+            try:
+                pick_device = [current_visible_devices[i] for i in indices]
+            except IndexError as err:
+                raise ValueError(
+                    "Invalid '--device' value: It must be smaller than the number of available devices. "
+                    "For example, with CUDA_VISIBLE_DEVICES=4,5, "
+                    "--device 0,1 is valid, but --device 4,5 is not supported.") from err
+            os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(pick_device)
+        else:
+            os.environ["CUDA_VISIBLE_DEVICES"] = args.device
+            args.device = ",".join(map(str, range(len(devices))))
+            devices = args.device.replace(" ", "").split(',')
+        use_auto_mapping = True
+
+    device_str = detect_device(devices[0])
+
+    torch_dtype = "auto"
+    if "hpu" in device_str:
+        torch_dtype = torch.bfloat16
+
+    # load_model
+    processor, image_processor = None, None
+    if "llava" in model_name:
+        from llava.model.builder import load_pretrained_model  # pylint: disable=E0401
+        tokenizer, model, image_processor, _ = load_pretrained_model(
+            model_name, model_base=None, model_name=model_name,
+            torch_dtype=torch_dtype)
+        model_type = "llava"
+    else:
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
+        model_type = config.model_type
+        if "qwen2_vl" in model_type:
+            from transformers import Qwen2VLForConditionalGeneration
+            cls = Qwen2VLForConditionalGeneration
+        elif "mllama" in model_type:
+            from transformers import MllamaForConditionalGeneration
+            cls = MllamaForConditionalGeneration
+        else:
+            cls = AutoModelForCausalLM
+
+        kargs = {'attn_implementation': 'eager'} if "phi3_v" in model_type else {}
+        model = cls.from_pretrained(
+            model_name, trust_remote_code=not args.disable_trust_remote_code, torch_dtype=torch_dtype,
+            device_map="auto" if use_auto_mapping else None, **kargs)
+
+    if "cogvlm2" in model_name:
+        model.config.model_type = "cogvlm2"
+
+    from neural_compressor.torch.algorithms.weight_only.autoround import get_mllm_dataloader
+
+    model = model.eval()
+
+    if args.model_dtype is not None:
+        try:
+            if args.model_dtype in ("float16", "fp16"):
+                model = model.to(torch.float16)
+            elif args.model_dtype in ("bfloat16", "bfp16", "bf16"):
+                model = model.to(torch.bfloat16)
+            elif args.model_dtype in ("float32", "fp32"):
+                model = model.to(torch.float32)
+        except Exception as err:
+            raise RuntimeError("please use more device to fit the device or just use one device") from err
+
+    all_blocks = get_multimodal_block_names(model, args.quant_nontext_module)
+    to_quant_block_names = find_matching_blocks(model, all_blocks, args.to_quant_block_names)
+
+    # Done before building the dataloader so an absent flag (None) selects the per-dataset truncation default.
+    if "--truncation" not in sys.argv:
+        args.truncation = None
+
+    # TODO check dataset?
+    dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples = get_mllm_dataloader(
+        model=model,
+        tokenizer=tokenizer,
+        template=getattr(args, "template", None),
+        dataset=args.dataset,
+        extra_data_dir=args.extra_data_dir,
+        seqlen=args.seqlen,
+        batch_size=args.batch_size,
+        split=None,
+        apply_template=None,
+        truncation=args.truncation,
+        seed=args.seed,
+        nsamples=args.nsamples,
+        gradient_accumulate_steps=args.gradient_accumulate_steps,
+        quant_nontext_module=args.quant_nontext_module,
+        processor=processor,
+        image_processor=image_processor,
+    )
+    quant_config = AutoRoundConfig(
+        is_mllm=True,
+        bits=args.bits,
+        use_sym=not args.asym,
+        group_size=args.group_size,
+        nsamples=nsamples,
+        batch_size=batch_size,
+        iters=args.iters,
+        seqlen=seqlen,
+        quant_nontext_module=args.quant_nontext_module,
+        truncation=truncation,
+        gradient_accumulate_steps=gradient_accumulate_steps,
+        nblocks=args.nblocks,
+        lr=args.lr,
+        minmax_lr=args.minmax_lr,
+        enable_quanted_input=not args.disable_quanted_input,
+        seed=args.seed,
+        scale_dtype=args.scale_dtype,
+        enable_minmax_tuning=not args.disable_minmax_tuning,
+        act_bits=args.act_bits,
+        to_quant_block_names=to_quant_block_names,
+        export_format=args.export_format
+    )
+
+    export_format = args.export_format
+    if args.fp_layers != "":
+        fp_layers = args.fp_layers.replace(" ", "").split(",")
+        for n, m in model.named_modules():
+            if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
+                continue
+            for fp_layer in fp_layers:
+                if fp_layer in n:
+                    quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
+                    print(f"{n} will not be quantized.")
+
+    for n, m in model.named_modules():
+        if isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
+            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
+                quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
+                print(
+                    f"{n} will not be quantized due to its shape not being divisible by 32,"
+                    " resulting in an exporting issue to autogptq")
+
+    lm_head_layer_name = "lm_head"
+    for n, _ in model.named_modules():
+        lm_head_layer_name = n
+
+    if args.quant_lm_head:
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
+        if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"):
+            tied_keys = model._tied_weights_keys
+            for item in tied_keys:
+                if lm_head_layer_name in item:  ##TODO extend to encoder-decoder layer, seq classification model
+                    args.quant_lm_head = False
+                    print(
+                        f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
+                        f"supported currently")
+                    break
+
+    if not args.quant_lm_head:
+        quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
+    else:
+        if "auto_round" not in export_format:
+            raise ValueError(
+                f"{export_format} is not supported for lm-head quantization, please change to {auto_round_formats}")
+
+    if args.quant_lm_head and args.low_gpu_mem_usage:
+        print(f"warning, low_gpu_mem_usage=False is strongly recommended if the whole model could be loaded to "
+              f"gpu")
+
+    user_model = prepare(model=model, quant_config=quant_config)
+    run_fn(user_model, dataloader)
+    user_model = convert(user_model)
+
+    model.eval()
+    if args.device != "cpu":
+        torch.cuda.empty_cache()
+
+    from neural_compressor.torch.utils import (LoadFormat,)
+    kargs = {'safe_serialization': False} if "phi3_v" in model_type else {}  # bool, not the truthy str 'False'
+    user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE, **kargs)
+    if tokenizer is not None:
+        tokenizer.save_pretrained(args.output_dir)
+    if processor is not None and hasattr(processor, 'chat_template'): # Avoiding phi-3.5-vision save errors
+        processor.save_pretrained(args.output_dir)
+
+
+
+def setup_mllm_eval_parser():
+    """Parse the command-line options consumed by ``mllm_eval`` (VLMEvalKit-style evaluation).
+
+    Returns:
+        The parsed argument namespace.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", "--model_name", "--model_name_or_path", help="model name or path")
+    parser.add_argument("--tasks", type=str,
+                        default="MMBench_DEV_EN_V11,ScienceQA_VAL,TextVQA_VAL,POPE",
+                        help="eval tasks for VLMEvalKit.")
+    # Args that only apply to Video Dataset
+    parser.add_argument("--nframe", type=int, default=8,
+                        help="the number of frames to sample from a video,"
+                            " only applicable to the evaluation of video benchmarks.")
+    parser.add_argument("--pack", action='store_true',
+                        help="a video may associate with multiple questions, if pack==True,"
+                            " will ask all questions for a video in a single query")
+    parser.add_argument("--fps", type=float, default=-1, help="set the fps for a video.")
+    # Infer + Eval or Infer Only
+    parser.add_argument("--mode", type=str, default='all', choices=['all', 'infer'],
+                        help="when mode set to 'all', will perform both inference and evaluation;"
+                            " when set to 'infer' will only perform the inference.")
+    parser.add_argument('--eval_data_dir', type=str, default=None,
+                        help='path for VLMEvalKit to store the eval data. Default will store in ~/LMUData')
+    # API Kwargs, Apply to API VLMs and Judge API LLMs
+    parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs')
+    # Explicitly Set the Judge Model
+    parser.add_argument('--judge', type=str, default=None,
+                        help="explicitly set the judge model.")
+    # Logging Utils
+    parser.add_argument('--verbose', action='store_true', help="whether to display verbose information.")
+    # Configuration for Resume
+    # Ignore: will not rerun failed VLM inference
+    parser.add_argument('--ignore', action='store_true',
+                        help='ignore failed indices. ')
+    # Rerun: will remove all evaluation temp files
+    parser.add_argument('--rerun', action='store_true',
+                        help="if true, will remove all evaluation temp files and rerun.")
+    parser.add_argument("--output_dir", default="./eval_result", type=str,
+                        help="the directory to save evaluation results")
+    return parser.parse_args()
+
+
+def setup_lmms_parser():
+    """Parse the command-line options consumed by ``lmms_eval``.
+
+    Returns:
+        The parsed argument namespace.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", "--model_name", "--model_name_or_path", help="model name or path")
+    parser.add_argument(
+        "--tasks",
+        default="pope,textvqa_val,scienceqa,mmbench_en",
+        help="To get full list of tasks, use the command lmms-eval --tasks list",
+    )
+    parser.add_argument("--output_dir", default="./eval_result", type=str,
+                        help="the directory to save evaluation results")
+    parser.add_argument("--num_fewshot", type=int, default=None,
+                        help="Number of examples in few-shot context")
+    # NOTE: the default stays the int 1, while values given on the command line arrive as str.
+    parser.add_argument(
+        "--batch_size",
+        "-b",
+        type=str,
+        default=1,
+        metavar="auto|auto:N|N",
+        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
+    )
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Maximal batch size to try with --batch_size auto.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default=None,
+        help="Device to use (e.g. cuda, cuda:0, cpu)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. " "If <1, limit is a percentage of the total"
+             " number of examples.",
+    )
+    return parser.parse_args()
+
+
+def mllm_eval(args):
+    """Run the ``mllm_eval`` evaluation from neural_compressor on ``args.model``.
+
+    Args:
+        args: Namespace from ``setup_mllm_eval_parser()``; a comma-separated ``tasks``
+            string is replaced in place by the list of task names.
+    """
+    tasks = args.tasks
+    if isinstance(tasks, str):
+        args.tasks = tasks.replace(' ', '').split(',')
+    from neural_compressor.torch.algorithms.weight_only.autoround import mllm_eval as run_eval
+
+    options = dict(
+        work_dir=args.output_dir, data_store_dir=args.eval_data_dir, dataset=args.tasks,
+        pack=args.pack, fps=args.fps, nframe=args.nframe, rerun=args.rerun,
+        judge=args.judge, verbose=args.verbose, mode=args.mode, ignore=args.ignore,
+    )
+    run_eval(args.model, **options)
+
+def lmms_eval(args):
+    """Run the ``lmms_eval`` evaluation from neural_compressor and return its results.
+
+    Args:
+        args: Namespace from ``setup_lmms_parser()``.
+    """
+    from neural_compressor.torch.algorithms.weight_only.autoround import lmms_eval as run_eval
+
+    return run_eval(
+        model=args.model, tasks=args.tasks, output_dir=args.output_dir,
+        num_fewshot=args.num_fewshot, limit=args.limit,
+        batch_size=args.batch_size, max_batch_size=args.max_batch_size,
+        device=args.device,
+        use_cache=None, apply_chat_template=False,
+    )
+
+
+if __name__ == '__main__':
+    if "--quantize" in sys.argv:  # quantization mode; the flag is left in sys.argv
+        args = setup_parser()
+        tune(args)
+    elif "--accuracy" in sys.argv:  # evaluation mode
+        sys.argv.remove("--accuracy")  # strip the mode flag: the eval parsers do not declare it
+        from neural_compressor.torch.quantization import load  # NOTE(review): not used in this block
+        if "--lmms" in sys.argv:  # lmms_eval backend
+            sys.argv.remove("--lmms")
+            args = setup_lmms_parser()
+            lmms_eval(args)
+        else:  # default backend: mllm_eval, with or without an explicit --mllm_eval
+            if "--mllm_eval" in sys.argv:
+                sys.argv.remove("--mllm_eval")
+            args = setup_mllm_eval_parser()
+            mllm_eval(args)
diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
index 2415e09117b..7a7285a15ee 100644
--- a/neural_compressor/torch/algorithms/weight_only/autoround.py
+++ b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -35,6 +35,7 @@ def _is_auto_round_available():
 
 from auto_round import AutoRound, AutoRoundMLLM  # pylint: disable=E0401
 from auto_round.export.export_to_itrex.export import pack_model  # pylint: disable=E0401
+from auto_round.mllm import lmms_eval, mllm_eval
 from auto_round.mllm.template import Template, get_template
 
 from neural_compressor.torch.algorithms import Quantizer
@@ -339,20 +340,20 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
 
 
 def get_mllm_dataloader(
-    template,
     model,
     tokenizer,
+    template=None,
     processor=None,
     image_processor=None,
-    dataset="liuhaotian/llava_conv_58k",
+    dataset=None,
     extra_data_dir=None,
-    seqlen=512,
-    bs=1,
+    seqlen=None,
+    batch_size=8,
     split=None,
     apply_template=None,
-    truncation=False,
+    truncation=None,
     seed=42,
-    nsamples=512,
+    nsamples=128,
     gradient_accumulate_steps=1,
     quant_nontext_module=False,
 ):
@@ -377,6 +378,11 @@ def get_mllm_dataloader(
     from auto_round.mllm.autoround_mllm import _only_text_test
     from auto_round.mllm.mllm_dataset import get_mllm_dataloader  # pylint: disable=E0401
 
+    template = template if template is not None else model.config.model_type
+    template = get_template(
+        template, model=model, tokenizer=tokenizer, processor=processor, image_processor=image_processor
+    )
+    dataset = template.default_dataset if dataset is None else dataset
     if quant_nontext_module or (dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer)):
         if quant_nontext_module:
             logger.warning(
@@ -389,16 +395,19 @@ def get_mllm_dataloader(
                 " will use liuhaotian/llava_conv_58k with default config as an alternative."
             )
         dataset = "liuhaotian/llava_conv_58k"
+        seqlen = 512 if seqlen is None else seqlen
         truncation = False
+        gradient_accumulate_steps = batch_size * gradient_accumulate_steps
         batch_size = 1
-        gradient_accumulate_steps = 4
-        seqlen = 512
 
+    seqlen = 2048 if seqlen is None else seqlen  # set text only calibration default args
+    truncation = True if truncation is None else truncation
     dataset = dataset.replace(" ", "")
-    template = template if template is not None else model.config.model_type
-    template = get_template(
-        template, model=model, tokenizer=tokenizer, processor=processor, image_processor=image_processor
-    )
+
+    if nsamples % batch_size != 0:
+        nsamples = (nsamples // batch_size + 1) * batch_size
+        logger.warning(f"'nsamples' is not divisible by 'batch_size', will adjusted to {nsamples}")
+
     dataloader, batch_size, gradient_accumulate_steps = get_mllm_dataloader(
         template=template,
         model=model,
@@ -407,11 +416,11 @@ def get_mllm_dataloader(
         dataset=dataset,
         extra_data_dir=extra_data_dir,
         seqlen=seqlen,
-        bs=bs,
+        bs=batch_size,
         seed=seed,
         truncation=truncation,
         nsamples=nsamples,
         gradient_accumulate_steps=gradient_accumulate_steps,
         quant_nontext_module=quant_nontext_module,
     )
-    return dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen
+    return dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index 5fb93a711fe..dc1da4676e5 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -590,43 +590,38 @@ def is_valid_digit(s):
     return device
 
 
-def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):  # pragma: no cover
-    """Runs a model on a provided dataset with automatic device detection for vector-language models.
+def find_matching_blocks(model, all_blocks, to_quant_block_names=None):
+    """Find and return matching blocks in the model based on to_quant_block_names.
 
     Args:
-        model: The model to run.
-        dataloader: A PyTorch dataloader providing the input data for the model.
-        seqlen (int, optional): The minimum sequence length of input data to process. Defaults to 512.
-        nsamples (int, optional): The number of samples to process before stopping. Defaults to 512.
+        model: The model (not used in this specific function but kept for completeness).
+        all_blocks: List of lists, where each inner list contains full block names in the model.
+        to_quant_block_names: Comma-separated regex patterns (str), or a list/tuple that is returned as-is.
 
     Returns:
-        None
+        target_blocks: List of lists containing full paths of matching blocks in the model.
+
+    Raises:
+        ValueError: If no block name matches any of the given patterns.
     """
-    device = model.orig_model.device
-    total_cnt = 0
-    for org_data in dataloader:
-        if isinstance(org_data, torch.Tensor):
-            input_ids = org_data.to(device)
-            data = input_ids
-        elif isinstance(org_data, tuple) or isinstance(org_data, list):
-            data = org_data
-            input_ids = data[0]
-        else:
-            data = {}
-            for key in org_data.keys():
-                data[key] = to_device(org_data[key], device)
-                if key == "images":
-                    data[key] = to_dtype(org_data[key], model.orig_model.dtype)
-            input_ids = data["input_ids"]
-        if input_ids.shape[-1] < seqlen:
-            continue
-
-        if isinstance(data, tuple) or isinstance(data, list):
-            model(*data)
-        elif isinstance(data, dict):
-            model(**data)
-        else:
-            model(data)
-        total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
-        if total_cnt >= nsamples:
-            break
+    import re
+    if not to_quant_block_names:
+        return all_blocks
+    if isinstance(to_quant_block_names, (list, tuple)):
+        return to_quant_block_names
+    if isinstance(to_quant_block_names, str):
+        to_quant_block_names = [name.strip() for name in to_quant_block_names.split(",")]
+    target_blocks = []
+    for block_list in all_blocks:
+        matched_sublist = []
+        for name in to_quant_block_names:
+            matched_sublist.extend(block for block in block_list if re.search(name, block))
+        if matched_sublist:
+            target_blocks.append(matched_sublist)
+    # Checked once, after every group was scanned: a group with no match must not abort the search.
+    if not target_blocks:
+        raise ValueError(
+            "No block names matched. Please check the input for to_quant_block_name,"
+            " or set to_quant_block_name to None to automatically match quantizable blocks."
+        )
+    return target_blocks
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 55e0c7f0c16..89be63940a0 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -168,7 +168,6 @@ def test_utils(self):
             detect_device,
             get_layer_names_in_block,
             get_multimodal_block_names,
-            run_fn_for_vlm_autoround,
         )
 
         fp32_model = copy.deepcopy(self.gptj)
@@ -183,7 +182,7 @@ def test_utils(self):
         fp32_model.to(device)
         # quantizer execute
         model = prepare(model=fp32_model, quant_config=quant_config)
-        run_fn_for_vlm_autoround(model, self.dataloader, seqlen=32, nsamples=8)
+        run_fn(model, self.dataloader)
         q_model = convert(model)
         out = q_model(self.inp)[0]
         assert torch.allclose(out, self.label, atol=1e-1)
@@ -199,15 +198,15 @@ def test_mllm(self):
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
         model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
-        dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen = get_mllm_dataloader(
+        dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples = get_mllm_dataloader(
             template=None,
             model=model,
             tokenizer=tokenizer,
             image_processor=None,
             dataset="liuhaotian/llava_conv_58k",
             extra_data_dir=None,
-            seqlen=2048,
-            bs=1,
+            seqlen=512,
+            batch_size=1,
             split=None,
             apply_template=None,
             truncation=False,
@@ -233,7 +232,7 @@ def test_mllm(self):
         model = prepare(model=model, quant_config=quant_config)
         run_fn(model, dataloader)
         q_model = convert(model)
-        assert isinstance(q_model.visual.blocks[0].attn.qkv, WeightOnlyLinear), "model quantization failed."
+        assert isinstance(q_model.model.layers[0].mlp.up_proj, WeightOnlyLinear), "model quantization failed."
 
     # def test_autoround_format_export(self):
     #     from neural_compressor.torch.quantization import load