From 34a0e96d463d37cf85cee9c2cd01397034e97573 Mon Sep 17 00:00:00 2001 From: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com> Date: Fri, 30 Aug 2024 11:11:39 +0700 Subject: [PATCH 01/51] [Kernel] changing fused moe kernel chunk size default to 32k (#7995) --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 5906984163295..30320af5fa43a 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -352,7 +352,7 @@ def get_default_config_root(): os.path.join(get_default_cache_root(), "vllm", "xla_cache"), )), "VLLM_FUSED_MOE_CHUNK_SIZE": - lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")), + lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), # If set, vllm will skip the deprecation warnings. "VLLM_NO_DEPRECATION_WARNING": From dc13e993484cf23c337e93cac9b28e7195dbbbed Mon Sep 17 00:00:00 2001 From: Yohan Na Date: Fri, 30 Aug 2024 15:34:20 +0900 Subject: [PATCH 02/51] [MODEL] add Exaone model support (#7819) --- docs/source/models/supported_models.rst | 4 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/exaone.py | 617 ++++++++++++++++++++ vllm/transformers_utils/config.py | 11 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/exaone.py | 190 ++++++ 6 files changed, 820 insertions(+), 5 deletions(-) create mode 100644 vllm/model_executor/models/exaone.py create mode 100644 vllm/transformers_utils/configs/exaone.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 223c68b40766e..f727c646b7da7 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -51,6 +51,10 @@ Decoder-only Language Models - DeciLM - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. - + * - :code:`ExaoneForCausalLM` + - EXAONE-3 + - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. + - ✅︎ * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 8591c276b0013..fc3d4922aea09 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -22,6 +22,7 @@ "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), + "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py new file mode 100644 index 0000000000000..351bc7e67ca05 --- /dev/null +++ b/vllm/model_executor/models/exaone.py @@ -0,0 +1,617 @@ +# coding=utf-8 +# Adapted from +# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py +# Copyright 2024 The LG U+ CTO AI Tech Lab. +# Copyright 2021 The LG AI Research EXAONE Lab +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Exaone model compatible with HuggingFace weights.""" + +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + get_compressed_tensors_cache_scale) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.transformers_utils.configs.exaone import ExaoneConfig +from vllm.utils import is_hip + +from .interfaces import SupportsLoRA +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers + + +class ExaoneGatedMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.c_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.c_proj", + ) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.c_proj(x) + return x + + +class ExaoneAttention(nn.Module): + + def __init__( + self, + config: ExaoneConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr(config, "head_dim", + self.hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.out_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + + is_neox_style = True + if quant_config is not None and quant_config.get_name() == "gguf": + is_neox_style = False + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=is_neox_style, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.out_proj(attn_output) + return output + + +class ExaoneBlockAttention(nn.Module): + + def __init__( + self, + config: ExaoneConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + 
self.attention = ExaoneAttention( + config=config, + hidden_size=hidden_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=bias, + cache_config=cache_config, + prefix=prefix, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + return self.attention( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + +class ExaoneDecoderLayer(nn.Module): + + def __init__( + self, + config: ExaoneConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + self.attn = ExaoneBlockAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.attn", + ) + self.mlp = ExaoneGatedMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.activation_function, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + else: + hidden_states, residual = self.ln_1(hidden_states, residual) + hidden_states = self.attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ln_2(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class ExaoneModel(nn.Module): + + def __init__( + self, + config: ExaoneConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.wte = config.vocab_size + 
if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.wte = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + quant_config=quant_config, + ) + else: + self.wte = PPMissingLayer() + self.start_layer, self.end_layer, self.h = make_layers( + config.num_hidden_layers, + lambda prefix: ExaoneDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.h", + ) + if get_pp_group().is_last_rank: + self.ln_f = RMSNorm(config.hidden_size, + eps=config.layer_norm_epsilon) + else: + self.ln_f = PPMissingLayer() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.h[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.ln_f(hidden_states, residual) + return hidden_states + + +class ExaoneForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "c_fc_0", + "c_fc_1", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "out_proj", + "gate_up_proj", + "c_proj", + "wte", + "lm_head", + ] + embedding_modules = { + "wte": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "c_fc_0": ("gate_up_proj", 0), + "c_fc_1": ("gate_up_proj", 1), + } + + def __init__( + self, + config: ExaoneConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + + self.config = config + self.lora_config = lora_config + + self.transformer = ExaoneModel( + config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model", + ) + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.transformer.wte.weight + + 
logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + self.sampler = Sampler() + else: + self.lm_head = PPMissingLayer() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.transformer(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) + return model_output + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros( + (batch_size, self.config.hidden_size), + dtype=dtype, + device=device, + ), + "residual": + torch.zeros( + (batch_size, self.config.hidden_size), + dtype=dtype, + device=device, + ), + }) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".c_fc_0", 0), + (".gate_up_proj", ".c_fc_1", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + # With tie_word_embeddings, we can skip lm_head.weight + # The weight might appear unnecessarily in the files if the model is + # processed with quantization, LoRA, fine-tuning, etc. + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue + if scale_name := get_compressed_tensors_cache_scale(name): + # Loading kv cache scales for compressed-tensors quantization + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = loaded_weight[0] + weight_loader(param, loaded_weight) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + # If this function is called, it should always initialize KV cache scale + # factors (or else raise an exception). Thus, handled exceptions should + # make sure to leave KV cache scale factors in a known good (dummy) state + def load_kv_cache_scales(self, quantization_param_path: str) -> None: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + for layer_idx, scaling_factor in kv_cache_scales_loader( + quantization_param_path, + tp_rank, + tp_size, + self.config.num_hidden_layers, + self.config.__class__.model_type, + ): + if not isinstance(self.transformer.h[layer_idx], nn.Identity): + layer_self_attn = self.transformer.h[layer_idx].attn + + if is_hip(): + # The scaling factor convention we are assuming is + # quantized_value * scaling_factor ~= true_value + # which is consistent with the practice of setting + # scaling_factor = tensor_amax / FPtype_max + scaling_factor *= 2 + if hasattr(layer_self_attn, "kv_scale"): + layer_self_attn.attn._kv_scale = scaling_factor + else: + raise RuntimeError("Self attention has no KV cache scaling " + "factor attribute!") diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index c2276b075c1dd..4a03446590fe5 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -11,11 +11,11 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - EAGLEConfig, InternVLChatConfig, - JAISConfig, MedusaConfig, - MLPSpeculatorConfig, MPTConfig, - NemotronConfig, RWConfig, - UltravoxConfig) + EAGLEConfig, ExaoneConfig, + InternVLChatConfig, JAISConfig, + MedusaConfig, MLPSpeculatorConfig, + MPTConfig, NemotronConfig, + RWConfig, UltravoxConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -34,6 +34,7 @@ "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, "eagle": EAGLEConfig, + "exaone": ExaoneConfig, "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, "ultravox": UltravoxConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index dc2fd6a859e3c..736878b35ad49 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,6 +1,7 @@ from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.dbrx import DbrxConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig +from vllm.transformers_utils.configs.exaone import ExaoneConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
@@ -22,6 +23,7 @@ "JAISConfig", "MedusaConfig", "EAGLEConfig", + "ExaoneConfig", "MLPSpeculatorConfig", "NemotronConfig", "UltravoxConfig", diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py new file mode 100644 index 0000000000000..805b8ad930039 --- /dev/null +++ b/vllm/transformers_utils/configs/exaone.py @@ -0,0 +1,190 @@ +# coding=utf-8 +# Copied from +# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py +# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Exaone model configuration""" + +from typing import Dict + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {} + + +class ExaoneConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class: + `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the Exaone + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` + and can be used to control the model outputs. Read the documentation from : + class:`~transformers.PretrainedConfig` for more information. + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50257): + Vocabulary size of the GPT Lingvo model. Defines the number of + different tokens that can be represented by the :obj:`inputs_ids` + passed when calling :class:`~transformers.ExaoneModel`. Vocabulary + size of the model. + Defines the different tokens that can be represented by the + `inputs_ids` passed to the forward method of :class: + `~transformers.EXAONEModel`. + hidden_size (:obj:`int`, `optional`, defaults to 2048): + Dimensionality of the encoder layers and the pooler layer. + num_layers (:obj:`int`, `optional`, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the + Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to + implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi + Head Attention (MHA), if `num_key_value_heads=1 the model will use + Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, + each group key and value head should be constructed by meanpooling + all the original heads within that group. For more details checkout + [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not + specified, will default to `num_attention_heads`. 
+ rotary_pct (`float`, *optional*, defaults to 0.25): + percentage of hidden dimensions to allocate to rotary embeddings + intermediate_size (:obj:`int`, `optional`, defaults to 8192): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in + the Transformer encoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, + defaults to :obj:`"gelu_new"`): + The non-linear activation function (function or string) in the + encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, + :obj:`"selu"` and :obj:`"gelu_new"` are supported. + embed_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the + embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 2048): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling + :class:`~transformers.EXAONEModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): + The epsilon used by the layer normalization layers. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values + attentions (not used by all models). + Only relevant if ``config.is_decoder=True``. + gradient_checkpointing (:obj:`bool`, `optional`, + defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense + of slower backward pass. 
+ Example:: + + >>> from transformers import ExoneModel, ExaoneConfig + + >>> # Initializing a EXAONE configuration + >>> configuration = ExaoneConfig() + + >>> # Initializing a model from configuration + >>> model = ExoneModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + + model_type = "exaone" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_hidden_layers": "num_layers"} + + def __init__( + self, + vocab_size=102400, + max_position_embeddings=2048, + hidden_size=2048, + num_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + intermediate_size=None, + activation_function="silu", + rotary_pct=0.25, + resid_dropout=0.0, + embed_dropout=0.0, + attention_dropout=0.0, + layer_norm_epsilon=1e-6, + initializer_range=0.02, + use_cache=True, + bos_token_id=0, + eos_token_id=2, + tie_word_embeddings=True, + **kwargs, + ): + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_layers = num_layers + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = num_layers + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + if intermediate_size: + self.intermediate_size = intermediate_size + else: + self.intermediate_size = hidden_size * 4 + self.activation_function = activation_function + self.resid_dropout = resid_dropout + self.embed_dropout = embed_dropout + self.attention_dropout = attention_dropout + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.rotary_pct = rotary_pct + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + self.use_logit_cap = kwargs.pop("use_logit_cap", False) + self.ln_no_scale = kwargs.pop("ln_no_scale", False) + self.use_gated = kwargs.pop("use_gated", False) + self.use_emb_norm = kwargs.pop("use_emb_norm", False) + self.use_rotary_pos = kwargs.pop("use_rotary_pos", False) + self.rotary_type = kwargs.pop("rotary_type", None) + self.scaling_factor = kwargs.pop("scaling_factor", 1) + self.use_absolute_pos = kwargs.pop("use_absolute_pos", True) + self.use_extra_logit = kwargs.pop("use_extra_logit", True) + self.rotary_expand_length = kwargs.pop("rotary_expand_length", None) + self.rotary_base = kwargs.pop("rotary_base", 10000.0) + self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False) + self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head", + (rotary_pct == 0.25)) + if self.use_rotary_pos: + self.use_absolute_pos = False From 2148441fd371faf3e90748b310fdb4500939e527 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Fri, 30 Aug 2024 00:27:40 -0700 Subject: [PATCH 03/51] [TPU] Support single and multi-host TPUs on GKE (#7613) --- requirements-tpu.txt | 2 +- vllm/attention/backends/pallas.py | 5 +++- .../device_communicators/tpu_communicator.py | 27 +++++++++++++++-- vllm/executor/ray_tpu_executor.py | 15 ++++++++++ vllm/executor/ray_utils.py | 29 +++++++++++++++++++ 5 files changed, 74 insertions(+), 4 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 5eb27b39eb623..4c606cf0a9105 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -4,4 +4,4 @@ # Dependencies for TPU # Currently, the TPU 
backend uses a nightly version of PyTorch XLA. # You can install the dependencies in Dockerfile.tpu. -ray +ray[default] diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index ac03b6d8b1ead..c324d62d44d79 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -123,7 +123,10 @@ def __init__( raise NotImplementedError("TPU version must be 4 or higher.") self.megacore_mode = None - tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower() + tpu_env = torch_xla.tpu.get_tpu_env() + tpu_type = tpu_env.get("TYPE") or tpu_env.get("ACCELERATOR_TYPE") + tpu_type = tpu_type.lower() + if "lite" not in tpu_type: if self.num_kv_heads % 2 == 0: self.megacore_mode = "kv_head" diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index 81a141e86206a..765a0f9cb1c87 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -1,3 +1,5 @@ +import os + import torch import torch.distributed as dist from torch.distributed import ProcessGroup @@ -5,11 +7,12 @@ from vllm.platforms import current_platform if current_platform.is_tpu(): - import ray import torch_xla.core.xla_model as xm import torch_xla.runtime as xr from torch_xla._internal import pjrt + from vllm.executor import ray_utils + class TpuCommunicator: @@ -24,9 +27,29 @@ def __init__(self, group: ProcessGroup): # be simply calculated as follows. global_rank = dist.get_rank(group) global_world_size = dist.get_world_size(group) - num_nodes = len(ray.nodes()) + + # Calculate how many TPU nodes are in the current deployment. This + # is the Ray placement group if it is deployed with Ray. Default + # to the number of TPU nodes in the Ray cluster. The number of TPU + # nodes is computed by the total number of TPUs divided by the + # number of TPU accelerators per node, to account for clusters + # with both CPUs and TPUs. + num_nodes = ray_utils.get_num_tpu_nodes() + num_nodes_in_pg = ray_utils.get_num_nodes_in_placement_group() + if num_nodes_in_pg > 0: + num_nodes = num_nodes_in_pg + local_world_size = global_world_size // num_nodes local_rank = global_rank % local_world_size + + # Ensure environment variables are set for multihost deployments. + # On GKE, this is needed for libtpu and TPU driver to know which TPU + # chip is actually visible. Otherwise the TPU driver will fail to + # initialize because the number of devices would be different from + # the number of visible worker addresses. + os.environ["CLOUD_TPU_TASK_ID"] = str(global_rank) + os.environ["TPU_VISIBLE_CHIPS"] = str(local_rank) + pjrt.initialize_multiprocess(local_rank, local_world_size) xr._init_world_size_ordinal() diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py index 2a1fd35b65797..8f867b1d647a5 100644 --- a/vllm/executor/ray_tpu_executor.py +++ b/vllm/executor/ray_tpu_executor.py @@ -71,6 +71,19 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", worker_module_name = "vllm.worker.tpu_worker" worker_class_name = "TPUWorker" + # GKE does not fetch environment information from metadata server + # and instead sets these from within the Ray process. Therefore we + # need to override the Ray environment variables manually. 
+ override_env = {} + if "TPU_CHIPS_PER_HOST_BOUNDS" in os.environ: + override_env.update({ + "TPU_CHIPS_PER_HOST_BOUNDS": + os.environ["TPU_CHIPS_PER_HOST_BOUNDS"] + }) + if "TPU_HOST_BOUNDS" in os.environ: + override_env.update( + {"TPU_HOST_BOUNDS": os.environ["TPU_HOST_BOUNDS"]}) + worker = ray.remote( num_cpus=0, resources={"TPU": 1}, @@ -81,6 +94,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", worker_class_name=worker_class_name, trust_remote_code=self.model_config.trust_remote_code, ) + if override_env: + worker.override_env_vars.remote(override_env) worker_ip = ray.get(worker.get_node_ip.remote()) if worker_ip == driver_ip and self.driver_dummy_worker is None: diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index bfdd0f5cf97b3..59e9854393b6b 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,3 +1,4 @@ +import os import time from collections import defaultdict from typing import Dict, List, Optional, Tuple, Union @@ -84,6 +85,9 @@ def execute_model_spmd( return output + def override_env_vars(self, vars: Dict[str, str]): + os.environ.update(vars) + ray_import_err = None except ImportError as e: @@ -291,3 +295,28 @@ def initialize_ray_cluster( _verify_bundles(current_placement_group, parallel_config, device_str) # Set the placement group in the parallel config parallel_config.placement_group = current_placement_group + + +def get_num_tpu_nodes() -> int: + from ray._private.accelerators import TPUAcceleratorManager + cluster_resources = ray.cluster_resources() + total_tpus = int(cluster_resources["TPU"]) + tpus_per_node = TPUAcceleratorManager.get_current_node_num_accelerators() + assert total_tpus % tpus_per_node == 0 + return total_tpus // tpus_per_node + + +def get_num_nodes_in_placement_group() -> int: + pg_table = ray.util.placement_group_table() + current_pg = ray.util.get_current_placement_group() + num_nodes = 0 + + if current_pg: + nodes_in_pg = set() + for pg_key, pg in pg_table.items(): + if pg_key == current_pg.id.hex(): + for _, node in pg["bundles_to_node_id"].items(): + nodes_in_pg.add(node) + num_nodes = len(nodes_in_pg) + + return num_nodes From afd39a4511111aa05fd58834191d46328aed5a27 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 30 Aug 2024 23:03:28 +0800 Subject: [PATCH 04/51] [Bugfix] Fix import error in Exaone model (#8034) --- vllm/model_executor/models/exaone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 351bc7e67ca05..4a1c367de3f62 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -43,13 +43,13 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.exaone import ExaoneConfig from vllm.utils import is_hip From 
f97be32d1da4cfda933a0dbfbc681861f96390d9 Mon Sep 17 00:00:00 2001 From: Jungho Christopher Cho Date: Sat, 31 Aug 2024 00:19:27 +0900 Subject: [PATCH 05/51] [VLM][Model] TP support for ViTs (#7186) Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Roger Wang --- tests/models/test_intern_vit.py | 3 +- tests/models/test_internvl.py | 63 ++++--- vllm/model_executor/models/blip.py | 79 ++++++++- vllm/model_executor/models/blip2.py | 3 +- vllm/model_executor/models/clip.py | 105 ++++++++++- vllm/model_executor/models/intern_vit.py | 64 +++++-- vllm/model_executor/models/paligemma.py | 48 +++--- vllm/model_executor/models/phi3v.py | 53 ++++-- vllm/model_executor/models/siglip.py | 211 ++++------------------- 9 files changed, 340 insertions(+), 289 deletions(-) diff --git a/tests/models/test_intern_vit.py b/tests/models/test_intern_vit.py index e980446ff3570..816f846f69bae 100644 --- a/tests/models/test_intern_vit.py +++ b/tests/models/test_intern_vit.py @@ -6,8 +6,6 @@ from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModel, CLIPImageProcessor -from vllm.model_executor.models.intern_vit import InternVisionModel - from ..conftest import _ImageAssets, cleanup pytestmark = pytest.mark.vlm @@ -49,6 +47,7 @@ def run_intern_vit_test( for pixel_value in pixel_values ] + from vllm.model_executor.models.intern_vit import InternVisionModel vllm_model = InternVisionModel(config) vllm_model.load_weights(hf_model.state_dict().items()) diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py index 243bc857c88de..42732cebc6567 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/test_internvl.py @@ -6,9 +6,6 @@ from PIL.Image import Image from transformers import AutoConfig -from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END, - IMG_START, - image_to_pixel_values) from vllm.multimodal.utils import rescale_image_size from vllm.utils import is_cpu @@ -33,35 +30,6 @@ ] -class InternVLProcessor: - """A simple processor for InternVL2 HF model which misses a processor.""" - - def __init__(self, hf_runner: HfRunner): - self.num_image_token = hf_runner.model.num_image_token - self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype - - self.config = AutoConfig.from_pretrained(hf_runner.model_name) - self.vision_config = self.config.vision_config - self.use_thumbnail = self.config.use_thumbnail - self.min_num = self.config.min_dynamic_patch - self.max_num = self.config.max_dynamic_patch - self.image_size = self.vision_config.image_size - - def __call__(self, text: str, images: Image, **kwargs): - pixel_values = image_to_pixel_values(images, self.image_size, - self.min_num, self.max_num, - self.use_thumbnail).to(self.dtype) - num_patches_list = [pixel_values.shape[0]] - for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token * num_patches - image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('', image_tokens, 1) - prompt = self.tokenizer(text, return_tensors="pt") - prompt.update({"pixel_values": pixel_values}) - return prompt - - # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py def generate( self, @@ -127,6 +95,37 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
+ class InternVLProcessor: + """A simple processor for InternVL2 which misses a processor.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + self.dtype = hf_runner.model.dtype + + self.config = AutoConfig.from_pretrained(hf_runner.model_name) + self.vision_config = self.config.vision_config + self.use_thumbnail = self.config.use_thumbnail + self.min_num = self.config.min_dynamic_patch + self.max_num = self.config.max_dynamic_patch + self.image_size = self.vision_config.image_size + + def __call__(self, text: str, images: Image, **kwargs): + from vllm.model_executor.models.internvl import ( + IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + pixel_values = image_to_pixel_values( + images, self.image_size, self.min_num, self.max_num, + self.use_thumbnail).to(self.dtype) + num_patches_list = [pixel_values.shape[0]] + for num_patches in num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token \ + * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + # max_model_len should be greater than image_feature_size with vllm_runner(model, max_model_len=4096, diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 830680fd990bf..e6acf8cd5d5bb 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -7,12 +7,14 @@ import torch.nn as nn from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig -from transformers.models.blip.modeling_blip import BlipAttention +from xformers import ops as xops from vllm.config import ModelConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal.utils import (cached_get_tokenizer, @@ -154,6 +156,77 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings +class BlipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + "embed_dim must be divisible by num_heads " + f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads}).") + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + ) + self.projection = RowParallelLinear( + self.embed_dim, + self.embed_dim, + quant_config=quant_config, + ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() 
+ + def forward( + self, + hidden_states: torch.Tensor, + ): + """Input shape: Batch x Time x Channel""" + bsz, tgt_len, _ = hidden_states.size() + + qkv_states, _ = self.qkv(hidden_states) + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + query_states = query_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.view(bsz, tgt_len, -1) + attn_output, _ = self.projection(out) + + return attn_output + + class BlipMLP(nn.Module): def __init__(self, @@ -188,7 +261,7 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.self_attn = BlipAttention(config) + self.self_attn = BlipAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = BlipMLP(config, quant_config=quant_config) @@ -199,7 +272,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 0ed46f39cacd9..39f2b2d853a6b 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -714,8 +714,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): use_default_weight_loading = False if "vision" in name: if self.vision_model is not None: - # We only do sharding for language model and - # not vision model for now. 
+ # BlipVisionModel does not need sharding use_default_weight_loading = True else: for (param_name, weight_name, diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 69bb9f6f3afee..ddfec91d6cab2 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -7,12 +7,14 @@ import torch.nn as nn from PIL import Image from transformers import CLIPVisionConfig -from transformers.models.clip.modeling_clip import CLIPAttention +from xformers import ops as xops from vllm.config import ModelConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -160,6 +162,78 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings +class CLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + "embed_dim must be divisible by num_heads " + f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads}).") + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.qkv_proj = QKVParallelLinear( + hidden_size=self.embed_dim, + head_size=self.head_dim, + total_num_heads=self.num_heads, + quant_config=quant_config, + ) + + self.out_proj = RowParallelLinear( + input_size=self.embed_dim, + output_size=self.embed_dim, + quant_config=quant_config, + ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + ): + """Input shape: Batch x Time x Channel""" + bsz, tgt_len, _ = hidden_states.size() + + qkv_states, _ = self.qkv_proj(hidden_states) + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + + query_states = query_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.view(bsz, tgt_len, -1) + attn_output, _ = self.out_proj(out) + + return attn_output + + class CLIPMLP(nn.Module): def __init__(self, @@ -192,7 +266,7 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.self_attn = CLIPAttention(config) + self.self_attn = CLIPAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = CLIPMLP(config, quant_config=quant_config) @@ -204,7 
+278,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states @@ -304,7 +378,15 @@ def forward(self, pixel_values: Optional[torch.Tensor] = None): def device(self): return next(self.parameters()).device + # (TODO) Add prefix argument for filtering out weights to be loaded + # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] params_dict = dict(self.named_parameters()) layer_count = len(self.vision_model.encoder.layers) @@ -318,7 +400,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if layer_idx >= layer_count: continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 54c933e3e4959..ad5919150cad8 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -10,10 +10,13 @@ import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig +from xformers import ops as xops +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -81,7 +84,11 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class InternAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: PretrainedConfig): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -94,9 +101,13 @@ def __init__(self, config: PretrainedConfig): f' {self.num_heads}).') self.scale = self.head_dim**-0.5 - self.qkv = nn.Linear(self.embed_dim, - 3 * self.embed_dim, - bias=config.qkv_bias) + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + ) self.qk_normalization = config.qk_normalization @@ -104,25 +115,40 @@ def __init__(self, config: PretrainedConfig): self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) - self.proj = nn.Linear(self.embed_dim, 
self.embed_dim) + self.proj = RowParallelLinear( + self.embed_dim, + self.embed_dim, + quant_config=quant_config, + ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def forward(self, x): B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv.unbind(0) - - if self.qk_normalization: - B_, H_, N_, D_ = q.shape - q = self.q_norm.forward_native(q.transpose(1, 2).flatten( - -2, -1)).view(B_, N_, H_, D_).transpose(1, 2) - k = self.k_norm.forward_native(k.transpose(1, 2).flatten( - -2, -1)).view(B_, N_, H_, D_).transpose(1, 2) + qkv, _ = self.qkv(x) + q, k, v = qkv.chunk(3, dim=-1) - x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) - x = x.transpose(1, 2).reshape(B, N, C) + q = q.view(B, N, self.num_heads_per_partition, self.head_dim) + k = k.view(B, N, self.num_heads_per_partition, self.head_dim) + v = v.view(B, N, self.num_heads_per_partition, self.head_dim) - x = self.proj(x) + if self.qk_normalization: + B_, N_, H_, D_ = q.shape + q = self.q_norm.forward_native(q.flatten(-2, + -1)).view(B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten(-2, + -1)).view(B_, N_, H_, D_) + + x = xops.memory_efficient_attention_forward( + q, + k, + v, + scale=self.scale, + ) + x = x.view(B, N, -1) + + x, _ = self.proj(x) return x @@ -161,7 +187,7 @@ def __init__(self, self.intermediate_size = config.intermediate_size self.norm_type = config.norm_type - self.attn = InternAttention(config) + self.attn = InternAttention(config, quant_config=quant_config) self.mlp = InternMLP(config, quant_config=quant_config) self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 104b89e06fa5f..9b29ff69808a6 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -145,7 +145,6 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config - # TODO(ywang96): Port over SiglipVisionModel & TP self.vision_tower = SiglipVisionModel(config.vision_config) self.multi_modal_projector = PaliGemmaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, @@ -308,34 +307,27 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - if "vision" in name: - if self.vision_tower is not None: - # We only do sharding for language model and - # not vision model for now. - use_default_weight_loading = True + for (param_name, shard_name, shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break else: - for (param_name, shard_name, - shard_id) in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # lm_head is not used in vllm as it is tied with - # embed_token. 
To prevent errors, skip loading - # lm_head.weight. - if "lm_head.weight" in name: - continue - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - use_default_weight_loading = True + # lm_head is not used in vllm as it is tied with + # embed_token. To prevent errors, skip loading + # lm_head.weight. + if "lm_head.weight" in name: + continue + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + use_default_weight_loading = True if use_default_weight_loading: param = params_dict[name] diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 2fad3ec3e5651..c449e0fc759a3 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -71,6 +71,23 @@ projection_dim=768) +def _init_img_processor(hf_config: PretrainedConfig): + clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG + layer_idx = hf_config.img_processor.get('layer_idx', -2) + + # Initialize the CLIP only up to the required feature layer + if layer_idx < 0: + num_hidden_layers = clip_config.num_hidden_layers + \ + layer_idx + 1 + else: + num_hidden_layers = layer_idx + 1 + + img_processor = CLIPVisionModel( + clip_config, num_hidden_layers_override=num_hidden_layers) + + return img_processor + + class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: Union[torch.Tensor, List[torch.Tensor]] @@ -139,18 +156,8 @@ def __init__(self, config: PretrainedConfig) -> None: hidden_size = config.n_embd if hasattr( config, 'n_embd') else config.hidden_size - clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG - self.layer_idx = config.img_processor.get('layer_idx', -2) - - # Initialize the CLIP only up to the required feature layer - if self.layer_idx < 0: - num_hidden_layers = clip_config.num_hidden_layers + \ - self.layer_idx + 1 - else: - num_hidden_layers = self.layer_idx + 1 + self.img_processor = _init_img_processor(config) - self.img_processor = CLIPVisionModel( - clip_config, num_hidden_layers_override=num_hidden_layers) image_dim_out = config.img_processor['image_dim_out'] self.num_img_tokens = config.img_processor['num_img_tokens'] @@ -656,23 +663,27 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".gate_proj", 0), (".gate_up_proj", ".up_proj", 1), ] + + # TODO(ChristopherCho): This is a temporary fix to load + # the vision weights with CLIPVisionModel.load_weights() + vision_weights = [] params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - # post_layernorm is not needed in CLIPVisionModel - if "vision_model.post_layernorm" in name: + # Skip loading the img_processor weights since they are + # loaded separately. + if "vision_embed_tokens.img_processor" in name: + vision_weights.append((name, loaded_weight)) continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) for (param_name, weight_name, shard_id) in stacked_params_mapping: - # We only do sharding for language model - # and not vision model for now. 
- if "vision_embed_tokens" in name and self.vision_embed_tokens: - continue if weight_name not in name: continue + param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -686,3 +697,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + + # We use regex to extract the sub-module name + # from "model.vision_embed_tokens.img_processor.*" + vision_weights = [ + (re.search(r"vision_embed_tokens\.img_processor\.(.*)", + n).group(1), w) for n, w in vision_weights + ] + self.vision_embed_tokens.img_processor.load_weights(vision_weights) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 073f60bb3a056..e6f95af0ff49f 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -9,12 +9,10 @@ from PIL import Image from torch import nn from transformers import SiglipVisionConfig -from transformers.models.siglip.modeling_siglip import SiglipAttention -from vllm_flash_attn import flash_attn_func -from xformers.ops import memory_efficient_attention +from xformers import ops as xops from vllm.config import ModelConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -221,9 +219,7 @@ def forward(self, return embeddings -# NOTE: Not used - kept for later when we TP the ViT -# TODO(ChristopherCho): Implement TP version of Attention -class SiglipTPAttention(nn.Module): +class SiglipAttention(nn.Module): def __init__( self, @@ -233,38 +229,30 @@ def __init__( super().__init__() self.config = config self.embed_dim = config.hidden_size - - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = config.num_attention_heads - if self.total_num_heads % tp_size != 0: - raise ValueError( - f"Number of attention heads ({self.total_num_heads}) " - "must be divisible by the tensor model parallel size" - f" ({tp_size}).") - - self.num_heads = self.total_num_heads // tp_size - self.head_dim = self.embed_dim // self.total_num_heads - if self.head_dim * self.total_num_heads != self.embed_dim: + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: raise ValueError(f"embed_dim must be divisible by num_heads (got " "`embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") - self.qkv_size = self.num_heads * self.head_dim + self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout - self.qkv_proj = QKVParallelLinear( hidden_size=self.embed_dim, head_size=self.head_dim, - total_num_heads=self.total_num_heads, + total_num_heads=self.num_heads, quant_config=quant_config, ) + self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, ) - self.attn_fn = self._basic_attention_forward + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def forward( self, @@ -274,163 +262,29 @@ def forward( batch_size, q_len, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) - query_states, key_states, value_states 
= qkv_states.split( - [self.qkv_size] * 3, dim=-1) - - attn_output = self.attn_fn( - q=query_states, - k=key_states, - v=value_states, - batch_size=batch_size, - q_len=q_len, - ) - - attn_output, _ = self.out_proj(attn_output) - return attn_output - - def _basic_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - k = k.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - v = v.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - k_v_seq_len = k.shape[-2] - attn_weights = torch.matmul(q, k.transpose(2, 3)) * self.scale - - if attn_weights.size() != ( - batch_size, - self.num_heads, - q_len, - k_v_seq_len, - ): - raise ValueError( - "Attention weights should be of size " - f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}") - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to(q.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.dropout, - training=self.training) - attn_output = torch.matmul(attn_weights, v) - - if attn_output.size() != ( - batch_size, - self.num_heads, - q_len, - self.head_dim, - ): - raise ValueError( - "`attn_output` should be of size " - f"{(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -# TODO(ChristopherCho): flash_attn_func is not working properly. -# It constantly throws a CUDA error. -class SiglipFlashAttention2(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.attn_fn = self._flash_attention_forward - - # Ported from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L449 - # and https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/modeling_flash_attention_utils.py#L133 - def _flash_attention_forward(self, q, k, v, batch_size, q_len, *args, - **kwargs): - """Implements the multihead softmax attention. - Arguments - --------- - q, k, v: The tensor containing the - query, key, and value. 
(B, S, H, D) - """ - - q = q.view(batch_size, q_len, self.num_heads, self.head_dim) - k = k.view(batch_size, q_len, self.num_heads, self.head_dim) - v = v.view(batch_size, q_len, self.num_heads, self.head_dim) - - attn_output = flash_attn_func( - q, - k, - v, - dropout_p=self.dropout, - causal=False, - ) - - attn_output = attn_output.reshape(batch_size, q_len, - self.embed_dim).contiguous() + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + + query_states = query_states.view(batch_size, q_len, + self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(batch_size, q_len, + self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(batch_size, q_len, + self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.view(batch_size, q_len, -1) + attn_output, _ = self.out_proj(out) return attn_output -# NOTE: Not used - kept for later when we TP the ViT -class SiglipSdpaAttention(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.is_causal = False - self.attn_fn = self._sdpa_attention_forward - - def _sdpa_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - k = k.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - v = v.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - attn_output = torch.nn.functional.scaled_dot_product_attention( - q, k, v, dropout_p=self.dropout, is_causal=False, scale=self.scale) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(batch_size, q_len, self.embed_dim) - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -class SiglipxFormersAttention(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.attn_fn = self._xformers_attention_forward - - def _xformers_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, self.num_heads, self.head_dim) - k = k.view(batch_size, q_len, self.num_heads, self.head_dim) - v = v.view(batch_size, q_len, self.num_heads, self.head_dim) - - attn_output = memory_efficient_attention(q, - k, - v, - p=0.0, - scale=self.scale) - attn_output = attn_output.reshape(batch_size, q_len, - self.embed_dim).contiguous() - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -SIGLIP_ATTENTION_CLASSES = { - "eager": SiglipTPAttention, - "flash_attention_2": SiglipFlashAttention2, - "sdpa": SiglipSdpaAttention, - "xformers": SiglipxFormersAttention, -} - - class SiglipMLP(nn.Module): def __init__( @@ -473,8 +327,7 @@ def __init__( super().__init__() self.embed_dim = config.hidden_size - # TODO(ChristopherCho): use TP'ed Attention block - self.self_attn = SiglipAttention(config) + self.self_attn = SiglipAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( @@ -491,7 +344,7 @@ def forward( residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states From 98cef6a2278750ce7578ee6d6ae91e53d01c77a5 Mon Sep 17 00:00:00 2001 From: 
Cyrus Leung Date: Fri, 30 Aug 2024 23:20:34 +0800 Subject: [PATCH 06/51] [Core] Increase default `max_num_batched_tokens` for multimodal models (#8028) --- vllm/config.py | 36 ++++++++++++++++++++++++++---------- vllm/engine/arg_utils.py | 1 + vllm/engine/llm_engine.py | 6 +++++- vllm/worker/utils.py | 2 +- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 7e0b75eceae5b..b84d91d402370 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -32,6 +32,7 @@ logger = init_logger(__name__) _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 +_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 4096 _PP_SUPPORTED_MODELS = [ "AquilaModel", @@ -571,6 +572,10 @@ def is_embedding_model(self) -> bool: """Extract the embedding model flag.""" return self.embedding_mode + @property + def is_multimodal_model(self) -> bool: + return self.multimodal_config is not None + class CacheConfig: """Configuration for the KV cache. @@ -947,25 +952,36 @@ def __init__(self, num_lookahead_slots: int = 0, delay_factor: float = 0.0, enable_chunked_prefill: bool = False, - embedding_mode: Optional[bool] = False, + embedding_mode: bool = False, + is_multimodal_model: bool = False, preemption_mode: Optional[str] = None, num_scheduler_steps: int = 1, send_delta_data: bool = False) -> None: - if max_num_batched_tokens is not None: - self.max_num_batched_tokens = max_num_batched_tokens - else: + if max_num_batched_tokens is None: if enable_chunked_prefill: # It is the values that have the best balance between ITL # and TTFT on A100. Note it is not optimized for throughput. - self.max_num_batched_tokens = 512 - elif embedding_mode: - # For embedding, choose specific value for higher throughput - self.max_num_batched_tokens = max( - max_model_len, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS) + max_num_batched_tokens = 512 else: # If max_model_len is too short, use 2048 as the default value # for higher throughput. 
- self.max_num_batched_tokens = max(max_model_len, 2048) + max_num_batched_tokens = max(max_model_len, 2048) + + if embedding_mode: + # For embedding, choose specific value for higher throughput + max_num_batched_tokens = max( + max_num_batched_tokens, + _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS, + ) + if is_multimodal_model: + # The value needs to be at least the number of multimodal tokens + max_num_batched_tokens = max( + max_num_batched_tokens, + _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + ) + + self.max_num_batched_tokens = max_num_batched_tokens + if enable_chunked_prefill: logger.info( "Chunked prefill is enabled with max_num_batched_tokens=%d.", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6e66198e203fc..d98f57bc2d353 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -921,6 +921,7 @@ def create_engine_config(self) -> EngineConfig: delay_factor=self.scheduler_delay_factor, enable_chunked_prefill=self.enable_chunked_prefill, embedding_mode=model_config.embedding_mode, + is_multimodal_model=model_config.is_multimodal_model, preemption_mode=self.preemption_mode, num_scheduler_steps=self.num_scheduler_steps, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index aa33933c668ed..1eab83f3b9889 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2019,7 +2019,7 @@ def _validate_model_inputs(self, inputs: Union[LLMInputs, if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") - if self.model_config.multimodal_config is not None: + if self.model_config.is_multimodal_model: max_prompt_len = self.model_config.max_model_len if len(prompt_ids) > max_prompt_len: @@ -2030,3 +2030,7 @@ def _validate_model_inputs(self, inputs: Union[LLMInputs, "number of text tokens plus multimodal tokens. 
For image " "inputs, the number of image tokens depends on the number " "of images, and possibly their aspect ratios as well.") + + # TODO: Find out how many placeholder tokens are there so we can + # check that chunked prefill does not truncate them + # max_batch_len = self.scheduler_config.max_num_batched_tokens diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index 79c48896469e8..d73023e8e1724 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -39,7 +39,7 @@ def assert_enc_dec_mr_supported_scenario( raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PP']) - if enc_dec_mr.model_config.multimodal_config is not None: + if enc_dec_mr.model_config.is_multimodal_model: raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_MM']) From 058344f89a6594b560e2bb4925daed3f373c3fbc Mon Sep 17 00:00:00 2001 From: Kaunil Dhruv Date: Fri, 30 Aug 2024 08:21:02 -0700 Subject: [PATCH 07/51] [Frontend]-config-cli-args (#7737) Co-authored-by: Cyrus Leung Co-authored-by: Kaunil Dhruv --- docs/requirements-docs.txt | 3 +- .../serving/openai_compatible_server.md | 26 +++++ requirements-common.txt | 1 + tests/data/test_config.yaml | 2 + tests/test_utils.py | 44 ++++++++ vllm/scripts.py | 9 ++ vllm/utils.py | 101 ++++++++++++++++++ 7 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 tests/data/test_config.yaml diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 95a9be7806633..c358e23b6a37a 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -11,5 +11,6 @@ pydantic >= 2.8 torch py-cpuinfo transformers -mistral_common >= 1.3.4 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args +mistral_common >= 1.3.4 +openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index a06c30d9c48c6..b2acde390083c 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -111,6 +111,32 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) :prog: vllm serve ``` +### Config file + +The `serve` module can also accept arguments from a config file in +`yaml` format. The arguments in the yaml must be specified using the +long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): + +For example: + +```yaml +# config.yaml + +host: "127.0.0.1" +port: 6379 +uvicorn-log-level: "info" +``` + +```bash +$ vllm serve SOME_MODEL --config config.yaml +``` +--- +**NOTE** +In case an argument is supplied using command line and the config file, the value from the commandline will take precedence. +The order of priorities is `command line > config file values > defaults`. + +--- + ## Tool calling in the chat completion API vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap. 
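The config-file support documented above comes down to one rule, also spelled out in the `_pull_args_from_config` helper added later in this patch: values read from the YAML file are spliced into the argument list before the explicit CLI flags, so argparse parses the CLI flags last and they win. Below is a minimal, self-contained sketch of that precedence (command line > config file > defaults) using plain `argparse` and PyYAML; the helper name `expand_config`, the demo flags, and the dashed YAML keys are assumptions for illustration only and are not part of vLLM's actual `FlexibleArgumentParser`.

```python
# Sketch only: mimics the documented "cli > config > defaults" precedence.
# Assumed config.yaml:
#   port: 6379
#   tensor-parallel-size: 2
import argparse
import sys
from typing import List

import yaml


def expand_config(args: List[str]) -> List[str]:
    """Replace `--config <file>` with the flattened key/value pairs from the
    YAML file, placed *before* the explicit CLI flags so that flags given on
    the command line are parsed later and therefore take precedence."""
    if "--config" not in args:
        return args
    idx = args.index("--config")
    with open(args[idx + 1]) as f:
        config = yaml.safe_load(f) or {}
    config_args: List[str] = []
    for key, value in config.items():
        config_args += [f"--{key}", str(value)]
    # config-derived args first, explicit CLI args afterwards -> CLI wins
    return config_args + args[:idx] + args[idx + 2:]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=8000)               # default
    parser.add_argument("--tensor-parallel-size", type=int, default=1)  # default
    ns = parser.parse_args(expand_config(sys.argv[1:]))
    # e.g. `python demo.py --config config.yaml --port 9000`
    # -> port=9000 (CLI override), tensor_parallel_size=2 (from YAML)
    print(ns.port, ns.tensor_parallel_size)
```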
diff --git a/requirements-common.txt b/requirements-common.txt index 61daf99819756..d7e10c7591a79 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -27,3 +27,4 @@ soundfile # Required for audio processing gguf == 0.9.1 importlib_metadata mistral_common >= 1.3.4 +pyyaml \ No newline at end of file diff --git a/tests/data/test_config.yaml b/tests/data/test_config.yaml new file mode 100644 index 0000000000000..20d499624de2e --- /dev/null +++ b/tests/data/test_config.yaml @@ -0,0 +1,2 @@ +port: 12312 +tensor_parallel_size: 2 diff --git a/tests/test_utils.py b/tests/test_utils.py index c157be1c08f81..c7cb663068c0f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -132,6 +132,16 @@ def parser(): return parser +@pytest.fixture +def parser_with_config(): + parser = FlexibleArgumentParser() + parser.add_argument('serve') + parser.add_argument('--config', type=str) + parser.add_argument('--port', type=int) + parser.add_argument('--tensor-parallel-size', type=int) + return parser + + def test_underscore_to_dash(parser): args = parser.parse_args(['--image_input_type', 'pixel_values']) assert args.image_input_type == 'pixel_values' @@ -176,3 +186,37 @@ def test_missing_required_argument(parser): parser.add_argument('--required-arg', required=True) with pytest.raises(SystemExit): parser.parse_args([]) + + +def test_cli_override_to_config(parser_with_config): + args = parser_with_config.parse_args([ + 'serve', '--config', './data/test_config.yaml', + '--tensor-parallel-size', '3' + ]) + assert args.tensor_parallel_size == 3 + args = parser_with_config.parse_args([ + 'serve', '--tensor-parallel-size', '3', '--config', + './data/test_config.yaml' + ]) + assert args.tensor_parallel_size == 3 + + +def test_config_args(parser_with_config): + args = parser_with_config.parse_args( + ['serve', '--config', './data/test_config.yaml']) + assert args.tensor_parallel_size == 2 + + +def test_config_file(parser_with_config): + with pytest.raises(FileNotFoundError): + parser_with_config.parse_args(['serve', '--config', 'test_config.yml']) + + with pytest.raises(ValueError): + parser_with_config.parse_args( + ['serve', '--config', './data/test_config.json']) + + with pytest.raises(ValueError): + parser_with_config.parse_args([ + 'serve', '--tensor-parallel-size', '3', '--config', '--batch-size', + '32' + ]) diff --git a/vllm/scripts.py b/vllm/scripts.py index a9ddfcf864133..e557961a335bf 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -125,6 +125,15 @@ def main(): serve_parser.add_argument("model_tag", type=str, help="The model tag to serve") + serve_parser.add_argument( + "--config", + type=str, + default='', + required=False, + help="Read CLI options from a config file." 
+ "Must be a YAML with the following options:" + "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server" + ) serve_parser = make_arg_parser(serve_parser) serve_parser.set_defaults(dispatch_function=serve) diff --git a/vllm/utils.py b/vllm/utils.py index dab8e5fe04359..657a3ecef696d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -25,6 +25,7 @@ import psutil import torch import torch.types +import yaml from packaging.version import Version from typing_extensions import ParamSpec, TypeIs, assert_never @@ -1093,6 +1094,9 @@ def parse_args(self, args=None, namespace=None): if args is None: args = sys.argv[1:] + if '--config' in args: + args = FlexibleArgumentParser._pull_args_from_config(args) + # Convert underscores to dashes and vice versa in argument names processed_args = [] for arg in args: @@ -1109,6 +1113,103 @@ def parse_args(self, args=None, namespace=None): return super().parse_args(processed_args, namespace) + @staticmethod + def _pull_args_from_config(args: List[str]) -> List[str]: + """Method to pull arguments specified in the config file + into the command-line args variable. + + The arguments in config file will be inserted between + the argument list. + + example: + ```yaml + port: 12323 + tensor-parallel-size: 4 + ``` + ```python + $: vllm {serve,chat,complete} "facebook/opt-12B" \ + --config config.yaml -tp 2 + $: args = [ + "serve,chat,complete", + "facebook/opt-12B", + '--config', 'config.yaml', + '-tp', '2' + ] + $: args = [ + "serve,chat,complete", + "facebook/opt-12B", + '--port', '12323', + '--tensor-parallel-size', '4', + '-tp', '2' + ] + ``` + + Please note how the config args are inserted after the sub command. + this way the order of priorities is maintained when these are args + parsed by super(). + """ + assert args.count( + '--config') <= 1, "More than one config file specified!" + + index = args.index('--config') + if index == len(args) - 1: + raise ValueError("No config file specified! \ + Please check your command-line arguments.") + + file_path = args[index + 1] + + config_args = FlexibleArgumentParser._load_config_file(file_path) + + # 0th index is for {serve,chat,complete} + # followed by config args + # followed by rest of cli args. + # maintaining this order will enforce the precedence + # of cli > config > defaults + args = [args[0]] + config_args + args[1:index] + args[index + 2:] + + return args + + @staticmethod + def _load_config_file(file_path: str) -> List[str]: + """Loads a yaml file and returns the key value pairs as a + flattened list with argparse like pattern + ```yaml + port: 12323 + tensor-parallel-size: 4 + ``` + returns: + processed_args: list[str] = [ + '--port': '12323', + '--tensor-parallel-size': '4' + ] + + """ + + extension: str = file_path.split('.')[-1] + if extension not in ('yaml', 'yml'): + raise ValueError( + "Config file must be of a yaml/yml type.\ + %s supplied", extension) + + # only expecting a flat dictionary of atomic types + processed_args: List[str] = [] + + config: Dict[str, Union[int, str]] = {} + try: + with open(file_path, 'r') as config_file: + config = yaml.safe_load(config_file) + except Exception as ex: + logger.error( + "Unable to read the config file at %s. 
\ + Make sure path is correct", file_path) + raise ex + + for key, value in config.items(): + processed_args.append('--' + key) + processed_args.append(str(value)) + + return processed_args + async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, **kwargs): From 2684efc4678eb46d1dc7fe4311365a99215e2dc6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 30 Aug 2024 09:01:26 -0700 Subject: [PATCH 08/51] [TPU][Bugfix] Fix tpu type api (#8035) --- vllm/attention/backends/pallas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index c324d62d44d79..83fdef16ef5cb 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -124,7 +124,10 @@ def __init__( self.megacore_mode = None tpu_env = torch_xla.tpu.get_tpu_env() - tpu_type = tpu_env.get("TYPE") or tpu_env.get("ACCELERATOR_TYPE") + tpu_type = (tpu_env.get("ACCELERATOR_TYPE", None) + or tpu_env.get("TYPE", None) + or tpu_env.get("TPU_ACCELERATOR_TYPE", None)) + assert tpu_type is not None tpu_type = tpu_type.lower() if "lite" not in tpu_type: From 1248e8506a4d98b4f15cbfe729cf2af42fb4223a Mon Sep 17 00:00:00 2001 From: Wenxiang <8460860+wenxcs@users.noreply.github.com> Date: Sat, 31 Aug 2024 03:42:57 +0800 Subject: [PATCH 09/51] [Model] Adding support for MSFT Phi-3.5-MoE (#7729) Co-authored-by: Your Name Co-authored-by: Zeqi Lin Co-authored-by: Zeqi Lin --- docs/source/models/supported_models.rst | 4 + tests/models/test_phimoe.py | 111 ++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 ++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 ++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 ++++ .../layers/fused_moe/fused_moe.py | 19 +- vllm/model_executor/layers/fused_moe/layer.py | 90 ++- .../compressed_tensors_moe.py | 24 +- .../layers/quantization/experts_int8.py | 26 +- .../model_executor/layers/quantization/fp8.py | 26 +- .../model_executor/layers/rotary_embedding.py | 26 +- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/phimoe.py | 620 ++++++++++++++++++ 13 files changed, 1255 insertions(+), 82 deletions(-) create mode 100644 tests/models/test_phimoe.py create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/models/phimoe.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index f727c646b7da7..2c20b6e48407d 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -147,6 +147,10 @@ Decoder-only Language Models - Phi-3-Small - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. - + * - :code:`PhiMoEForCausalLM` + - Phi-3.5-MoE + - :code:`microsoft/Phi-3.5-MoE-instruct`, etc. + - * - :code:`PersimmonForCausalLM` - Persimmon - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. diff --git a/tests/models/test_phimoe.py b/tests/models/test_phimoe.py new file mode 100644 index 0000000000000..2fb2eecc94672 --- /dev/null +++ b/tests/models/test_phimoe.py @@ -0,0 +1,111 @@ +"""Compare the outputs of HF and vLLM for moe models using greedy sampling. 
+ +Run `pytest tests/models/test_phimoe.py`. +""" +import pytest +import torch + +from vllm.utils import is_cpu + +from .utils import check_logprobs_close + +MODELS = [ + "microsoft/Phi-3.5-MoE-instruct", +] + + +def test_phimoe_routing_function(): + from vllm.model_executor.models.phimoe import phimoe_routing_function + test_case = { + 0: { + "hidden_states": + torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], + dtype=torch.float32, + requires_grad=False).view(4, 2), + "gating_output": + torch.tensor([0.1, 0.2, 0.3, 0.4], + dtype=torch.float32, + requires_grad=False), + "topk": + 2, + "renormalize": + False, + }, + 1: { + "hidden_states": + torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], + dtype=torch.float32, + requires_grad=False).view(4, 2), + "gating_output": + torch.tensor([0.4, 0.2, 0.3, 0.4], + dtype=torch.float32, + requires_grad=False), + "topk": + 2, + "renormalize": + False, + } + } + + ground_truth = { + 0: { + "topk_weights": + torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False), + "topk_ids": + torch.tensor([3, 2], dtype=torch.long, requires_grad=False), + }, + 1: { + "topk_weights": + torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False), + "topk_ids": + torch.tensor([0, 3], dtype=torch.long, requires_grad=False), + } + } + + for test_id in test_case: + topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id]) + assert torch.allclose(topk_weights, + ground_truth[test_id]["topk_weights"]) + assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) + + +def get_gpu_memory(): + try: + props = torch.cuda.get_device_properties(torch.cuda.current_device()) + gpu_memory = props.total_memory / (1024**3) + return gpu_memory + except Exception: + return 0 + + +@pytest.mark.skipif(condition=is_cpu(), + reason="This test takes a lot time to run on CPU, " + "and vllm CI's disk space is not enough for this model.") +@pytest.mark.skipif(condition=get_gpu_memory() < 100, + reason="Skip this test if GPU memory is insufficient.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..cd0cdbea0c337 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,130 @@ +{ + "3328": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "1792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2560": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2816": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3584": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "3840": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1280": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2304": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..ba9041d008507 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,130 @@ +{ + "3840": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "1792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "3584": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2816": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1280": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "768": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "3328": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2560": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "2304": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..57055453aa24c --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,130 @@ +{ + "2048": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "1792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "3328": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2560": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "768": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2816": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2304": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2 + }, + "1280": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3840": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + 
"1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3584": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index d2b152320e11e..05169eaddb256 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -2,7 +2,7 @@ import functools import json import os -from typing import Any, Dict, Optional, Tuple +from typing import Any, Callable, Dict, Optional, Tuple import torch import triton @@ -446,7 +446,8 @@ def fused_marlin_moe(hidden_states: torch.Tensor, rand_perm1: torch.Tensor, rand_perm2: torch.Tensor, topk: int, - renormalize: bool, + custom_routing_function: Optional[Callable] = None, + renormalize: bool = True, override_config: Optional[Dict[str, Any]] = None, use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, @@ -497,8 +498,12 @@ def fused_marlin_moe(hidden_states: torch.Tensor, E = w1.shape[0] N = w2.shape[1] * 16 - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) + if custom_routing_function is None: + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, + renormalize) + else: + topk_weights, topk_ids = custom_routing_function( + hidden_states, gating_output, topk, renormalize) get_config_func = functools.partial(try_get_optimal_moe_config, w1.shape, @@ -695,6 +700,7 @@ def fused_moe( use_grouped_topk: bool = False, num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, w1_scale: Optional[torch.Tensor] = None, @@ -742,9 +748,12 @@ def fused_moe( topk_weights, topk_ids = grouped_topk(hidden_states, gating_output, topk, renormalize, num_expert_group, topk_group) - else: + elif custom_routing_function is None: topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, renormalize) + else: + topk_weights, topk_ids = custom_routing_function( + hidden_states, gating_output, topk, renormalize) return fused_experts(hidden_states, w1, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 61ebef5e11f43..3df0b61a9ebe4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,6 +1,6 @@ from abc import abstractmethod from enum import Enum -from typing import List, Optional, Tuple +from typing import Callable, List, Optional, Tuple import torch @@ -62,15 +62,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None) -> torch.Tensor: + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None + ) -> torch.Tensor: return self.forward(x=x, 
layer=layer, @@ -79,17 +82,21 @@ def apply(self, renormalize=renormalize, use_grouped_topk=use_grouped_topk, topk_group=topk_group, - num_expert_group=num_expert_group) - - def forward_cuda(self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None) -> torch.Tensor: + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function) + + def forward_cuda( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts) @@ -101,7 +108,8 @@ def forward_cuda(self, top_k=top_k, renormalize=renormalize, topk_group=topk_group, - num_expert_group=num_expert_group) + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function) return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -114,20 +122,24 @@ def forward_cpu(self, *args, **kwargs): raise NotImplementedError( "The CPU backend currently does not support MoE.") - def forward_tpu(self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None) -> torch.Tensor: + def forward_tpu( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe assert not use_grouped_topk assert num_expert_group is None assert topk_group is None + assert custom_routing_function is None return fused_moe(hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -172,6 +184,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, tp_size: Optional[int] = None, prefix: str = "", + custom_routing_function: Optional[Callable] = None, ): super().__init__() @@ -190,6 +203,7 @@ def __init__( assert num_expert_group is not None and topk_group is not None self.num_expert_group = num_expert_group self.topk_group = topk_group + self.custom_routing_function = custom_routing_function if quant_config is None: self.quant_method: Optional[QuantizeMethodBase] = ( @@ -390,7 +404,8 @@ def select_experts(hidden_states: torch.Tensor, use_grouped_topk: bool, renormalize: bool, topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None): + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None): from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, grouped_topk) @@ -405,11 +420,17 @@ def select_experts(hidden_states: torch.Tensor, renormalize=renormalize, num_expert_group=num_expert_group, topk_group=topk_group) - else: + elif custom_routing_function is None: topk_weights, topk_ids = fused_topk(hidden_states=hidden_states, gating_output=router_logits, topk=top_k, renormalize=renormalize) + else: + topk_weights, topk_ids = custom_routing_function( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + 
renormalize=renormalize) return topk_weights, topk_ids @@ -426,7 +447,8 @@ def forward(self, hidden_states: torch.Tensor, renormalize=self.renormalize, use_grouped_topk=self.use_grouped_topk, topk_group=self.topk_group, - num_expert_group=self.num_expert_group) + num_expert_group=self.num_expert_group, + custom_routing_function=self.custom_routing_function) if self.reduce_results and self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 0e0ab9ce9169f..36323493d601e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1,6 +1,6 @@ import enum from enum import Enum -from typing import List, Optional +from typing import Callable, List, Optional import torch @@ -256,15 +256,18 @@ def marlin_moe_permute_scales(s: torch.Tensor, size_k: int, ) replace_tensor("w2_weight_scale", marlin_w2_scales) - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool = True, - use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, - topk_group: Optional[int] = None) -> torch.Tensor: + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_marlin_moe) @@ -278,6 +281,7 @@ def apply(self, layer.w13_g_idx_sort_indices, layer.w2_g_idx_sort_indices, top_k, + custom_routing_function, renormalize=renormalize, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale) diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index dabf17df78fef..116a4ea0aed89 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional import torch @@ -96,15 +96,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, requires_grad=False) layer.register_parameter("w2_scale", w2_scale) - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool = True, - use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, - topk_group: Optional[int] = None) -> torch.Tensor: + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts topk_weights, topk_ids = FusedMoE.select_experts( @@ -114,7 +117,8 @@ def apply(self, top_k=top_k, renormalize=renormalize, topk_group=topk_group, - num_expert_group=num_expert_group) + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function) return 
fused_experts(x, layer.w13_weight, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 1817dbcb023a7..32affe06b89b7 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional import torch from torch.nn import Module @@ -468,15 +468,18 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) return - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None) -> torch.Tensor: + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts @@ -487,7 +490,8 @@ def apply(self, top_k=top_k, renormalize=renormalize, topk_group=topk_group, - num_expert_group=num_expert_group) + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function) return fused_experts(x, layer.w13_weight, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 0562b71aa7493..c5a0278e485d4 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -503,8 +503,8 @@ def __init__( dtype: torch.dtype, short_factor: List[float], long_factor: List[float], - short_mscale: float = 1.0, - long_mscale: float = 1.0, + short_mscale: Optional[float] = None, + long_mscale: Optional[float] = None, ): super().__init__() @@ -523,18 +523,22 @@ def __init__( self.base = base self.short_factor = short_factor self.long_factor = long_factor - self.short_mscale = short_mscale - self.long_mscale = long_mscale - - scale = (self.max_position_embeddings / - self.original_max_position_embeddings) + scale = self.max_position_embeddings / \ + self.original_max_position_embeddings if scale <= 1.0: - self.scaling_factor = 1.0 + scaling_factor = 1.0 else: - self.scaling_factor = math.sqrt( + scaling_factor = math.sqrt( 1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + if short_mscale is None: + short_mscale = scaling_factor + if long_mscale is None: + long_mscale = scaling_factor + + self.short_mscale = short_mscale + self.long_mscale = long_mscale short_cache = self._compute_cos_sin_cache( original_max_position_embeddings, short_factor, short_mscale) @@ -571,8 +575,8 @@ def _compute_cos_sin_cache( inv_freq = self._compute_inv_freq(rescale_factors) t = torch.arange(max_position_embeddings, dtype=torch.float) freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() * mscale * self.scaling_factor - sin = freqs.sin() * mscale * self.scaling_factor + cos = freqs.cos() * mscale + sin = freqs.sin() * mscale cache = torch.cat((cos, sin), dim=-1) return cache diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index fc3d4922aea09..f4c3e43c8f2a4 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -50,6 +50,7 @@ "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), 
"PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), + "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py new file mode 100644 index 0000000000000..c8128052a3ebe --- /dev/null +++ b/vllm/model_executor/models/phimoe.py @@ -0,0 +1,620 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only PhiMoE model.""" +from typing import Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers.configuration_utils import PretrainedConfig + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import (QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors, SamplerOutput + +from .interfaces import SupportsLoRA + + +class PhiMoEConfig(PretrainedConfig): + + model_type = "phimoe" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=1e6, + sliding_window=None, + attention_dropout=0.0, + num_experts_per_tok=2, + num_local_experts=16, + output_router_logits=False, + router_aux_loss_coef=0.001, + 
router_jitter_noise=0.0, + attention_bias=False, + lm_head_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.attention_bias = attention_bias + self.lm_head_bias = lm_head_bias + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + self.num_experts_per_tok = num_experts_per_tok + self.num_local_experts = num_local_experts + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.router_jitter_noise = router_jitter_noise + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class mp(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + scores: torch.Tensor, + multiplier: torch.Tensor, + selected_experts: torch.Tensor, + masked_gates: torch.Tensor, + mask_for_one: torch.Tensor, + ): + ctx.save_for_backward(multiplier, selected_experts, masked_gates) + return multiplier * mask_for_one + + @staticmethod + def backward( + ctx, + grad_at_output: torch.Tensor, + ): + multiplier, selected_experts, masked_gates = ctx.saved_tensors + + grad_at_output = grad_at_output * multiplier + + grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1) + grad_at_scores_expaned.scatter_add_( + dim=-1, + index=selected_experts, + src=grad_at_output, + ) + + return ( + grad_at_scores_expaned, + None, + None, + None, + None, + ) + + +def sparsemixer(scores, jitter_eps=0.01): + ################ first expert ################ + + with torch.no_grad(): + # compute mask for sparsity + mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) + factor = scores.abs().clamp(min=mask_logits_threshold) + mask_logits_threshold = ( + (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + + # apply mask + masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) + selected_experts = max_ind + + # compute scores for gradients + masked_gates = torch.softmax(masked_gates, dim=-1) + multiplier_o = masked_gates.gather(dim=-1, index=selected_experts) + + multiplier = multiplier_o + + # masked out first expert + masked_scores = torch.scatter( + scores, + -1, + selected_experts, + float("-inf"), + ) + with torch.no_grad(): + # compute mask for sparsity + mask_logits_threshold, max_ind = masked_scores.max(dim=-1, + keepdim=True) + factor = scores.abs().clamp(min=mask_logits_threshold) + mask_logits_threshold = ( + (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + + # apply mask + masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, + float("-inf")) + selected_experts_top2 = max_ind + # compute scores for gradients + masked_gates_top2 = torch.softmax(masked_gates_top2, dim=-1) + multiplier_top2 = masked_gates_top2.gather(dim=-1, + index=selected_experts_top2) + + multiplier = torch.concat((multiplier, multiplier_top2), dim=-1) + selected_experts = torch.concat((selected_experts, 
selected_experts_top2), + dim=-1) + + return ( + multiplier, + selected_experts, + ) + + +def phimoe_routing_function( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +): + assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + assert topk == 2, "Only top-2 routing is supported" + assert renormalize is False, "Renormalization is not supported" + + topk_weights, topk_ids = sparsemixer(gating_output) + return topk_weights, topk_ids + + +class PhiMoE(nn.Module): + """A tensor-parallel MoE implementation for PhiMoE that shards each expert + across all ranks. + + Each expert's weights are sharded across all ranks and a fused MoE + kernel is used for the forward pass, and finally we reduce the outputs + across ranks. + """ + + def __init__( + self, + num_experts: int, + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, + ): + super().__init__() + self.hidden_size = hidden_size + + # Gate always runs at half / full precision for now. + self.gate = ReplicatedLinear( + hidden_size, + num_experts, + bias=False, + params_dtype=params_dtype, + quant_config=None, + ) + + self.experts = FusedMoE( + num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=False, + quant_config=quant_config, + tp_size=tp_size, + custom_routing_function=phimoe_routing_function) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # NOTE: hidden_states can have either 1D or 2D shape. + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, self.hidden_size) + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts(hidden_states, router_logits) + return final_hidden_states.view(orig_shape) + + +class PhiMoEAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + rope_scaling: Optional[dict] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=True, + quant_config=None, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=True, + quant_config=None, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=int(self.rope_theta), + is_neox_style=True, + rope_scaling=self.rope_scaling, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class PhiMoEDecoderLayer(nn.Module): + + def __init__( + self, + config: PhiMoEConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 10000) + self.self_attn = PhiMoEAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + cache_config=cache_config, + quant_config=quant_config, + rope_scaling=config.rope_scaling, + ) + self.block_sparse_moe = PhiMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + quant_config=quant_config, + ) + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.rms_norm_eps, + elementwise_affine=True) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.rms_norm_eps, + elementwise_affine=True) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + residual = hidden_states + + # Self Attention + hidden_states = self.input_layernorm(hidden_states) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states = hidden_states + residual + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.block_sparse_moe(hidden_states) + + hidden_states = hidden_states + residual + return hidden_states, residual + + +class PhiMoEModel(nn.Module): + + def __init__( + self, + config: PhiMoEConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + 
) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.layers = nn.ModuleList([ + PhiMoEDecoderLayer(config, cache_config, quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + self.norm = nn.LayerNorm(config.hidden_size, + eps=config.rms_norm_eps, + elementwise_affine=True) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i], attn_metadata, + residual) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class PhiMoEForCausalLM(nn.Module, SupportsLoRA): + fall_back_to_pt_during_load = False + + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__( + self, + config: PhiMoEConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + + self.config = config + self.lora_config = lora_config + + self.model = PhiMoEModel(config, + cache_config, + quant_config, + lora_config=lora_config) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=( + DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size), + quant_config=None, + bias=True, + ) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + expert_params_mapping = 
FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts) + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + weight_name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 622f8abff8e17a8274504cbbfb4b69c5724a0328 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Fri, 30 Aug 2024 22:18:50 -0700 Subject: [PATCH 10/51] [Bugfix] bugfix and add model test for flashinfer fp8 kv cache. (#8013) --- tests/models/test_fp8kv_flashinfer.py | 96 +++++++++++++++++++++++++++ vllm/attention/backends/flashinfer.py | 18 +++-- 2 files changed, 109 insertions(+), 5 deletions(-) create mode 100644 tests/models/test_fp8kv_flashinfer.py diff --git a/tests/models/test_fp8kv_flashinfer.py b/tests/models/test_fp8kv_flashinfer.py new file mode 100644 index 0000000000000..ff2a44162b6c3 --- /dev/null +++ b/tests/models/test_fp8kv_flashinfer.py @@ -0,0 +1,96 @@ +# flake8: noqa +"""Tests fp8 models against ground truth generation +This verifies the flashinfer backend with fp8 +quantization and fp8 KV Cache without scaling +factors Note: these tests will only pass on H100 GPU. +""" +import os +from typing import List + +import pytest +from transformers import AutoTokenizer + +from tests.quantization.utils import is_quant_method_supported +from vllm import LLM, SamplingParams + +os.environ["TOKENIZERS_PARALLELISM"] = "true" + +MAX_MODEL_LEN = 1024 + +MODELS = [ + "nm-testing/Meta-Llama-3-8B-Instruct-FP8", +] + +EXPECTED_STRS_MAP = { + "nm-testing/Meta-Llama-3-8B-Instruct-FP8": { + "auto": [ + 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', + 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', + 'In the sterile, metallic halls of the robotics lab, a peculiar phenomenon occurred. Zeta-5', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
The', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, mushi o', + ], + "fp8": [ + 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', + 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', + 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o', + ] + } +} + + +# This test compares against golden strings for exact match since +# there is no baseline implementation to compare against +# and is unstable w.r.t specifics of the fp8 implementation or +# the hardware being run on. +# No assert to prevent it from breaking the build +@pytest.mark.skipif(not is_quant_method_supported("fp8"), + reason="fp8 is not supported on this GPU type.") +@pytest.mark.parametrize("model_name", MODELS) +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) +@pytest.mark.parametrize("backend", ["XFORMERS", "FLASHINFER"]) +def test_models(example_prompts, model_name, kv_cache_dtype, backend) -> None: + # Note that the golden strings may not work for FLASHINFER Backend. + # The intention is to test the path + os.environ["VLLM_ATTENTION_BACKEND"] = backend + model = LLM(model=model_name, + max_model_len=MAX_MODEL_LEN, + trust_remote_code=True, + quantization="fp8", + kv_cache_dtype=kv_cache_dtype) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + formatted_prompts = [ + tokenizer.apply_chat_template([{ + "role": "user", + "content": prompt + }], + tokenize=False, + add_generation_prompt=True) + for prompt in example_prompts + ] + + params = SamplingParams(max_tokens=20, temperature=0) + generations: List[str] = [] + # Note: these need to be run 1 at a time due to numerical precision, + # since the expected strs were generated this way. 
+ for prompt in formatted_prompts: + outputs = model.generate(prompt, params) + generations.append(outputs[0].outputs[0].text) + del model + + print(f"Testing: {model_name} with kv_cache_dtype: {kv_cache_dtype}") + expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] + for i in range(len(example_prompts)): + generated_str = generations[i] + expected_str = expected_strs[i] + print(f"generated_str\n: {generated_str}") + print(f"expected_str\n: {expected_str}") diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index f554fa2805bd2..aa9d4a71dbf87 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -186,9 +186,13 @@ def graph_capture_get_metadata_for_batch(self, batch_size: int): self._graph_decode_workspace_buffer, _indptr_buffer, self._graph_indices_buffer, _last_page_len_buffer, "NHD", use_tensor_cores) + if self.runner.kv_cache_dtype.startswith("fp8"): + kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( + self.runner.kv_cache_dtype) + else: + kv_cache_dtype = get_kv_cache_torch_dtype( + self.runner.kv_cache_dtype, self.runner.model_config.dtype) - kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( - self.runner.kv_cache_dtype) paged_kv_indptr_tensor_host = torch.arange(0, batch_size + 1, dtype=torch.int32) @@ -349,7 +353,7 @@ def begin_forward(self): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", - ) + data_type=self.data_type) def asdict_zerocopy(self, skip_fields: Optional[Set[str]] = None @@ -586,8 +590,12 @@ def build(self, seq_lens: List[int], query_lens: List[int], paged_kv_indptr_tensor = None paged_kv_last_page_len_tensor = None - kv_cache_dtype = get_kv_cache_torch_dtype( - self.runner.kv_cache_dtype, self.runner.model_config.dtype) + if self.runner.kv_cache_dtype.startswith("fp8"): + kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( + self.runner.kv_cache_dtype) + else: + kv_cache_dtype = get_kv_cache_torch_dtype( + self.runner.kv_cache_dtype, self.runner.model_config.dtype) return FlashInferMetadata( num_prefills=self.num_prefills, From d05f0a9db2c32528f4aff7e741ff6caf21dd0802 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 31 Aug 2024 13:26:55 +0800 Subject: [PATCH 11/51] [Bugfix] Fix import error in Phi-3.5-MoE (#8052) --- vllm/model_executor/models/phimoe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index c8128052a3ebe..25bc0590c745c 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -38,13 +38,13 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA From 4f5d8446ede9f85182126804c6b07a56e06fd3d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: 
Sat, 31 Aug 2024 09:27:58 +0200 Subject: [PATCH 12/51] [Bugfix] Fix ModelScope models in v0.5.5 (#8037) --- vllm/transformers_utils/config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4a03446590fe5..f3ac8d3178d4e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -108,6 +108,9 @@ def get_hf_image_processor_config( revision: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: + # ModelScope does not provide an interface for image_processor + if VLLM_USE_MODELSCOPE: + return dict() # Separate model folder from file path for GGUF models if Path(model).is_file() and Path(model).suffix == ".gguf": model = Path(model).parent From 8423aef4c867818524e90b2e2e58730b6ee5592c Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 31 Aug 2024 15:44:03 -0400 Subject: [PATCH 13/51] [BugFix][Core] Multistep Fix Crash on Request Cancellation (#8059) --- vllm/engine/output_processor/multi_step.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 0209b0adc9831..e182cee8ba18e 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -88,9 +88,15 @@ def process_outputs(self, # TODO: Add support for async if necessary assert not is_async + # Sequences can be in RUNNING or FINISHED_ABORTED state + # once scheduled, as a sequence is moved to FINSIHED_ABORTED + # if a client disconnects from the api server. seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) + if seqs is None: + seqs = sequence_group.get_seqs( + status=SequenceStatus.FINISHED_ABORTED) - assert seqs, "expected running sequences" + assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences" assert len(seqs) == 1, ( "Beam search not supported in multi-step decoding.") seq = seqs[0] From 5231f0898e559671c6c8cc48efc53a859fce1841 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 31 Aug 2024 16:35:53 -0700 Subject: [PATCH 14/51] [Frontend][VLM] Add support for multiple multi-modal items (#8049) --- .buildkite/test-pipeline.yaml | 1 + examples/openai_vision_api_client.py | 39 +++ tests/entrypoints/openai/test_serving_chat.py | 2 + tests/entrypoints/openai/test_vision.py | 71 ++-- tests/entrypoints/test_chat_utils.py | 305 ++++++++++++++++++ vllm/entrypoints/chat_utils.py | 228 +++++++------ vllm/entrypoints/openai/serving_chat.py | 10 +- .../openai/serving_tokenization.py | 4 +- 8 files changed, 524 insertions(+), 136 deletions(-) create mode 100644 tests/entrypoints/test_chat_utils.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 235db72eee4b9..86eddb576c42a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -90,6 +90,7 @@ steps: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/openai + - pytest -v -s entrypoints/test_chat_utils.py - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index be90394511f89..e1d4055763e5f 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -1,7 +1,13 @@ """An 
example showing how to use vLLM to serve VLMs. Launch the vLLM server with the following command: + +(single image inference with Llava) vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja + +(multi-image inference with Phi-3.5-vision-instruct) +vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ + --trust-remote-code --limit-mm-per-prompt image=2 """ import base64 @@ -84,3 +90,36 @@ def encode_image_base64_from_url(image_url: str) -> str: result = chat_completion_from_base64.choices[0].message.content print(f"Chat completion output:{result}") + +# Multi-image input inference +image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" +image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" +chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What are the animals in these images?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url_duck + }, + }, + { + "type": "image_url", + "image_url": { + "url": image_url_lion + }, + }, + ], + }], + model=model, + max_tokens=64, +) + +result = chat_completion_from_url.choices[0].message.content +print(f"Chat completion output:{result}") diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 3783b7cd66a6a..c3a6c65be1d90 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from unittest.mock import MagicMock +from vllm.config import MultiModalConfig from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat @@ -20,6 +21,7 @@ class MockModelConfig: max_model_len = 100 tokenizer_revision = None embedding_mode = False + multimodal_config = MultiModalConfig() @dataclass diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index d2ef3c2071efb..f61fa127b7d06 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -6,11 +6,10 @@ from vllm.multimodal.utils import encode_image_base64, fetch_image -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer -MODEL_NAME = "llava-hf/llava-1.5-7b-hf" -LLAVA_CHAT_TEMPLATE = VLLM_PATH / "examples/template_llava.jinja" -assert LLAVA_CHAT_TEMPLATE.exists() +MODEL_NAME = "microsoft/Phi-3.5-vision-instruct" +MAXIMUM_IMAGES = 2 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ @@ -24,13 +23,9 @@ @pytest.fixture(scope="module") def server(): args = [ - "--dtype", - "bfloat16", - "--max-model-len", - "4096", - "--enforce-eager", - "--chat-template", - str(LLAVA_CHAT_TEMPLATE), + "--dtype", "bfloat16", "--max-model-len", "4096", "--max-num-seqs", + "5", "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -84,7 +79,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=596, 
total_tokens=606) + completion_tokens=10, prompt_tokens=772, total_tokens=782) message = choice.message message = chat_completion.choices[0].message @@ -139,7 +134,7 @@ async def test_single_chat_session_image_base64encoded( choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=596, total_tokens=606) + completion_tokens=10, prompt_tokens=772, total_tokens=782) message = choice.message message = chat_completion.choices[0].message @@ -217,26 +212,22 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, - image_url: str): + image_urls: List[str]): messages = [{ "role": "user", "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { + *({ "type": "image_url", "image_url": { "url": image_url } - }, + } for image_url in image_urls), { "type": "text", "text": "What's in this image?" @@ -244,20 +235,30 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, ], }] - with pytest.raises(openai.BadRequestError): # test multi-image input - await client.chat.completions.create( + if len(image_urls) > MAXIMUM_IMAGES: + with pytest.raises(openai.BadRequestError): # test multi-image input + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + ) + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 + else: + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, max_tokens=10, temperature=0.0, ) - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - completion = completion.choices[0].text - assert completion is not None and len(completion) >= 0 + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py new file mode 100644 index 0000000000000..53f99189beb1c --- /dev/null +++ b/tests/entrypoints/test_chat_utils.py @@ -0,0 +1,305 @@ +import warnings + +import pytest +from PIL import Image + +from vllm.assets.image import ImageAsset +from vllm.config import ModelConfig +from vllm.entrypoints.chat_utils import parse_chat_messages +from vllm.multimodal.utils import encode_image_base64 +from vllm.transformers_utils.tokenizer_group import TokenizerGroup + +PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" + + +@pytest.fixture(scope="module") +def phi3v_model_config(): + return ModelConfig(PHI3V_MODEL_ID, + PHI3V_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="bfloat16", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }) + + +@pytest.fixture(scope="module") +def phi3v_tokenizer(): + return TokenizerGroup( + tokenizer_id=PHI3V_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + 
+ +@pytest.fixture(scope="module") +def image_url(): + image = ImageAsset('cherry_blossom') + base64 = encode_image_base64(image.pil_image) + return f"data:image/jpeg;base64,{base64}" + + +@pytest.mark.asyncio +async def test_parse_chat_messages_with_image_url(phi3v_model_config, + phi3v_tokenizer, image_url): + conversation, mm_future = parse_chat_messages([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + mm_data = await mm_future + assert set(mm_data.keys()) == {"image"} + assert isinstance(mm_data["image"], Image.Image) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images(phi3v_model_config, + phi3v_tokenizer, image_url): + conversation, mm_future = parse_chat_messages([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?" + }] + mm_data = await mm_future + assert set(mm_data.keys()) == {"image"} + assert len(mm_data["image"]) == 2 + + +@pytest.mark.asyncio +async def test_parse_chat_messages_placeholder_already_in_prompt( + phi3v_model_config, phi3v_tokenizer, image_url): + conversation, mm_future = parse_chat_messages([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to <|image_2|>?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "What's in <|image_1|> and how does it compare to <|image_2|>?" + }] + mm_data = await mm_future + assert set(mm_data.keys()) == {"image"} + assert len(mm_data["image"]) == 2 + + +@pytest.mark.asyncio +async def test_parse_chat_messages_placeholder_one_already_in_prompt( + phi3v_model_config, phi3v_tokenizer, image_url): + conversation, mm_future = parse_chat_messages([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to the other one?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " + "other one?" + }] + mm_data = await mm_future + assert set(mm_data.keys()) == {"image"} + assert len(mm_data["image"]) == 2 + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_across_messages( + phi3v_model_config, phi3v_tokenizer, image_url): + conversation, mm_future = parse_chat_messages([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] + }, { + "role": "assistant", + "content": "Some stuff." 
+ }, { + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about this one?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\nWhat's in this image?" + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "<|image_2|>\nWhat about this one?" + }, + ] + mm_data = await mm_future + assert set(mm_data.keys()) == {"image"} + assert len(mm_data["image"]) == 2 + + +@pytest.mark.asyncio +async def test_parse_chat_messages_rejects_too_many_images_in_one_message( + phi3v_model_config, phi3v_tokenizer, image_url): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="coroutine 'async_get_and_parse_image' was never awaited") + with pytest.raises( + ValueError, + match="At most 2 image\\(s\\) may be provided in one request\\." + ): + parse_chat_messages([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_rejects_too_many_images_across_messages( + phi3v_model_config, phi3v_tokenizer, image_url): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="coroutine 'async_get_and_parse_image' was never awaited") + with pytest.raises( + ValueError, + match="At most 2 image\\(s\\) may be provided in one request\\." + ): + parse_chat_messages([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] + }, { + "role": "assistant", + "content": "Some stuff." + }, { + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about these two?" + }] + }], phi3v_model_config, phi3v_tokenizer) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c5368ac3bf026..c70c6d9330b10 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,9 +1,10 @@ +import asyncio import codecs -from dataclasses import dataclass +from collections import defaultdict from functools import lru_cache from pathlib import Path -from typing import (Any, Awaitable, Iterable, List, Literal, Optional, Tuple, - Union) +from typing import (Any, Awaitable, Dict, Iterable, List, Literal, Mapping, + Optional, Tuple, Union) # yapf conflicts with isort for this block # yapf: disable @@ -80,10 +81,90 @@ class ConversationMessage(TypedDict): content: str -@dataclass(frozen=True) -class ChatMessageParseResult: - messages: List[ConversationMessage] - mm_futures: List[Awaitable[MultiModalDataDict]] +class MultiModalItemTracker: + """ + Tracks multi-modal items in a given request and ensures that the number + of multi-modal items in a given request does not exceed the configured + maximum per prompt. 
+ """ + + def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + self._model_config = model_config + self._tokenizer = tokenizer + self._allowed_items = (model_config.multimodal_config.limit_per_prompt + if model_config.multimodal_config else {}) + self._consumed_items = {k: 0 for k in self._allowed_items} + self._futures: List[Awaitable[MultiModalDataDict]] = [] + + @staticmethod + @lru_cache(maxsize=None) + def _cached_token_str(tokenizer: AnyTokenizer, token_index: int): + return tokenizer.decode(token_index) + + def add(self, modality: Literal["image", "audio"], + mm_future: Awaitable[MultiModalDataDict]) -> Optional[str]: + """ + Adds the multi-modal item to the current prompt and returns the + placeholder string to use, if any. + """ + allowed_count = self._allowed_items.get(modality, 1) + current_count = self._consumed_items.get(modality, 0) + 1 + if current_count > allowed_count: + raise ValueError( + f"At most {allowed_count} {modality}(s) may be provided in " + "one request.") + + self._consumed_items[modality] = current_count + self._futures.append(mm_future) + + # TODO: Let user specify how to insert image tokens into prompt + # (similar to chat template) + model_type = self._model_config.hf_config.model_type + if modality == "image": + if model_type == "phi3_v": + # Workaround since this token is not defined in the tokenizer + return f"<|image_{current_count}|>" + if model_type == "minicpmv": + return "(./)" + if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"): + # These models do not use image tokens in the prompt + return None + if model_type.startswith("llava"): + return MultiModalItemTracker._cached_token_str( + self._tokenizer, + self._model_config.hf_config.image_token_index) + if model_type in ("chameleon", "internvl_chat"): + return "" + + raise TypeError(f"Unknown model type: {model_type}") + elif modality == "audio": + if model_type == "ultravox": + return "<|reserved_special_token_0|>" + raise TypeError(f"Unknown model type: {model_type}") + else: + raise TypeError(f"Unknown modality: {modality}") + + @staticmethod + async def _combine(futures: List[Awaitable[MultiModalDataDict]]): + mm_lists: Mapping[str, List[object]] = defaultdict(list) + + # Merge all the multi-modal items + for single_mm_data in (await asyncio.gather(*futures)): + for mm_key, mm_item in single_mm_data.items(): + if isinstance(mm_item, list): + mm_lists[mm_key].extend(mm_item) + else: + mm_lists[mm_key].append(mm_item) + + # Unpack any single item lists for models that don't expect multiple. 
+ return { + mm_key: mm_list[0] if len(mm_list) == 1 else mm_list + for mm_key, mm_list in mm_lists.items() + } + + def all_mm_data(self) -> Optional[Awaitable[MultiModalDataDict]]: + return MultiModalItemTracker._combine( + self._futures) if self._futures else None def load_chat_template( @@ -112,44 +193,30 @@ def load_chat_template( return resolved_chat_template -@lru_cache(maxsize=None) -def _mm_token_str(model_config: ModelConfig, tokenizer: AnyTokenizer, - modality: Literal["image", "audio"]) -> Optional[str]: - # TODO: Let user specify how to insert image tokens into prompt - # (similar to chat template) - model_type = model_config.hf_config.model_type - if modality == "image": - if model_type == "phi3_v": - # Workaround since this token is not defined in the tokenizer - return "<|image_1|>" - if model_type == "minicpmv": - return "(./)" - if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"): - # These models do not use image tokens in the prompt - return None - if model_type.startswith("llava"): - return tokenizer.decode(model_config.hf_config.image_token_index) - if model_type in ("chameleon", "internvl_chat"): - return "" - - raise TypeError(f"Unknown model type: {model_type}") - elif modality == "audio": - if model_type == "ultravox": - return "<|reserved_special_token_0|>" - raise TypeError(f"Unknown model type: {model_type}") - else: - raise TypeError(f"Unknown modality: {modality}") - - # TODO: Let user specify how to insert multimodal tokens into prompt # (similar to chat template) -def _get_full_multimodal_text_prompt(placeholder_token_str: str, +def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], text_prompt: str) -> str: """Combine multimodal prompts for a multimodal language model""" - # NOTE: For now we assume all model architectures use the same - # placeholder + text prompt format. This may change in the future. - return f"{placeholder_token_str}\n{text_prompt}" + # Look through the text prompt to check for missing placeholders + missing_placeholders = [] + for placeholder in placeholder_counts: + + # For any existing placeholder in the text prompt, we leave it as is + placeholder_counts[placeholder] -= text_prompt.count(placeholder) + + if placeholder_counts[placeholder] < 0: + raise ValueError( + f"Found more '{placeholder}' placeholders in input prompt than " + "actual multimodal data items.") + + missing_placeholders.extend([placeholder] * + placeholder_counts[placeholder]) + + # NOTE: For now we always add missing placeholders at the front of + # the prompt. This may change to be customizable in the future. 
+ return "\n".join(missing_placeholders + [text_prompt]) _TextParser = TypeAdapter(ChatCompletionContentPartTextParam) @@ -160,12 +227,12 @@ def _get_full_multimodal_text_prompt(placeholder_token_str: str, def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], - model_config: ModelConfig, - tokenizer: AnyTokenizer, -) -> ChatMessageParseResult: + mm_tracker: MultiModalItemTracker, +) -> List[ConversationMessage]: texts: List[str] = [] - mm_futures: List[Awaitable[MultiModalDataDict]] = [] - modality: Literal["image", "audio"] = "image" + + # multimodal placeholder_string : count + mm_placeholder_counts: Dict[str, int] = {} for part in parts: part_type = part["type"] @@ -173,11 +240,6 @@ def _parse_chat_message_content_parts( text = _TextParser.validate_python(part)["text"] texts.append(text) elif part_type == "image_url": - modality = "image" - if len(mm_futures) > 0: - raise NotImplementedError( - "Multiple multimodal inputs is currently not supported.") - image_url = _ImageParser.validate_python(part)["image_url"] if image_url.get("detail", "auto") != "auto": @@ -185,60 +247,44 @@ def _parse_chat_message_content_parts( "'image_url.detail' is currently not supported and " "will be ignored.") - image_future = async_get_and_parse_image(image_url["url"]) - mm_futures.append(image_future) + image_coro = async_get_and_parse_image(image_url["url"]) + placeholder = mm_tracker.add("image", image_coro) + if placeholder: + mm_placeholder_counts[placeholder] = mm_placeholder_counts.get( + placeholder, 0) + 1 elif part_type == "audio_url": - modality = "audio" - if len(mm_futures) > 0: - raise NotImplementedError( - "Multiple multimodal inputs is currently not supported.") - audio_url = _AudioParser.validate_python(part)["audio_url"] - audio_future = async_get_and_parse_audio(audio_url["url"]) - mm_futures.append(audio_future) + audio_coro = async_get_and_parse_audio(audio_url["url"]) + placeholder = mm_tracker.add("audio", audio_coro) + if placeholder: + mm_placeholder_counts[placeholder] = mm_placeholder_counts.get( + placeholder, 0) + 1 else: raise NotImplementedError(f"Unknown part type: {part_type}") text_prompt = "\n".join(texts) + if mm_placeholder_counts: + text_prompt = _get_full_multimodal_text_prompt(mm_placeholder_counts, + text_prompt) - if mm_futures: - placeholder_token_str = _mm_token_str(model_config, tokenizer, - modality) - if placeholder_token_str is not None: - if placeholder_token_str in text_prompt: - logger.warning( - "Detected multi-modal token string in the text prompt. 
" - "Skipping prompt formatting.") - else: - text_prompt = _get_full_multimodal_text_prompt( - placeholder_token_str=placeholder_token_str, - text_prompt=text_prompt, - ) - - messages = [ConversationMessage(role=role, content=text_prompt)] - - return ChatMessageParseResult(messages=messages, mm_futures=mm_futures) + return [ConversationMessage(role=role, content=text_prompt)] def _parse_chat_message_content( - message: ChatCompletionMessageParam, - model_config: ModelConfig, - tokenizer: AnyTokenizer, -) -> ChatMessageParseResult: + message: ChatCompletionMessageParam, + mm_tracker: MultiModalItemTracker) -> List[ConversationMessage]: role = message["role"] content = message.get("content") if content is None: - return ChatMessageParseResult(messages=[], mm_futures=[]) + return [] if isinstance(content, str): - messages = [ConversationMessage(role=role, content=content)] - return ChatMessageParseResult(messages=messages, mm_futures=[]) + return [ConversationMessage(role=role, content=content)] return _parse_chat_message_content_parts( role, content, # type: ignore - model_config, - tokenizer, + mm_tracker, ) @@ -246,18 +292,16 @@ def parse_chat_messages( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, -) -> Tuple[List[ConversationMessage], List[Awaitable[MultiModalDataDict]]]: +) -> Tuple[List[ConversationMessage], Optional[Awaitable[MultiModalDataDict]]]: conversation: List[ConversationMessage] = [] - mm_futures: List[Awaitable[MultiModalDataDict]] = [] + mm_tracker = MultiModalItemTracker(model_config, tokenizer) for msg in messages: - parse_result = _parse_chat_message_content(msg, model_config, - tokenizer) + sub_messages = _parse_chat_message_content(msg, mm_tracker) - conversation.extend(parse_result.messages) - mm_futures.extend(parse_result.mm_futures) + conversation.extend(sub_messages) - return conversation, mm_futures + return conversation, mm_tracker.all_mm_data() def apply_chat_template( diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d31ac4995fe2f..f7576509d06c8 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -94,7 +94,7 @@ async def create_chat_completion( tokenizer = await self.async_engine_client.get_tokenizer( lora_request) - conversation, mm_futures = parse_chat_messages( + conversation, mm_data_future = parse_chat_messages( request.messages, model_config, tokenizer) tool_dicts = None if request.tools is None else [ @@ -116,12 +116,8 @@ async def create_chat_completion( mm_data: Optional[MultiModalDataDict] = None try: - if len(mm_futures): - # since we support only single mm data currently - assert len( - mm_futures - ) == 1, "Multiple 'image_url' input is currently not supported." 
- mm_data = await mm_futures[0] + if mm_data_future: + mm_data = await mm_data_future except Exception as e: logger.error("Error in loading multi-modal data: %s", e) return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 1aeabb7a7d729..fc9ca29e9cf86 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -65,10 +65,10 @@ async def create_tokenize( if isinstance(request, TokenizeChatRequest): model_config = self.model_config - conversation, mm_futures = parse_chat_messages( + conversation, mm_data_future = parse_chat_messages( request.messages, model_config, tokenizer) - if mm_futures: + if mm_data_future: logger.warning( "Multi-modal inputs are ignored during tokenization") From 5b86b19954d30acaebb24bc5441b184ae3fcf345 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sun, 1 Sep 2024 14:46:57 -0700 Subject: [PATCH 15/51] [Misc] Optional installation of audio related packages (#8063) --- requirements-common.txt | 4 +--- requirements-test.txt | 4 +++- setup.py | 1 + tests/models/test_ultravox.py | 4 ++-- vllm/model_executor/models/ultravox.py | 6 +++++- vllm/multimodal/utils.py | 20 +++++++++++++++++--- 6 files changed, 29 insertions(+), 10 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index d7e10c7591a79..4c5b681a0d5ab 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -22,9 +22,7 @@ typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 pyzmq msgspec -librosa # Required for audio processing -soundfile # Required for audio processing gguf == 0.9.1 importlib_metadata mistral_common >= 1.3.4 -pyyaml \ No newline at end of file +pyyaml diff --git a/requirements-test.txt b/requirements-test.txt index 46eb05fc31099..58cf1716b45ce 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -13,10 +13,12 @@ pytest-shard awscli einops # required for MPT, qwen-vl and Mamba httpx +librosa # required for audio test peft requests ray sentence-transformers # required for embedding +soundfile # required for audio test compressed-tensors==0.4.0 # required for compressed-tensors timm # required for internvl test transformers_stream_generator # required for qwen-vl test @@ -30,4 +32,4 @@ aiohttp # quantization bitsandbytes==0.42.0 -buildkite-test-collector==0.1.8 \ No newline at end of file +buildkite-test-collector==0.1.8 diff --git a/setup.py b/setup.py index 21b0422c0f0bd..38d3f41663f2e 100644 --- a/setup.py +++ b/setup.py @@ -501,6 +501,7 @@ def _read_requirements(filename: str) -> List[str]: ext_modules=ext_modules, extras_require={ "tensorizer": ["tensorizer>=2.9.0"], + "audio": ["librosa", "soundfile"] # Required for audio processing }, cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, package_data=package_data, diff --git a/tests/models/test_ultravox.py b/tests/models/test_ultravox.py index 98de10aa08408..23008f9b8b563 100644 --- a/tests/models/test_ultravox.py +++ b/tests/models/test_ultravox.py @@ -1,11 +1,9 @@ from typing import List, Optional, Tuple, Type -import librosa import numpy as np import pytest from transformers import AutoModel, AutoTokenizer, BatchEncoding -from vllm.assets.audio import AudioAsset from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE @@ -21,6 +19,7 @@ @pytest.fixture(scope="session") def 
audio_and_sample_rate(): + from vllm.assets.audio import AudioAsset return AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -109,6 +108,7 @@ def process(hf_inputs: BatchEncoding): dtype=dtype, postprocess_inputs=process, auto_cls=AutoModel) as hf_model: + import librosa hf_outputs_per_audio = [ hf_model.generate_greedy_logprobs_limit( diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 827a9493a70d2..7994945c5ac39 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -8,7 +8,6 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union, cast) -import librosa import numpy as np import torch import torch.utils.checkpoint @@ -107,6 +106,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): feature_extractor = whisper_feature_extractor(ctx) if sr != feature_extractor.sampling_rate: + try: + import librosa + except ImportError: + raise ImportError( + "Please install vllm[audio] for audio support.") from None audio = librosa.resample(audio, orig_sr=sr, target_sr=feature_extractor.sampling_rate) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 989b2e1a814c9..4bed267e99637 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,11 +1,9 @@ import base64 from functools import lru_cache from io import BytesIO -from typing import List, Optional, Tuple, TypeVar, Union +from typing import Any, List, Optional, Tuple, TypeVar, Union -import librosa import numpy as np -import soundfile from PIL import Image from vllm.connections import global_http_connection @@ -73,10 +71,22 @@ async def async_fetch_image(image_url: str, return image.convert(image_mode) +def try_import_audio_packages() -> Tuple[Any, Any]: + try: + import librosa + import soundfile + except ImportError: + raise ImportError( + "Please install vllm[audio] for audio support.") from None + return librosa, soundfile + + def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: """ Load audio from a URL. """ + librosa, _ = try_import_audio_packages() + if audio_url.startswith("http"): audio_bytes = global_http_connection.get_bytes( audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT) @@ -95,6 +105,8 @@ async def async_fetch_audio( """ Asynchronously fetch audio from a URL. 
""" + librosa, _ = try_import_audio_packages() + if audio_url.startswith("http"): audio_bytes = await global_http_connection.async_get_bytes( audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT) @@ -123,6 +135,8 @@ def encode_audio_base64( sampling_rate: int, ) -> str: """Encode audio as base64.""" + _, soundfile = try_import_audio_packages() + buffered = BytesIO() soundfile.write(buffered, audio, sampling_rate, format="WAV") From f8d60145b4d954b7a110073f77dc91842155a3d8 Mon Sep 17 00:00:00 2001 From: Shawn Tan Date: Sun, 1 Sep 2024 21:37:18 -0400 Subject: [PATCH 16/51] [Model] Add Granite model (#7436) Co-authored-by: Nick Hill --- tests/models/test_granite.py | 49 ++ vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/granite.py | 543 +++++++++++++++++++++ vllm/transformers_utils/configs/granite.py | 199 ++++++++ 4 files changed, 792 insertions(+) create mode 100644 tests/models/test_granite.py create mode 100644 vllm/model_executor/models/granite.py create mode 100644 vllm/transformers_utils/configs/granite.py diff --git a/tests/models/test_granite.py b/tests/models/test_granite.py new file mode 100644 index 0000000000000..2435b5dc3ff88 --- /dev/null +++ b/tests/models/test_granite.py @@ -0,0 +1,49 @@ +"""Compare the outputs of HF and vLLM for Granite models using greedy sampling. + +Run `pytest tests/models/test_granite.py`. +""" +import importlib.metadata + +import pytest + +from .utils import check_logprobs_close + +TRANSFORMERS_VERSION = tuple( + map(int, + importlib.metadata.version("transformers").split("."))) + +MODELS = [ + "ibm/PowerLM-3b", +] + + +# GraniteForCausalLM will be in transformers >= 4.45 +@pytest.mark.skipif(TRANSFORMERS_VERSION < (4, 45), + reason="granite model test requires transformers >= 4.45") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + # TODO(sang): Sliding window should be tested separately. + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index f4c3e43c8f2a4..e30370596496a 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -65,6 +65,7 @@ "EAGLEModel": ("eagle", "EAGLE"), "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), "JambaForCausalLM": ("jamba", "JambaForCausalLM"), + "GraniteForCausalLM": ("granite", "GraniteForCausalLM") } _EMBEDDING_MODELS = { diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py new file mode 100644 index 0000000000000..b0325e8b616c8 --- /dev/null +++ b/vllm/model_executor/models/granite.py @@ -0,0 +1,543 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
+# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only IBM Granite model compatible with HuggingFace weights.""" +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + get_compressed_tensors_cache_scale) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.granite import GraniteConfig +from vllm.utils import is_hip + +from .interfaces import SupportsLoRA +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers + + +class GraniteMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class GraniteAttention(nn.Module): + + def __init__( + self, + config: GraniteConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr(config, "head_dim", + self.hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = config.attention_multiplier + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class GraniteDecoderLayer(nn.Module): + + def __init__( + self, + config: GraniteConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.residual_multiplier = config.residual_multiplier + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + 
config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + self.self_attn = GraniteAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + + self.mlp = GraniteMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states = residual + hidden_states * self.residual_multiplier + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states * self.residual_multiplier + return hidden_states + + +class GraniteModel(nn.Module): + + def __init__( + self, + config: GraniteConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + quant_config=quant_config, + ) + else: + self.embed_tokens = PPMissingLayer() + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: GraniteDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers") + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, 
IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + hidden_states *= self.config.embedding_multiplier + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states = self.norm(hidden_states) + return hidden_states + + +class GraniteForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens", + "lm_head" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + + def __init__( + self, + config: GraniteConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + + self.config = config + self.lora_config = lora_config + + self.model = GraniteModel(config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model") + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + self.sampler = Sampler() + else: + self.lm_head = PPMissingLayer() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) + return model_output + + def compute_logits( + self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + logits /= self.config.logits_scaling + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) 
+ return next_tokens + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + # With tie_word_embeddings, we can skip lm_head.weight + # The weight might appear unnecessarily in the files if the model is + # processed with quantization, LoRA, fine-tuning, etc. + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue + if scale_name := get_compressed_tensors_cache_scale(name): + # Loading kv cache scales for compressed-tensors quantization + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = loaded_weight[0] + weight_loader(param, loaded_weight) + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + # If this function is called, it should always initialize KV cache scale + # factors (or else raise an exception). 
Thus, handled exceptions should + # make sure to leave KV cache scale factors in a known good (dummy) state + def load_kv_cache_scales(self, quantization_param_path: str) -> None: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + for layer_idx, scaling_factor in kv_cache_scales_loader( + quantization_param_path, tp_rank, tp_size, + self.config.num_hidden_layers, + self.config.__class__.model_type): + if not isinstance(self.model.layers[layer_idx], nn.Identity): + layer_self_attn = self.model.layers[layer_idx].self_attn + + if is_hip(): + # The scaling factor convention we are assuming is + # quantized_value * scaling_factor ~= true_value + # which is consistent with the practice of setting + # scaling_factor = tensor_amax / FPtype_max + scaling_factor *= 2 + if hasattr(layer_self_attn, "kv_scale"): + layer_self_attn.attn._kv_scale = scaling_factor + else: + raise RuntimeError("Self attention has no KV cache scaling " + "factor attribute!") diff --git a/vllm/transformers_utils/configs/granite.py b/vllm/transformers_utils/configs/granite.py new file mode 100644 index 0000000000000..c12838be5d385 --- /dev/null +++ b/vllm/transformers_utils/configs/granite.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Granite model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_rope_utils import rope_config_validation +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class GraniteConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of + a [`GraniteModel`]. It is used to instantiate an Granite + model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the Granite-3B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to + control the model outputs. Read the documentation from [`PretrainedConfig`] + for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Granite model. Defines the number of + different tokens that can be represented by the `inputs_ids` + passed when calling [`GraniteModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. 
+ num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the + Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to + implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi + Head Attention (MHA), if `num_key_value_heads=1` the model will use + Multi Query Attention (MQA) otherwise GQA is used. When converting + a multi-head checkpoint to a GQA checkpoint, each group key and + value head should be constructed by meanpooling all the original + heads within that group. For more details checkout + [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not + specified, will default to `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the + decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values + attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE + embeddings. Currently supports two scaling strategies: linear and + dynamic. Their scaling factor must be a float greater than 1. The + expected format is + `{"type": strategy name, "factor": scaling factor}`. + When using this flag, don't update `max_position_embeddings` to + the expected new maximum. See the following thread for more + information on how these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. + This is an experimental feature, subject to breaking API changes + in future versions. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output + projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers + in the MLP layers. 
+ embedding_multiplier (`float`, *optional*, defaults to 1.0): + embedding multiplier + logits_scaling (`float`, *optional*, defaults to 1.0): + divisor for output logits + residual_multiplier (`float`, *optional*, defaults to 1.0): + residual multiplier + attention_multiplier (`float`, *optional*, defaults to 1.0): + attention multiplier + + ```python + >>> from transformers import GraniteModel, GraniteConfig + + >>> # Initializing a Granite granite-3b style configuration + >>> configuration = GraniteConfig() + + >>> # Initializing a model from the granite-7b style configuration + >>> model = GraniteModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "granite" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + embedding_multiplier=1.0, + logits_scaling=1.0, + residual_multiplier=1.0, + attention_multiplier=1.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + + self.embedding_multiplier = embedding_multiplier + self.logits_scaling = logits_scaling + self.residual_multiplier = residual_multiplier + self.attention_multiplier = attention_multiplier + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + rope_config_validation(self) From e6a26ed0376f39c0ae99ee1af1e390087fc81f8a Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Sun, 1 Sep 2024 21:23:29 -0700 Subject: [PATCH 17/51] [SpecDecode][Kernel] Flashinfer Rejection Sampling (#7244) --- Dockerfile | 2 +- tests/samplers/test_rejection_sampler.py | 116 +++++++++-- .../test_typical_acceptance_sampler.py | 50 +++-- tests/spec_decode/test_spec_decode_worker.py | 5 +- vllm/envs.py | 1 + .../layers/rejection_sampler.py | 184 ++++++++++++++---- .../layers/spec_decode_base_sampler.py | 43 ++-- .../layers/typical_acceptance_sampler.py | 7 +- vllm/spec_decode/spec_decode_worker.py | 7 +- 9 files changed, 306 insertions(+), 109 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9bae9a12c0eb2..ec6069f605eb1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -162,7 +162,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/pip \ . 
/etc/environment && \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl #################### vLLM installation IMAGE #################### diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 3ce4a5f658198..91a9d879eb4a5 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -44,12 +44,16 @@ def mock_causal_accepted_tensor( ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) @pytest.mark.parametrize("disable_bonus_tokens", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("use_flashinfer", [True, False]) @torch.inference_mode() -def test_correct_output_format(which_tokens_accepted: str, - disable_bonus_tokens: bool, seed: int, - device: str): +def test_correct_output_format(which_tokens_accepted: str, seed: int, + disable_bonus_tokens: bool, device: str, + use_flashinfer: bool): """Verify the output has correct format given predetermined accepted matrix. """ + if use_flashinfer and disable_bonus_tokens: + pytest.skip("Flashinfer rejection sampler must enable bonus token.") + set_random_seed(seed) torch.set_default_device(device) @@ -85,7 +89,8 @@ def test_correct_output_format(which_tokens_accepted: str, dtype=torch.int64) rejection_sampler = RejectionSampler( - disable_bonus_tokens=disable_bonus_tokens) + disable_bonus_tokens=disable_bonus_tokens, + use_flashinfer=use_flashinfer) rejection_sampler.init_gpu_tensors(device=device) output_token_ids = rejection_sampler._create_output( # pylint: disable=protected-access accepted, @@ -133,15 +138,20 @@ def test_correct_output_format(which_tokens_accepted: str, @pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("batch_size", list(range(1, 32))) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("use_flashinfer", [True, False]) @torch.inference_mode() def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, - device: str): + device: str, use_flashinfer: bool): torch.set_default_device(device) - rejection_sampler = RejectionSampler() + rejection_sampler = RejectionSampler(disable_bonus_tokens=False, + use_flashinfer=use_flashinfer) rejection_sampler.init_gpu_tensors(device=device) draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size, + k + 1, + vocab_size, + dtype=torch.float32) bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), @@ -161,16 +171,21 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, @pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) @pytest.mark.parametrize("n_rep", [100]) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("use_flashinfer", [True, False]) @torch.inference_mode() def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, - frac_seeded: float, n_rep: int, - device: str): + frac_seeded: float, n_rep: int, device: str, + use_flashinfer: bool): torch.set_default_device(device) - rejection_sampler = RejectionSampler() + 
rejection_sampler = RejectionSampler(disable_bonus_tokens=False, + use_flashinfer=use_flashinfer) rejection_sampler.init_gpu_tensors(device=device) draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size, + k + 1, + vocab_size, + dtype=torch.float32) bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), @@ -198,23 +213,85 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, assert torch.equal(results[j][i], results[0][i]) +@pytest.mark.parametrize("k", [1, 3, 6]) +@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) +@pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_compare_nonflashinfer_backend(k: int, vocab_size: int, + batch_size: int, device: str): + """ + Test the flashinfer and nonflashinfer backend generate + the same output metrics. + """ + torch.set_default_device(device) + torch.manual_seed(0) + draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size, + k + 1, + vocab_size, + dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) + + num_accepted_tokens = [] + num_emitted_tokens = [] + num_draft_tokens = [] + + def get_seeded_seqs(): + return { + i: torch.Generator(device=device).manual_seed(i) + for i in range(batch_size) + } + + for use_flashinfer in [True, False]: + rejection_sampler = RejectionSampler(disable_bonus_tokens=False, + use_flashinfer=use_flashinfer) + rejection_sampler.init_gpu_tensors(device=device) + # We use seeded sequences to ensure the same tokens are accepted + # for both flashinfer and nonflashinfer backends. 
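        # A fresh, identically seeded generator dict is built on every
        # loop iteration because torch.Generator objects are stateful:
        # reusing the generators already advanced by the first backend
        # would feed different uniform samples to the second backend and
        # invalidate the comparison.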
+ seeded_seqs = get_seeded_seqs() + rejection_sampler(target_probs, bonus_token_ids, draft_probs, + draft_token_ids, seeded_seqs) + num_accepted_tokens.append(rejection_sampler.num_accepted_tokens) + num_emitted_tokens.append(rejection_sampler.num_emitted_tokens) + num_draft_tokens.append(rejection_sampler.num_draft_tokens) + + assert num_accepted_tokens[0] == num_accepted_tokens[1] + assert num_emitted_tokens[0] == num_emitted_tokens[1] + assert num_draft_tokens[0] == num_draft_tokens[1] + + @pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) @pytest.mark.parametrize("which_token_ids", ["bonus_token_ids", "draft_token_ids"]) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("use_flashinfer", [True, False]) @torch.inference_mode() def test_raises_when_vocab_oob(above_or_below_vocab_range: str, - which_token_ids: str, device: str): + which_token_ids: str, device: str, + use_flashinfer: bool): k = 3 batch_size = 5 vocab_size = 30_000 torch.set_default_device(device) - rejection_sampler = RejectionSampler(strict_mode=True) + rejection_sampler = RejectionSampler(disable_bonus_tokens=False, + use_flashinfer=use_flashinfer, + strict_mode=True) rejection_sampler.init_gpu_tensors(device=device) draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size, + k + 1, + vocab_size, + dtype=torch.float32) bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), @@ -248,9 +325,10 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) @pytest.mark.parametrize("seed", list(range(5))) +@pytest.mark.parametrize("use_flashinfer", [True, False]) @torch.inference_mode() def test_rejection_sampling_approximates_target_distribution( - seed: int, draft_and_target_probs_equal: bool): + seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool): """Verify rejection sampling approximates target distribution, despite sampling from a potentially distinct draft distribution. @@ -279,10 +357,10 @@ def test_rejection_sampling_approximates_target_distribution( """ torch.set_default_device("cpu") set_random_seed(seed) - helper = _CorrectnessTestHelper( vocab_size=10, - rejection_sampler=RejectionSampler(), + rejection_sampler=RejectionSampler(disable_bonus_tokens=False, + use_flashinfer=use_flashinfer), ) draft_probs, target_probs, reference_probs = helper.generate_probs_for_test( @@ -398,10 +476,10 @@ def _estimate_rejection_sampling_pdf( draft_probs = draft_probs.reshape(1, self.k, self.vocab_size).repeat( num_samples, 1, 1) - # Repeat target probs num_samples * k times. + # Repeat target probs num_samples * (k + 1) times. # Rejection sampler requires bonus token probs, but they aren't used. target_probs = target_probs.reshape(1, 1, self.vocab_size).repeat( - num_samples, self.k, 1) + num_samples, self.k + 1, 1) # Randomly sample draft token ids from draft probs. 
draft_token_ids = torch.multinomial(draft_probs[:, 0, :], diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index aa3c1d29bdb36..e81ec4a0fdf1f 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -79,7 +79,10 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler() typical_acceptance_sampler.init_gpu_tensors(device=device) - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_with_bonus_probs = torch.rand(batch_size, + k + 1, + vocab_size, + dtype=torch.float32) bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), @@ -89,7 +92,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, size=(batch_size, k), dtype=torch.int64) # Verify that sampling succeeds for all cases. - typical_acceptance_sampler(target_probs, + typical_acceptance_sampler(target_with_bonus_probs, bonus_token_ids, draft_probs=None, draft_token_ids=draft_token_ids) @@ -112,7 +115,10 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) typical_acceptance_sampler.init_gpu_tensors(device=device) - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_with_bonus_probs = torch.rand(batch_size, + k + 1, + vocab_size, + dtype=torch.float32) bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), @@ -141,7 +147,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, oob_token_ids[0][0] = rogue_token_id with pytest.raises(AssertionError): - typical_acceptance_sampler(target_probs, + typical_acceptance_sampler(target_with_bonus_probs, bonus_token_ids, draft_probs=None, draft_token_ids=draft_token_ids) @@ -172,7 +178,10 @@ def test_uniform_target_distribution_accepts_all_tokens( typical_acceptance_sampler = get_acceptance_sampler( strict_mode=True, disable_bonus_tokens=disable_bonus_tokens) typical_acceptance_sampler.init_gpu_tensors(device=device) - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_with_bonus_probs = torch.rand(batch_size, + k + 1, + vocab_size, + dtype=torch.float32) draft_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, k), @@ -182,7 +191,7 @@ def test_uniform_target_distribution_accepts_all_tokens( size=(batch_size, 1), dtype=torch.int64) output_token_ids = typical_acceptance_sampler( - target_probs, + target_with_bonus_probs, bonus_token_ids, draft_probs=None, draft_token_ids=draft_token_ids) @@ -229,8 +238,9 @@ def test_temperature_zero_target_distribution(seed: int, # Simulate temperature 0 probability distribution for target probabilities # and create target probabilities such that only 1 token id has # probability 1.0 - target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist( - batch_size, k, vocab_size) + target_with_bonus_probs, zero_temperature_token_ids = \ + get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) + zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] # Populate draft_token_ids such that they exclude the token_ids # with probability = 1.0 draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, @@ -245,7 +255,7 @@ def test_temperature_zero_target_distribution(seed: int, # fallback to the greedy 
sampling for selecting 1 token for each sequence. # Verify the same. output_token_ids = typical_acceptance_sampler( - target_probs, + target_with_bonus_probs, bonus_token_ids, draft_probs=None, draft_token_ids=draft_token_ids) @@ -289,8 +299,10 @@ def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool, # For sequences 0 and 2 set the distribution to a temperature # zero distribution. For sequences 1 and 3 set it to a uniform # distribution. - target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( - batch_size, k, vocab_size)) + target_with_bonus_probs, zero_temperature_token_ids = \ + get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) + zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] + target_probs = target_with_bonus_probs[:, :-1] draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, zero_temperature_token_ids) uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32) @@ -300,7 +312,7 @@ def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool, size=(batch_size, 1), dtype=torch.int64) output_token_ids = typical_acceptance_sampler( - target_probs, + target_with_bonus_probs, bonus_token_ids, draft_probs=None, draft_token_ids=draft_token_ids) @@ -356,15 +368,16 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool, # Create a temperature zero target probability distribution and ensure # all draft token ids correspond to the tokens with 1.0 probability. # Verify that all of them are accepted. - target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( - batch_size, k, vocab_size)) + target_with_bonus_probs, zero_temperature_token_ids = \ + get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) + zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] draft_token_ids = zero_temperature_token_ids bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), dtype=torch.int64) output_token_ids = typical_acceptance_sampler( - target_probs, + target_with_bonus_probs, bonus_token_ids, draft_probs=None, draft_token_ids=draft_token_ids) @@ -384,7 +397,7 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool, draft_token_ids = torch.cat( (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1) output_token_ids = typical_acceptance_sampler( - target_probs, + target_with_bonus_probs, bonus_token_ids, draft_probs=None, draft_token_ids=draft_token_ids) @@ -421,8 +434,9 @@ def test_accept_tokens_set_non_default_posteriors(seed: int, # 0.00001. Populate draft_token_ids such that they exclude the token_ids # with probability = 1.0. Without any changes to the posterior thresholds # none of the draft tokens are accepted. 
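    # (With such a peaked target distribution, the entropy-based posterior
    # acceptance threshold stays far above 0.00001, so every draft position
    # is rejected and replaced by the recovered greedy token unless the
    # posterior thresholds are relaxed.)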
- target_probs, zero_temperature_token_ids = (get_zero_temperature_prob_dist( - batch_size, k, vocab_size)) + target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist( + batch_size, k + 1, vocab_size) + zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] target_probs[target_probs == 0] = 0.00001 draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, zero_temperature_token_ids) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index cbaffee2f41e2..501d05756e01c 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -230,9 +230,8 @@ def test_correctly_calls_spec_decode_sampler(k: int, batch_size: int, assert torch.equal(actual.bonus_token_ids, target_token_ids.reshape(batch_size, k + 1)[:, -1:]) - assert torch.equal( - actual.target_probs, - target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1]) + assert torch.equal(actual.target_with_bonus_probs, + target_token_probs.reshape(batch_size, k + 1, -1)) assert torch.equal(actual.draft_token_ids, proposal_token_ids) assert torch.equal(actual.draft_probs, proposal_probs) diff --git a/vllm/envs.py b/vllm/envs.py index 30320af5fa43a..3c6b6adff82fc 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -31,6 +31,7 @@ VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: bool = False + VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 2124196d06f9c..b2f333a5bcc80 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,12 +1,28 @@ from functools import cached_property +from importlib.util import find_spec from typing import Dict, List, Optional, Tuple import torch import torch.jit +import vllm.envs as envs +from vllm.logger import init_logger from vllm.model_executor.layers.spec_decode_base_sampler import ( SpecDecodeStochasticBaseSampler) +logger = init_logger(__name__) + +if find_spec("flashinfer"): + """ + Consider utilizing the FlashInfer rejection sampling kernel initially, + as it employs a dedicated kernel rather than relying on + Torch tensor operations. This design choice helps to fuse operations, + reduce memory I/O, and consequently enhances performance. + """ + from flashinfer.sampling import chain_speculative_sampling +else: + chain_speculative_sampling = None + class RejectionSampler(SpecDecodeStochasticBaseSampler): """Apply modified rejection sampling as described in "Accelerating Large @@ -16,7 +32,8 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): def __init__(self, disable_bonus_tokens: bool = True, - strict_mode: bool = False): + strict_mode: bool = False, + use_flashinfer: Optional[bool] = None): """Create a rejection sampler. Args: @@ -26,13 +43,29 @@ def __init__(self, strict_mode: Whether or not to perform shape/device/dtype checks during sampling. This catches correctness issues but adds nontrivial latency. + use_falshinfer: We will use this parameter to determine whether + to use the FlashInfer rejection sampling kernel or not. If it's + None, we will use the default value from the environment variable. + This parameter is only used for testing purposes. 
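        A minimal construction sketch, mirroring how the rejection sampler
        tests in this patch drive both backends (the device string is
        illustrative; tensor shapes are as documented in forward()):

            sampler = RejectionSampler(disable_bonus_tokens=False,
                                       use_flashinfer=False)
            sampler.init_gpu_tensors(device="cuda:0")
            output_token_ids = sampler(target_with_bonus_probs,
                                       bonus_token_ids, draft_probs,
                                       draft_token_ids)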
""" super().__init__(disable_bonus_tokens=disable_bonus_tokens, strict_mode=strict_mode) + if use_flashinfer is None: + self.use_flashinfer = envs.VLLM_USE_FLASHINFER_SAMPLER and ( + chain_speculative_sampling is not None) + else: + self.use_flashinfer = use_flashinfer + + if self.use_flashinfer: + assert not disable_bonus_tokens, \ + "flashinfer will enable bonus token by default" + logger.info("Use flashinfer for rejection sampling.") + else: + logger.info("Use pytorch for rejection sampling.") def forward( self, - target_probs: torch.Tensor, + target_with_bonus_probs: torch.Tensor, bonus_token_ids: torch.Tensor, draft_probs: torch.Tensor, draft_token_ids: torch.Tensor, @@ -50,9 +83,9 @@ def forward( sequence. Args: - target_probs: The probability distribution over token ids given - context according to the target model. - shape = [batch_size, num_speculative_tokens, vocab_size] + target_with_bonus_probs: The probability distribution + over token ids given context according to the target model. + shape = [batch_size, num_speculative_tokens + 1, vocab_size] bonus_token_ids: The "bonus" token ids that are accepted iff all speculative tokens in a sequence are accepted. @@ -78,23 +111,52 @@ def forward( # Only perform shape/dtype/device checking in strict mode, as it adds # overhead. if self._strict_mode: - self._raise_if_incorrect_input(target_probs, draft_token_ids, - bonus_token_ids, draft_probs) + self._raise_if_incorrect_input(target_with_bonus_probs, + draft_token_ids, bonus_token_ids, + draft_probs) - accepted, recovered_token_ids = ( - self._batch_modified_rejection_sampling( - target_probs, - draft_probs, - draft_token_ids, - seeded_seqs, - )) + batch_size, k, _ = draft_probs.shape - output_token_ids = self._create_output( - accepted, - recovered_token_ids, - draft_token_ids, - bonus_token_ids, - ) + # batch_size = 0 when all requests in the batch are + # non_spec requests. In this case, output_token_ids is + # just an empty tensor. + if batch_size == 0: + return torch.empty(0, k + 1, device=draft_probs.device, dtype=int) + + # If use Flashinfer chain_speculative_sampling kernel + # for rejection sampling + if self.use_flashinfer: + batch_size, k, _ = draft_probs.shape + uniform_samples = self._create_uniform_samples( + seeded_seqs, batch_size, k, draft_probs.device) + output_token_ids, accepted_token_num, emitted_token_num \ + = chain_speculative_sampling( + draft_probs, draft_token_ids, uniform_samples, + target_with_bonus_probs) + + # num_emitted_tokens returned by flashinfer + # does not include the bonus token + # Flashinfer stops at the first token that violates + # the condition p >= q and does not include recovery/bonus token. + # Therefore, we need to add batch_size here. 
+ self.num_accepted_tokens += accepted_token_num.sum() + self.num_emitted_tokens += emitted_token_num.sum() + batch_size + self.num_draft_tokens += batch_size * k + else: + accepted, recovered_token_ids = ( + self._batch_modified_rejection_sampling( + target_with_bonus_probs[:, :-1], + draft_probs, + draft_token_ids, + seeded_seqs, + )) + + output_token_ids = self._create_output( + accepted, + recovered_token_ids, + draft_token_ids, + bonus_token_ids, + ) return output_token_ids @@ -135,6 +197,63 @@ def _batch_modified_rejection_sampling( return accepted, recovered_token_ids + def _create_uniform_samples(self, + seeded_seqs: Optional[Dict[int, + torch.Generator]], + batch_size: int, k: int, + device: torch.device) -> torch.Tensor: + """ + Generates a batch of uniform random samples, with optional seeding + for specific sequences. + + This method creates a tensor of shape `(batch_size, k + 1)` filled + with uniform random values in the range [0, 1). If `seeded_seqs` + is provided, the sequences corresponding to specific indices + will be generated using the provided `torch.Generator` for + reproducibility. The other sequences will be generated without + a seed. + + Args: + seeded_seqs : Optional[Dict[int, torch.Generator]] + A dictionary mapping indices in the batch to + `torch.Generator` objects. If `None`, all samples are + generated without a seed. + batch_size : int + The number of sequences to generate. + k : int + The number of random samples per sequence. + device : torch.device + The device on which to allocate the tensor. + + Returns: + uniform_rand : torch.Tensor + A tensor of shape `(batch_size, k + 1)` containing uniform + random values in the range [0, 1). + """ + if not seeded_seqs: + return torch.rand(batch_size, k + 1, device=device) + + uniform_rand = torch.empty(batch_size, k + 1, device=device) + + non_seeded_indices = [] + for idx in range(batch_size): + generator = seeded_seqs.get(idx) + if generator is None: + non_seeded_indices.append(idx) + else: + uniform_rand[idx, :] = torch.rand(1, + k + 1, + dtype=self.probs_dtype, + device=device, + generator=generator) + if non_seeded_indices: + uniform_rand[non_seeded_indices, :] = torch.rand( + len(non_seeded_indices), + k + 1, + dtype=self.probs_dtype, + device=device) + return uniform_rand + def _get_accepted( self, target_probs: torch.Tensor, # [batch_size, k, vocab_size] @@ -175,29 +294,8 @@ def _get_accepted( selected_target_probs = target_probs[batch_indices, probs_indicies, draft_token_ids] - if not seeded_seqs: - uniform_rand = torch.rand_like(selected_target_probs) - else: - uniform_rand = torch.empty_like(selected_target_probs) - - non_seeded_indices = [] - for idx in range(batch_size): - generator = seeded_seqs.get(idx) - if generator is None: - non_seeded_indices.append(idx) - else: - uniform_rand[idx, :] = torch.rand( - 1, - k, - dtype=self.probs_dtype, - device=target_probs.device, - generator=generator) - if non_seeded_indices: - uniform_rand[non_seeded_indices, :] = torch.rand( - len(non_seeded_indices), - k, - dtype=self.probs_dtype, - device=target_probs.device) + uniform_rand = self._create_uniform_samples(seeded_seqs, batch_size, + k - 1, target_probs.device) capped_ratio = torch.minimum( selected_target_probs / selected_draft_probs, diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 467c43c41550e..f9532dffa92c0 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ 
b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -130,29 +130,35 @@ def _create_output( def _raise_if_incorrect_input( self, - target_probs: torch.Tensor, + target_with_bonus_probs: torch.Tensor, draft_token_ids: torch.Tensor, bonus_token_ids: torch.Tensor, draft_probs: Optional[torch.Tensor] = None, ) -> None: - self._raise_if_incorrect_shape(target_probs, draft_token_ids, - bonus_token_ids, draft_probs) - self._raise_if_incorrect_dtype(target_probs, draft_token_ids, - bonus_token_ids, draft_probs) - self._raise_if_inconsistent_device(target_probs, draft_token_ids, - bonus_token_ids, draft_probs) - self._raise_if_out_of_bounds_vocab(target_probs.shape[-1], + self._raise_if_incorrect_shape(target_with_bonus_probs, + draft_token_ids, bonus_token_ids, + draft_probs) + self._raise_if_incorrect_dtype(target_with_bonus_probs, + draft_token_ids, bonus_token_ids, + draft_probs) + self._raise_if_inconsistent_device(target_with_bonus_probs, + draft_token_ids, bonus_token_ids, + draft_probs) + self._raise_if_out_of_bounds_vocab(target_with_bonus_probs.shape[-1], draft_token_ids, bonus_token_ids) def _raise_if_incorrect_shape( self, - target_probs: torch.Tensor, + target_with_bonus_probs: torch.Tensor, draft_token_ids: torch.Tensor, bonus_token_ids: torch.Tensor, draft_probs: Optional[torch.Tensor] = None, ) -> None: (target_batch_size, num_target_probs, - target_vocab_size) = target_probs.shape + target_vocab_size) = target_with_bonus_probs.shape + + # Does not count the extra token + num_target_probs -= 1 # validate the shape of draft token ids. draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape @@ -175,12 +181,12 @@ def _raise_if_incorrect_shape( def _raise_if_incorrect_dtype( self, - target_probs: torch.Tensor, + target_with_bonus_probs: torch.Tensor, draft_token_ids: torch.Tensor, bonus_token_ids: torch.Tensor, draft_probs: Optional[torch.Tensor] = None, ) -> None: - assert target_probs.dtype == self.probs_dtype + assert target_with_bonus_probs.dtype == self.probs_dtype assert draft_token_ids.dtype == self.token_id_dtype assert bonus_token_ids.dtype == self.token_id_dtype if draft_probs is not None: @@ -188,15 +194,16 @@ def _raise_if_incorrect_dtype( def _raise_if_inconsistent_device( self, - target_probs: torch.Tensor, + target_with_bonus_probs: torch.Tensor, draft_token_ids: torch.Tensor, bonus_token_ids: torch.Tensor, draft_probs: Optional[torch.Tensor] = None, ) -> None: devices = [ - t.device for t in - [target_probs, bonus_token_ids, draft_probs, draft_token_ids] - if t is not None + t.device for t in [ + target_with_bonus_probs, bonus_token_ids, draft_probs, + draft_token_ids + ] if t is not None ] assert all([devices[0] == device for device in devices]) @@ -220,7 +227,7 @@ class SpecDecodeDeterministicBaseSampler(SpecDecodeBaseSampler): @abstractmethod def forward( self, - target_probs: torch.Tensor, + target_with_bonus_probs: torch.Tensor, bonus_token_ids: torch.Tensor, draft_probs: torch.Tensor, draft_token_ids: torch.Tensor, @@ -236,7 +243,7 @@ class SpecDecodeStochasticBaseSampler(SpecDecodeBaseSampler): @abstractmethod def forward( self, - target_probs: torch.Tensor, + target_with_bonus_probs: torch.Tensor, bonus_token_ids: torch.Tensor, draft_probs: torch.Tensor, draft_token_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py index a87ea0eee57de..7428d33ea720d 100644 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ 
b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -41,7 +41,7 @@ def __init__( def forward( self, - target_probs: torch.Tensor, + target_with_bonus_probs: torch.Tensor, bonus_token_ids: torch.Tensor, draft_probs: torch.Tensor, draft_token_ids: torch.Tensor, @@ -80,8 +80,9 @@ def forward( # Only perform shape/dtype/device checking in strict mode, as it adds # overhead. if self._strict_mode: - self._raise_if_incorrect_input(target_probs, draft_token_ids, - bonus_token_ids) + self._raise_if_incorrect_input(target_with_bonus_probs, + draft_token_ids, bonus_token_ids) + target_probs = target_with_bonus_probs[:, :-1] accepted = self._evaluate_accepted_tokens(target_probs, draft_token_ids) recovered_token_ids = self._replacement_token_ids(target_probs) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 78beb2ce44773..91f0a98c7bc38 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -625,8 +625,8 @@ def _verify_tokens( seq_group_metadata_list, proposal_lens_list) original_indices = spec_indices + non_spec_indices - # Get probabilities of target model, excluding bonus token. - proposal_verifier_probs = proposal_scores.probs[spec_indices, :-1] + # Get probabilities of target model, including bonus tokens. + proposal_verifier_probs = proposal_scores.probs[spec_indices] # Get non-speculative sampled tokens from target model. non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] @@ -651,13 +651,12 @@ def _verify_tokens( } accepted_token_ids = self.spec_decode_sampler( - target_probs=proposal_verifier_probs, + target_with_bonus_probs=proposal_verifier_probs, bonus_token_ids=bonus_token_ids, draft_probs=proposal_probs, draft_token_ids=proposal_token_ids, **sampler_extra_kwargs, ) - # Append output tokens from non-speculative sequences to # the accepted token ids tensor. non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + From e2b2aa5a0fdd3e682dd1fbd62e2ba81b8aa054d2 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 1 Sep 2024 23:09:46 -0700 Subject: [PATCH 18/51] [TPU] Align worker index with node boundary (#7932) --- vllm/executor/ray_tpu_executor.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py index 8f867b1d647a5..8c8b5f741488b 100644 --- a/vllm/executor/ray_tpu_executor.py +++ b/vllm/executor/ray_tpu_executor.py @@ -111,12 +111,40 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Else, added to the list of workers. self.workers.append(worker) + logger.debug("workers: %s", self.workers) + logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) if self.driver_dummy_worker is None: raise ValueError( "Ray does not allocate any TPUs on the driver node. Consider " "adjusting the Ray placement group or running the driver on a " "TPU node.") + worker_ips = [ + ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] + for worker in self.workers + ] + ip_counts: Dict[str, int] = {} + for ip in worker_ips: + ip_counts[ip] = ip_counts.get(ip, 0) + 1 + + def sort_by_driver_then_worker_ip(worker): + """ + Sort the workers based on 3 properties: + 1. If the worker is on the same node as the driver (vllm engine), + it should be placed first. + 2. Then, if the worker is on a node with fewer workers, it should + be placed first. + 3. Finally, if the work is on a node with smaller IP address, it + should be placed first. 
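            For example, with hypothetical addresses where the driver is
            on 10.0.0.1 and workers run on 10.0.0.1 (1 worker),
            10.0.0.2 (3 workers) and 10.0.0.3 (2 workers), the sort key
            (is_not_driver_node, workers_on_node, ip) places the
            driver-node worker first, then the two workers on 10.0.0.3,
            then the three on 10.0.0.2.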
+ """ + ip = ray.get(worker.get_node_ip.remote()) + return (ip != driver_ip, ip_counts[ip], ip) + + # After sorting, the workers on the same node will be + # close to each other, and the workers on the driver + # node will be placed first. + self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) + # Get the set of TPU IDs used on each node. worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", use_dummy_driver=True) From 4ca65a97638054ed04b37c2bf3e868d4c1209e9c Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 2 Sep 2024 20:43:26 +0800 Subject: [PATCH 19/51] [Core][Bugfix] Accept GGUF model without .gguf extension (#8056) --- vllm/engine/arg_utils.py | 3 ++- vllm/transformers_utils/config.py | 5 +++-- vllm/transformers_utils/tokenizer.py | 4 ++-- vllm/transformers_utils/utils.py | 16 ++++++++++++++++ 4 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 vllm/transformers_utils/utils.py diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d98f57bc2d353..8dbe6504d21bd 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -16,6 +16,7 @@ from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import FlexibleArgumentParser if TYPE_CHECKING: @@ -753,7 +754,7 @@ def from_cli_args(cls, args: argparse.Namespace): def create_engine_config(self) -> EngineConfig: # gguf file needs a specific model loader and doesn't use hf_repo - if self.model.endswith(".gguf"): + if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" # bitsandbytes quantization needs a specific model loader diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f3ac8d3178d4e..dfe83ddb731d4 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -16,6 +16,7 @@ MedusaConfig, MLPSpeculatorConfig, MPTConfig, NemotronConfig, RWConfig, UltravoxConfig) +from vllm.transformers_utils.utils import check_gguf_file if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -56,7 +57,7 @@ def get_config( ) -> PretrainedConfig: # Separate model folder from file path for GGUF models - is_gguf = Path(model).is_file() and Path(model).suffix == ".gguf" + is_gguf = check_gguf_file(model) if is_gguf: kwargs["gguf_file"] = Path(model).name model = Path(model).parent @@ -112,7 +113,7 @@ def get_hf_image_processor_config( if VLLM_USE_MODELSCOPE: return dict() # Separate model folder from file path for GGUF models - if Path(model).is_file() and Path(model).suffix == ".gguf": + if check_gguf_file(model): model = Path(model).parent return get_image_processor_config(model, revision=revision, **kwargs) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 2866975850db3..f9fb8d1e103b7 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -12,6 +12,7 @@ from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizers import (BaichuanTokenizer, MistralTokenizer) +from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import make_async logger = init_logger(__name__) @@ -96,8 +97,7 @@ def get_tokenizer( kwargs["truncation_side"] = "left" # Separate model folder from file path for GGUF models - is_gguf = Path(tokenizer_name).is_file() and Path( - tokenizer_name).suffix == ".gguf" + 
is_gguf = check_gguf_file(tokenizer_name) if is_gguf: kwargs["gguf_file"] = Path(tokenizer_name).name tokenizer_name = Path(tokenizer_name).parent diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py new file mode 100644 index 0000000000000..7a9041b04fbb9 --- /dev/null +++ b/vllm/transformers_utils/utils.py @@ -0,0 +1,16 @@ +from os import PathLike +from pathlib import Path +from typing import Union + + +def check_gguf_file(model: Union[str, PathLike]) -> bool: + """Check if the file is a GGUF model.""" + model = Path(model) + if not model.is_file(): + return False + elif model.suffix == ".gguf": + return True + + with open(model, "rb") as f: + header = f.read(4) + return header == b"GGUF" From dd2a6a82e3f41b4673b1dbb24b2e99230ea96981 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 2 Sep 2024 23:48:56 +0800 Subject: [PATCH 20/51] [Bugfix] Fix internlm2 tensor parallel inference (#8055) --- vllm/model_executor/models/internlm2.py | 47 ++++++++++++++++++------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 9b7cada187ce1..23669b540f561 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple import torch @@ -7,7 +8,10 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_gather) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -70,20 +74,21 @@ def __init__( ) -> None: super().__init__() self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size + assert self.total_num_heads % self.tp_size == 0 + self.num_heads = self.total_num_heads // self.tp_size self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: + if self.total_num_kv_heads >= self.tp_size: # Number of KV heads is greater than TP size, so we partition # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 + assert self.total_num_kv_heads % self.tp_size == 0 else: # Number of KV heads is less than TP size, so we replicate # the KV heads across multiple tensor parallel GPUs. 
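            # Illustrative example (numbers hypothetical, not from this patch):
            # with total_num_kv_heads = 2 and tp_size = 8, the assert below
            # requires 8 % 2 == 0 and num_kv_heads becomes max(1, 2 // 8) = 1,
            # so each of the 2 KV heads is replicated across 4 ranks.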
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + assert self.tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size) self.head_dim = hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim @@ -122,11 +127,27 @@ def __init__( quant_config=quant_config) def split_qkv(self, qkv: torch.Tensor): - qkv = qkv.view(-1, self.num_kv_heads, self.key_value_groups + 2, 128) - q, k, v = torch.split(qkv, [self.key_value_groups, 1, 1], dim=2) - q = q.reshape(-1, self.q_size) - k = k.reshape(-1, self.kv_size) - v = v.reshape(-1, self.kv_size) + seq_len = qkv.shape[0] + if self.tp_size > 1: + qkv_map = [self.q_size, self.kv_size, self.kv_size] * self.tp_size + qkv = tensor_model_parallel_all_gather(qkv) + qkv = torch.split(qkv, qkv_map, dim=-1) + qkv = qkv[::3] + qkv[1::3] + qkv[2::3] + qkv = torch.cat(qkv, dim=-1) + + qkv = qkv.view(seq_len, self.total_num_kv_heads, + self.key_value_groups + 2, self.head_dim) + q, k, v = torch.split(qkv, [self.key_value_groups, 1, 1], dim=-2) + q = q.reshape(seq_len, self.q_size * self.tp_size) + k = k.reshape(seq_len, self.kv_size * self.tp_size) + v = v.reshape(seq_len, self.kv_size * self.tp_size) + + if self.tp_size > 1: + splitter = partial(split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] return q, k, v def forward( From 6e36f4fa6ce64619b9ea94c88a157f5783a63a65 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 3 Sep 2024 05:20:12 +0800 Subject: [PATCH 21/51] improve chunked prefill performance [Bugfix] Fix #7592 vllm 0.5.4 enable_chunked_prefill throughput is slightly lower than 0.5.3~0.5.0. (#7874) --- tests/basic_correctness/test_chunked_prefill.py | 3 +++ vllm/core/scheduler.py | 15 ++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index fc6f829c37b06..a63ac380e8598 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -116,6 +116,9 @@ def test_models_with_fp8_kv_cache( pytest.skip( "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m" ) + if ((model, kv_cache_dtype, chunked_prefill_token_size) == ( + "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", "fp8_e4m3", 4)): + pytest.skip("flakey test, see: #7874 #8051") max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4c2f715820317..81c78bda3b505 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1027,16 +1027,21 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: # Update waiting requests. self.waiting.extendleft(running_scheduled.preempted) + # Update new running requests. - self.running.extend([s.seq_group for s in prefills.seq_groups]) - self.running.extend( - [s.seq_group for s in running_scheduled.decode_seq_groups]) - self.running.extend( - [s.seq_group for s in running_scheduled.prefill_seq_groups]) + # By default, vLLM scheduler prioritizes prefills. + # Once chunked prefill is enabled, + # the policy is changed to prioritize decode requests. 
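        # Illustrative note (not part of the original diff): after this change
        # the running queue is rebuilt in the order
        #   swapped-in decodes -> swapped-in prefills ->
        #   running decodes -> running prefills -> newly scheduled prefills,
        # so decode requests are appended ahead of any prefill work.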
self.running.extend( [s.seq_group for s in swapped_in.decode_seq_groups]) self.running.extend( [s.seq_group for s in swapped_in.prefill_seq_groups]) + self.running.extend( + [s.seq_group for s in running_scheduled.decode_seq_groups]) + self.running.extend( + [s.seq_group for s in running_scheduled.prefill_seq_groups]) + self.running.extend([s.seq_group for s in prefills.seq_groups]) + # Update swapped requests. self.swapped.extend(running_scheduled.swapped_out) return SchedulerOutputs( From 0fbc6696c28f41009d8493c57e74f5971d6f5026 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 2 Sep 2024 20:35:42 -0700 Subject: [PATCH 22/51] [Bugfix] Fix single output condition in output processor (#7881) --- vllm/engine/output_processor/single_step.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 422e6d30522f5..e288aa0c4aafd 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -113,7 +113,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput, is_async: bool) -> None: sampling_params = seq_group.sampling_params - if sampling_params.n == 1 and not sampling_params.use_beam_search: + if sampling_params.best_of == 1 and not sampling_params.use_beam_search: # only have one output sample sample = outputs.samples[0] # only have one sequence From ec266536b7c4d4d308566ac928a69fcb9ef94462 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 3 Sep 2024 21:37:52 +0800 Subject: [PATCH 23/51] [Bugfix][VLM] Add fallback to SDPA for ViT model running on CPU backend (#8061) --- vllm/model_executor/models/blip.py | 25 ++++++-- vllm/model_executor/models/clip.py | 28 +++++++-- vllm/model_executor/models/intern_vit.py | 79 +++++++++++++++++++++--- vllm/model_executor/models/paligemma.py | 42 +++++++------ vllm/model_executor/models/siglip.py | 27 ++++++-- 5 files changed, 157 insertions(+), 44 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index e6acf8cd5d5bb..583d5d217903b 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -7,7 +7,7 @@ import torch.nn as nn from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig -from xformers import ops as xops +from transformers.models.blip.modeling_blip import BlipAttention from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -21,6 +21,12 @@ repeat_and_pad_placeholder_tokens) from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData +try: + from xformers import ops as xops + USE_XFORMERS_OPS = True +except ImportError: + USE_XFORMERS_OPS = False + def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: assert image_size % patch_size == 0 @@ -156,7 +162,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -class BlipAttention(nn.Module): +class BlipParallelAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( @@ -224,7 +230,7 @@ def forward( out = out.view(bsz, tgt_len, -1) attn_output, _ = self.projection(out) - return attn_output + return attn_output, None class BlipMLP(nn.Module): @@ -261,7 +267,16 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.self_attn = BlipAttention(config, quant_config=quant_config) + # 
fallback to sdpa attention if tp unavailable + num_heads = config.num_attention_heads + tp_size = get_tensor_model_parallel_world_size() + if USE_XFORMERS_OPS and num_heads % tp_size == 0: + self.self_attn = BlipParallelAttention(config, + quant_config=quant_config) + else: + # Blip doesn't have SDPA attention implemented in transformers + # use eager attention instead for cpu backend + self.self_attn = BlipAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = BlipMLP(config, quant_config=quant_config) @@ -272,7 +287,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states = self.self_attn(hidden_states=hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index ddfec91d6cab2..b581a501e3333 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -7,7 +7,7 @@ import torch.nn as nn from PIL import Image from transformers import CLIPVisionConfig -from xformers import ops as xops +from transformers.models.clip.modeling_clip import CLIPSdpaAttention from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -22,6 +22,12 @@ repeat_and_pad_placeholder_tokens) from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData +try: + from xformers import ops as xops + USE_XFORMERS_OPS = True +except ImportError: + USE_XFORMERS_OPS = False + def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: assert image_size % patch_size == 0 @@ -162,7 +168,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -class CLIPAttention(nn.Module): +class CLIPParallelAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( @@ -231,7 +237,7 @@ def forward( out = out.view(bsz, tgt_len, -1) attn_output, _ = self.out_proj(out) - return attn_output + return attn_output, None class CLIPMLP(nn.Module): @@ -266,7 +272,13 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.self_attn = CLIPAttention(config, quant_config=quant_config) + num_heads = config.num_attention_heads + tp_size = get_tensor_model_parallel_world_size() + if USE_XFORMERS_OPS and num_heads % tp_size == 0: + self.self_attn = CLIPParallelAttention(config, + quant_config=quant_config) + else: + self.self_attn = CLIPSdpaAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = CLIPMLP(config, quant_config=quant_config) @@ -278,7 +290,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states = self.self_attn(hidden_states=hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states @@ -365,6 +377,10 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None, num_hidden_layers_override: Optional[int] = None): super().__init__() + tp_size = get_tensor_model_parallel_world_size() + num_heads = config.num_attention_heads + self.shard_weight = USE_XFORMERS_OPS and num_heads % tp_size == 0 + self.vision_model = CLIPVisionTransformer( config=config, 
quant_config=quant_config, @@ -386,7 +402,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ] + ] if self.shard_weight else [] params_dict = dict(self.named_parameters()) layer_count = len(self.vision_model.encoder.layers) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index ad5919150cad8..33b4a3acaa559 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -10,7 +10,6 @@ import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig -from xformers import ops as xops from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn @@ -21,6 +20,12 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader +try: + from xformers import ops as xops + USE_XFORMERS_OPS = True +except ImportError: + USE_XFORMERS_OPS = False + NORM2FN = { 'rms_norm': RMSNorm, 'layer_norm': nn.LayerNorm, @@ -81,7 +86,7 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: return embeddings -class InternAttention(nn.Module): +class InternParallelAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( @@ -140,18 +145,67 @@ def forward(self, x): k = self.k_norm.forward_native(k.flatten(-2, -1)).view(B_, N_, H_, D_) - x = xops.memory_efficient_attention_forward( - q, - k, - v, - scale=self.scale, - ) + x = xops.memory_efficient_attention_forward(q, k, v, scale=self.scale) x = x.view(B, N, -1) x, _ = self.proj(x) return x +class InternSdpaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f'embed_dim must be divisible by num_heads ' + f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' + f' {self.num_heads}).') + + self.scale = self.head_dim**-0.5 + self.qkv = nn.Linear(self.embed_dim, + 3 * self.embed_dim, + bias=config.qkv_bias) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + self.proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x) + q, k, v = qkv.chunk(3, dim=-1) + + q = q.view(B, N, self.num_heads, self.head_dim) + k = k.view(B, N, self.num_heads, self.head_dim) + v = v.view(B, N, self.num_heads, self.head_dim) + + if self.qk_normalization: + B_, N_, H_, D_ = q.shape + q = self.q_norm.forward_native(q.flatten(-2, + -1)).view(B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten(-2, + -1)).view(B_, N_, H_, D_) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + x = x.transpose(1, 2).view(B, N, -1) + + x = self.proj(x) + return x + + class InternMLP(nn.Module): def __init__(self, @@ -187,7 +241,14 @@ def __init__(self, self.intermediate_size = config.intermediate_size self.norm_type = 
config.norm_type - self.attn = InternAttention(config, quant_config=quant_config) + # fallback to sdpa attention if tp unavailable + tp_size = get_tensor_model_parallel_world_size() + num_heads = config.num_attention_heads + if USE_XFORMERS_OPS and num_heads % tp_size == 0: + self.attn = InternParallelAttention(config, + quant_config=quant_config) + else: + self.attn = InternSdpaAttention(config) self.mlp = InternMLP(config, quant_config=quant_config) self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 9b29ff69808a6..b6f4275fbc948 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -307,26 +307,30 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - for (param_name, shard_name, shard_id) in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break + if "vision" not in name or self.vision_tower.shard_weight: + for (param_name, shard_name, + shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # lm_head is not used in vllm as it is tied with + # embed_token. To prevent errors, skip loading + # lm_head.weight. + if "lm_head.weight" in name: + continue + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + use_default_weight_loading = True else: - # lm_head is not used in vllm as it is tied with - # embed_token. To prevent errors, skip loading - # lm_head.weight. - if "lm_head.weight" in name: - continue - # Skip loading extra bias for GPTQ models. 
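# Illustrative sketch (not part of the patch): the backend-selection rule the
# ViT encoders above share. xformers can be missing (e.g. CPU-only installs),
# and the tensor-parallel attention path additionally needs the head count to
# be divisible by the TP size; otherwise the SDPA (or eager, for BLIP)
# fallback is used.
try:
    from xformers import ops as xops  # noqa: F401
    USE_XFORMERS_OPS = True
except ImportError:
    USE_XFORMERS_OPS = False


def select_vit_attention_backend(num_attention_heads: int, tp_size: int) -> str:
    if USE_XFORMERS_OPS and num_attention_heads % tp_size == 0:
        return "parallel"  # sharded attention built on xformers ops
    return "sdpa"  # single-device / CPU fallback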
- if name.endswith(".bias") and name not in params_dict: - continue use_default_weight_loading = True if use_default_weight_loading: diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index e6f95af0ff49f..114dbf09b0c53 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -9,7 +9,7 @@ from PIL import Image from torch import nn from transformers import SiglipVisionConfig -from xformers import ops as xops +from transformers.models.siglip.modeling_siglip import SiglipSdpaAttention from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -26,6 +26,12 @@ repeat_and_pad_placeholder_tokens) from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData +try: + from xformers import ops as xops + USE_XFORMERS_OPS = True +except ImportError: + USE_XFORMERS_OPS = False + def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: # Since interpolation is applied, the image size need not be divisible @@ -219,7 +225,7 @@ def forward(self, return embeddings -class SiglipAttention(nn.Module): +class SiglipParallelAttention(nn.Module): def __init__( self, @@ -282,7 +288,7 @@ def forward( out = out.view(batch_size, q_len, -1) attn_output, _ = self.out_proj(out) - return attn_output + return attn_output, None class SiglipMLP(nn.Module): @@ -327,7 +333,14 @@ def __init__( super().__init__() self.embed_dim = config.hidden_size - self.self_attn = SiglipAttention(config, quant_config=quant_config) + num_heads = config.num_attention_heads + tp_size = get_tensor_model_parallel_world_size() + if USE_XFORMERS_OPS and num_heads % tp_size == 0: + self.self_attn = SiglipParallelAttention(config, + quant_config=quant_config) + else: + self.self_attn = SiglipSdpaAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( @@ -344,7 +357,7 @@ def forward( residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states = self.self_attn(hidden_states=hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states @@ -476,6 +489,10 @@ def __init__( num_hidden_layers_override: Optional[int] = None, ): super().__init__() + num_heads = config.num_attention_heads + tp_size = get_tensor_model_parallel_world_size() + self.shard_weight = USE_XFORMERS_OPS and num_heads % tp_size == 0 + self.vision_model = SiglipVisionTransformer( config, quant_config, From bd852f2a8b9e9129de69fa7349906a9115538d5a Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Sep 2024 10:49:18 -0700 Subject: [PATCH 24/51] [Performance] Enable chunked prefill and prefix caching together (#8120) Co-authored-by: Tao He Co-authored-by: Juelianqvq From 95a178f86120f42d183b3af5ee1ce58ee05c8889 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Tue, 3 Sep 2024 11:32:27 -0700 Subject: [PATCH 25/51] [CI] Only PR reviewers/committers can trigger CI on PR (#8124) Signed-off-by: kevin --- .github/workflows/add_label_ready_comment.yml | 23 ------------------- .github/workflows/reminder_comment.yml | 2 +- .../remove_label_not_ready_comment.yml | 23 ------------------- 3 files changed, 1 insertion(+), 47 deletions(-) delete mode 100644 .github/workflows/add_label_ready_comment.yml delete mode 100644 .github/workflows/remove_label_not_ready_comment.yml diff --git a/.github/workflows/add_label_ready_comment.yml b/.github/workflows/add_label_ready_comment.yml deleted file mode 100644 index 729c1452af03d..0000000000000 --- a/.github/workflows/add_label_ready_comment.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Add Ready Label on Ready Comment - -on: - issue_comment: - types: [created] - -jobs: - add-ready-label: - runs-on: ubuntu-latest - if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready') - steps: - - name: Add label - uses: actions/github-script@v5 - with: - script: | - github.rest.issues.addLabels({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - labels: ['ready'] - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 390c88bb65308..15c35f8d442f5 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -15,7 +15,7 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you don't have permission to unblock, ping @simon-mo or @khluu to add you in our Buildkite org. 
\n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' }) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/remove_label_not_ready_comment.yml b/.github/workflows/remove_label_not_ready_comment.yml deleted file mode 100644 index d1da7726eaee3..0000000000000 --- a/.github/workflows/remove_label_not_ready_comment.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Remove ready Label on notready Comment - -on: - issue_comment: - types: [created] - -jobs: - add-ready-label: - runs-on: ubuntu-latest - if: github.event.issue.pull_request && contains(github.event.comment.body, '/notready') - steps: - - name: Remove ready label - uses: actions/github-script@v5 - with: - script: | - github.rest.issues.removeLabel({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - name: 'ready' - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 6d646d08a2e0e73e83e313a5ae470c1f9e4f200e Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:50:29 -0400 Subject: [PATCH 26/51] [Core] Optimize Async + Multi-step (#8050) --- .../multi_step/test_correctness_async_llm.py | 4 +- vllm/engine/async_llm_engine.py | 109 +++++---- vllm/engine/llm_engine.py | 222 ++++++++---------- vllm/engine/output_processor/multi_step.py | 62 +++-- vllm/sequence.py | 4 +- vllm/worker/model_runner.py | 4 +- vllm/worker/multi_step_model_runner.py | 165 ++++++++++--- vllm/worker/multi_step_worker.py | 4 +- 8 files changed, 326 insertions(+), 248 deletions(-) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index d054ca341694a..0cbe8371e235a 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -103,13 +103,13 @@ async def test_multi_step( model, server_args + distributed_args, num_logprobs, - max_wait_seconds=3 * 240) + max_wait_seconds=5 * 240) test_completions = await completions_with_server_args( prompts, model, ms_server_args + distributed_args, num_logprobs, - max_wait_seconds=3 * 240) + max_wait_seconds=5 * 240) # Assert multi-step scheduling produces identical tokens # to single-step scheduling. diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 159281dabde4a..7fe8053fffb7b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -280,40 +280,27 @@ async def step_async( scheduler_outputs = cached_outputs.scheduler_outputs allow_async_output_proc = cached_outputs.allow_async_output_proc - # Detect async + multi-step - use_async_and_multi_step = (self.scheduler_config.is_multi_step - and allow_async_output_proc) - ctx = self.scheduler_contexts[virtual_engine] + # Clear outputs for each new scheduler iteration + ctx.request_outputs.clear() + # skip the scheduler if there are any remaining steps in the seq groups. # This ensures that the scheduler is only called again when the current # batch has completed. 
if not self._has_remaining_steps(seq_group_metadata_list): - # Clear outputs on scheduler iteration start - ctx.request_outputs.clear() - # Schedule iteration (seq_group_metadata_list, scheduler_outputs, allow_async_output_proc ) = self.scheduler[virtual_engine].schedule() - # Detect async + multi-step - use_async_and_multi_step = (self.scheduler_config.is_multi_step - and allow_async_output_proc) + ctx.seq_group_metadata_list = seq_group_metadata_list + ctx.scheduler_outputs = scheduler_outputs # Maybe switch from async mode to sync mode if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(virtual_engine=virtual_engine, - is_async=True) - - # For async + multi-step, init the queue - if use_async_and_multi_step: - assert len(ctx.output_queue) == 0 - assert seq_group_metadata_list is not None - ctx.output_queue.append( - (None, seq_group_metadata_list, scheduler_outputs)) + self._process_model_outputs(ctx=ctx) if (self.scheduler_config.is_multi_step and scheduler_outputs.num_lookahead_slots > 0): @@ -351,26 +338,20 @@ async def step_async( last_sampled_token_ids=last_sampled_token_ids) if allow_async_output_proc: - async_callback = self.async_callback_multi_step[ - virtual_engine] if use_async_and_multi_step \ - else self.async_callback[virtual_engine] - - execute_model_req.async_callback = async_callback - execute_model_req.use_async_and_multi_step = \ - use_async_and_multi_step + execute_model_req.async_callback = self.async_callbacks[ + virtual_engine] # Execute the model. output = await self.model_executor.execute_model_async( execute_model_req) + # we need to do this here so that last step's sampled_token_ids can # be passed to the next iteration for PP. if self.scheduler_config.is_multi_step: self._update_cached_scheduler_output(virtual_engine, output) else: - if not use_async_and_multi_step and len(ctx.output_queue) > 0: - assert not self.scheduler_config.is_multi_step - self._process_model_outputs(virtual_engine=virtual_engine, - is_async=True) + if len(ctx.output_queue) > 0: + self._process_model_outputs(ctx=ctx) output = [] # Finish the current step for all the sequence groups. @@ -384,24 +365,22 @@ async def step_async( self.cached_scheduler_outputs[ virtual_engine] = SchedulerOutputState() - if use_async_and_multi_step: - # For async + multi-step, clear the queue - ctx.output_queue.clear() - else: - ctx.output_queue.append( - (output, seq_group_metadata_list, scheduler_outputs)) + is_async = allow_async_output_proc + is_last_step = True + ctx.output_queue.append( + (output, seq_group_metadata_list, scheduler_outputs, is_async, + is_last_step)) - if output and allow_async_output_proc: - assert len( - output - ) == 1, "Multi step decoding does not work with async output processing." # noqa: E501 - self._advance_to_next_step( - output[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) + if output and allow_async_output_proc: + assert len( + output + ) == 1, "Async postprocessor expects only a single output set" + self._advance_to_next_step( + output[0], seq_group_metadata_list, + scheduler_outputs.scheduled_seq_groups) if not allow_async_output_proc: - self._process_model_outputs(virtual_engine=virtual_engine, - is_async=False) + self._process_model_outputs(ctx=ctx) # Log stats. 
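        # Illustrative note (not part of the diff): each entry pushed to
        # ctx.output_queue is the 5-tuple (outputs, seq_group_metadata_list,
        # scheduler_outputs, is_async, is_last_step).
        # _process_model_outputs(ctx=ctx) pops exactly one entry per call and,
        # for multi-step runs, only materializes RequestOutputs on the entry
        # whose is_last_step is True.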
self.do_log_stats(scheduler_outputs, output) @@ -411,17 +390,12 @@ async def step_async( else: # Multi-step case - if use_async_and_multi_step: - return [] - else: - ctx.request_outputs = [] + return ctx.request_outputs if not self.has_unfinished_requests(): # Drain async postprocessor (if exists) if len(ctx.output_queue) > 0: - assert not self.scheduler_config.is_multi_step - self._process_model_outputs(virtual_engine=virtual_engine, - is_async=True) + self._process_model_outputs(ctx=ctx) assert len(ctx.output_queue) == 0 return ctx.request_outputs @@ -640,6 +614,17 @@ def __init__(self, self.log_requests = log_requests self.engine = self._init_engine(*args, **kwargs) + # This ensures quick processing of request outputs + # so the append to asyncio queues is not delayed, + # especially for multi-step. + # + # TODO: Currently, disabled for engine_use_ray, ask + # Cody/Will/Woosuk about this case. + self.use_process_request_outputs_callback = not self.engine_use_ray + if self.use_process_request_outputs_callback: + self.engine.process_request_outputs_callback = \ + self.process_request_outputs + if self.engine_use_ray: print_warning_once( "DEPRECATED. `--engine-use-ray` is deprecated and will " @@ -883,13 +868,27 @@ async def engine_step(self, virtual_engine: int) -> bool: request_outputs = await self.engine.step_async(virtual_engine) # Put the outputs into the corresponding streams. - finished = True + # If used as a callback, then already invoked inside + # LLMEngine's _process_model_outputs + if not self.use_process_request_outputs_callback: + all_finished = self.process_request_outputs(request_outputs) + else: + # For callback case, we only need to detect when all + # requests are finished + all_finished = all(request_output.finished + for request_output in request_outputs) + + return not all_finished + + def process_request_outputs(self, request_outputs) -> bool: + # Put the outputs into the corresponding streams. + all_finished = True for request_output in request_outputs: self._request_tracker.process_request_output( request_output, verbose=self.log_requests) - finished = finished and request_output.finished + all_finished = all_finished and request_output.finished - return not finished + return all_finished async def _engine_abort(self, request_ids: Iterable[str]): if self.engine_use_ray: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1eab83f3b9889..8c5ca81fb1905 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -93,13 +93,14 @@ class SchedulerOutputState: @dataclass class SchedulerContext: output_queue: Deque[Tuple[Optional[List[SamplerOutput]], - List[SequenceGroupMetadata], - SchedulerOutputs]] = field( - default_factory=lambda: deque()) - + List[SequenceGroupMetadata], SchedulerOutputs, + bool, + bool]] = field(default_factory=lambda: deque()) request_outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = field( default_factory=lambda: []) + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None + scheduler_outputs: Optional[SchedulerOutputs] = None class LLMEngine: @@ -357,6 +358,26 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: # different process. 
self.tokenizer.ping() + self.cached_scheduler_outputs = [ + SchedulerOutputState() + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + + self.scheduler_contexts = [ + SchedulerContext() + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + + self.async_callbacks = [ + functools.partial(self._process_model_outputs, + ctx=self.scheduler_contexts[v_id]) + for v_id in range(self.parallel_config.pipeline_parallel_size) + ] + + # Currently used by AsyncLLMEngine to ensure quick append + # of request outputs to asyncio queues + self.process_request_outputs_callback = None + # Create the scheduler. # NOTE: the cache_config here have been updated with the numbers of # GPU and CPU blocks, which are profiled in the distributed executor. @@ -364,9 +385,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: Scheduler( scheduler_config, cache_config, lora_config, parallel_config.pipeline_parallel_size, - functools.partial(self._process_model_outputs, - virtual_engine=v_id, - is_async=True) + self.async_callbacks[v_id] if model_config.use_async_output_proc else None) for v_id in range(parallel_config.pipeline_parallel_size) ] @@ -417,30 +436,6 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: ), )) - self.cached_scheduler_outputs = [ - SchedulerOutputState() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - self.scheduler_contexts = [ - SchedulerContext() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - self.async_callback = [ - functools.partial(self._process_model_outputs, - virtual_engine=v_id, - is_async=True) - for v_id in range(self.parallel_config.pipeline_parallel_size) - ] - - self.async_callback_multi_step = [ - functools.partial(self._process_model_outputs, - virtual_engine=v_id, - is_async=False) - for v_id in range(self.parallel_config.pipeline_parallel_size) - ] - def _initialize_kv_caches(self) -> None: """Initialize the KV cache in the worker(s). @@ -1249,11 +1244,7 @@ def _process_sequence_group_outputs( return - def _process_model_outputs(self, - virtual_engine: int, - is_async: bool, - sampler_output: Optional[SamplerOutput] = None, - is_last_output: bool = False) -> None: + def _process_model_outputs(self, ctx: SchedulerContext) -> None: """Apply the model output to the sequences in the scheduled seq groups. 
virtual_engine: The engine id to operate on @@ -1273,24 +1264,12 @@ def _process_model_outputs(self, """ now = time.time() - is_multi_step = sampler_output is not None - - ctx: SchedulerContext = self.scheduler_contexts[virtual_engine] - if len(ctx.output_queue) == 0: return None - if is_multi_step: - # Async + multi-step case - (outputs, seq_group_metadata_list, - scheduler_outputs) = ctx.output_queue[0] - assert outputs is None - outputs = [sampler_output] - else: - # Async standard case - (outputs, seq_group_metadata_list, - scheduler_outputs) = ctx.output_queue.popleft() - + # Get pending async postprocessor + (outputs, seq_group_metadata_list, scheduler_outputs, is_async, + is_last_step) = ctx.output_queue.popleft() assert outputs is not None # Sanity check @@ -1306,6 +1285,7 @@ def _process_model_outputs(self, outputs_by_sequence_group = outputs finished_before: List[int] = [] + finished_now: List[int] = [] for i, seq_group_meta in enumerate(seq_group_metadata_list): scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] @@ -1343,26 +1323,44 @@ def _process_model_outputs(self, if self.model_config.embedding_mode: self._process_sequence_group_outputs(seq_group, output) - continue + else: + self.output_processor.process_prompt_logprob(seq_group, output) + if seq_group_meta.do_sample: + self.output_processor.process_outputs( + seq_group, output, is_async) - self.output_processor.process_prompt_logprob(seq_group, output) - if seq_group_meta.do_sample: - self.output_processor.process_outputs(seq_group, output, - is_async) + if seq_group.is_finished(): + finished_now.append(i) - # For async + multi-step, free finished seqs and create outputs - # only on the final step. - if is_multi_step and not is_last_output: - return + # Generate outputs for the requests that finished this iteration + for i in finished_now: + scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - for scheduler in self.scheduler: - scheduler.free_finished_seq_groups() + seq_group = scheduled_seq_group.seq_group + seq_group.maybe_set_first_token_time(now) + request_output = RequestOutputFactory.create(seq_group) + ctx.request_outputs.append(request_output) - # Create the outputs. - for i, _ in enumerate(seq_group_metadata_list): - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] + # Free currently finished requests + if finished_now: + for scheduler in self.scheduler: + scheduler.free_finished_seq_groups() + + # For multi-step, do not create outputs each iteration + if not is_last_step: + # Immediately process request outputs here (if callback is given) + if (finished_now + and self.process_request_outputs_callback is not None): + self.process_request_outputs_callback(ctx.request_outputs) + return + + # Create the outputs + # Note: scheduled_seq_groups and seq_group_metadata_list + # must match with the indices + for i, scheduled_seq_group in enumerate( + scheduler_outputs.scheduled_seq_groups): - if not is_multi_step and i in finished_before: + if i in finished_before or i in finished_now: continue # Avoids double processing seq_group = scheduled_seq_group.seq_group @@ -1376,11 +1374,15 @@ def _process_model_outputs(self, request_output = RequestOutputFactory.create(seq_group) ctx.request_outputs.append(request_output) - # For async + multi-step, do stats only on the last output. 
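        # Illustrative note (not part of the diff): requests that finish on an
        # intermediate multi-step iteration get their RequestOutput built here
        # and handed to process_request_outputs_callback immediately, so
        # AsyncLLMEngine can push them to their asyncio streams without waiting
        # for the remaining steps of the batch.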
- # Otherwise, do stats if the execution is async - do_stats = is_multi_step or is_async + # Immediately process request outputs here (if callback is given) + if (ctx.request_outputs + and self.process_request_outputs_callback is not None): + self.process_request_outputs_callback(ctx.request_outputs) - if do_stats: + # For async case, we need to record the stats here. + # For non-async case, the stats are done in the + # LLMEngine/AsyncLLMEngine directly + if is_async: # Log stats. self.do_log_stats(scheduler_outputs, outputs, finished_before) @@ -1485,40 +1487,26 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: scheduler_outputs = cached_outputs.scheduler_outputs allow_async_output_proc = cached_outputs.allow_async_output_proc - # Detect async + multi-step - use_async_and_multi_step = (self.scheduler_config.is_multi_step - and allow_async_output_proc) - ctx = self.scheduler_contexts[virtual_engine] + # Clear outputs for each new scheduler iteration + ctx.request_outputs.clear() + # Skip the scheduler if there are any remaining steps in the seq groups. # This ensures that the scheduler is only called again when the current # batch has completed. if not self._has_remaining_steps(seq_group_metadata_list): - - # Clear outputs on scheduler iteration start - ctx.request_outputs.clear() - # Schedule iteration (seq_group_metadata_list, scheduler_outputs, allow_async_output_proc ) = self.scheduler[virtual_engine].schedule() - # Detect async + multi-step - use_async_and_multi_step = (self.scheduler_config.is_multi_step - and allow_async_output_proc) + ctx.seq_group_metadata_list = seq_group_metadata_list + ctx.scheduler_outputs = scheduler_outputs # Maybe switch from async mode to sync mode if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(virtual_engine=virtual_engine, - is_async=True) - - # For async + multi-step, init the queue - if use_async_and_multi_step: - assert len(ctx.output_queue) == 0 - assert seq_group_metadata_list is not None - ctx.output_queue.append( - (None, seq_group_metadata_list, scheduler_outputs)) + self._process_model_outputs(ctx=ctx) if (self.scheduler_config.is_multi_step and scheduler_outputs.num_lookahead_slots > 0): @@ -1555,13 +1543,8 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: last_sampled_token_ids=last_sampled_token_ids) if allow_async_output_proc: - async_callback = self.async_callback_multi_step[ - virtual_engine] if use_async_and_multi_step \ - else self.async_callback[virtual_engine] - - execute_model_req.async_callback = async_callback - execute_model_req.use_async_and_multi_step = \ - use_async_and_multi_step + execute_model_req.async_callback = self.async_callbacks[ + virtual_engine] output = self.model_executor.execute_model( execute_model_req=execute_model_req) @@ -1573,10 +1556,8 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: else: # Nothing scheduled => If there is pending async postprocessor, # then finish it here. 
- if not use_async_and_multi_step and len(ctx.output_queue) > 0: - assert not self.scheduler_config.is_multi_step - self._process_model_outputs(virtual_engine=virtual_engine, - is_async=True) + if len(ctx.output_queue) > 0: + self._process_model_outputs(ctx=ctx) # No outputs in this case output = [] @@ -1590,28 +1571,24 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: if self.scheduler_config.is_multi_step: self.cached_scheduler_outputs[0] = SchedulerOutputState() - if use_async_and_multi_step: - # For async + multi-step, clear the queue - ctx.output_queue.clear() - else: - # Add results to the output_queue - # (for async or non-async postprocessing) - ctx.output_queue.append( - (output, seq_group_metadata_list, scheduler_outputs)) + # Add results to the output_queue + is_async = allow_async_output_proc + is_last_step = True + ctx.output_queue.append( + (output, seq_group_metadata_list, scheduler_outputs, is_async, + is_last_step)) - if output and allow_async_output_proc: - assert len(output) == 1, ( - "Multi step decoding does not work " - "with async output processing.") + if output and allow_async_output_proc: + assert len(output) == 1, ( + "Async postprocessor expects only a single output set") - self._advance_to_next_step( - output[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) + self._advance_to_next_step( + output[0], seq_group_metadata_list, + scheduler_outputs.scheduled_seq_groups) # Check if need to run the usual non-async path if not allow_async_output_proc: - self._process_model_outputs(virtual_engine=virtual_engine, - is_async=False) + self._process_model_outputs(ctx=ctx) # Log stats. self.do_log_stats(scheduler_outputs, output) @@ -1620,17 +1597,12 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: self.do_tracing(scheduler_outputs) else: # Multi-step case - if use_async_and_multi_step: - return [] - else: - ctx.request_outputs = [] + return ctx.request_outputs if not self.has_unfinished_requests(): # Drain async postprocessor (if exists) if len(ctx.output_queue) > 0: - assert not self.scheduler_config.is_multi_step - self._process_model_outputs(virtual_engine=virtual_engine, - is_async=True) + self._process_model_outputs(ctx=ctx) assert len(ctx.output_queue) == 0 # Stop the execute model loop in parallel workers until there are diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index e182cee8ba18e..c73db765fc3b5 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -85,9 +85,6 @@ def process_outputs(self, no tokens need to be appended since it is already done externally (before the next schedule() call) """ - # TODO: Add support for async if necessary - assert not is_async - # Sequences can be in RUNNING or FINISHED_ABORTED state # once scheduled, as a sequence is moved to FINSIHED_ABORTED # if a client disconnects from the api server. @@ -101,19 +98,41 @@ def process_outputs(self, "Beam search not supported in multi-step decoding.") seq = seqs[0] - # Since there's only one sequence per sequence group, we can take the - # first sample. - samples = [output.samples[0] for output in outputs] - - # -1 means the output token is not valid (eg. due to spec decode - # rejecting tokens). 
- valid_samples = [ - sample for sample in samples if sample.output_token != -1 - ] - assert valid_samples - - self._process_seq_outputs(seq, valid_samples, - sequence_group.sampling_params) + if is_async: + # Async case: We process tokens one by one. Here, we know the token + # was already appended, so we only need to do the rest of the + # postprocessor: Detokenization + stopping logic + self._process_decode_and_stop(seq, sequence_group.sampling_params) + else: + # Standard multi-step case + + # Since there's only one sequence per sequence group, + # we can take the first sample. + samples = [output.samples[0] for output in outputs] + + # -1 means the output token is not valid (eg. due to spec decode + # rejecting tokens). + valid_samples = [ + sample for sample in samples if sample.output_token != -1 + ] + assert valid_samples + + self._process_seq_outputs(seq, valid_samples, + sequence_group.sampling_params) + + def _process_decode_and_stop(self, seq: Sequence, + sampling_params: SamplingParams) -> None: + new_char_count = 0 + if sampling_params.detokenize: + new_char_count = self.detokenizer.decode_sequence_inplace( + seq, sampling_params) + + # TODO(sang): Support lora. + self.stop_checker.maybe_stop_sequence( + seq, + new_char_count=new_char_count, + sampling_params=sampling_params, + ) def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput], @@ -151,16 +170,7 @@ def _process_seq_outputs(self, seq: Sequence, logprobs=output_logprob, ) - new_char_count = 0 - if sampling_params.detokenize: - new_char_count = self.detokenizer.decode_sequence_inplace( - seq, sampling_params) + self._process_decode_and_stop(seq, sampling_params) - # TODO(sang): Support lora. - self.stop_checker.maybe_stop_sequence( - seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) if seq.is_finished(): break diff --git a/vllm/sequence.py b/vllm/sequence.py index 87b3d21fa7ae3..a5ebf152ce776 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1225,7 +1225,6 @@ class ExecuteModelRequest( last_sampled_token_ids: Optional[torch.Tensor] = None # Async callback async_callback: Optional[Callable] = None - use_async_and_multi_step: bool = False @property def is_first_multi_step(self) -> bool: @@ -1272,5 +1271,4 @@ def clone( finished_requests_ids=self.finished_requests_ids, last_sampled_token_ids=self.last_sampled_token_ids.clone() if self.last_sampled_token_ids is not None else None, - async_callback=self.async_callback, - use_async_and_multi_step=self.use_async_and_multi_step) + async_callback=self.async_callback) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8a3c99a45b149..74f7d4e0860d3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -21,6 +21,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) +from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import get_pp_group from vllm.distributed.parallel_state import graph_capture from vllm.inputs import INPUT_REGISTRY, InputRegistry @@ -96,7 +97,8 @@ class ModelInputForGPU(ModelRunnerInputBase): finished_requests_ids: Optional[List[str]] = None virtual_engine: int = 0 async_callback: Optional[Callable] = None - use_async_and_multi_step: bool = False + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None + scheduler_outputs: Optional[SchedulerOutputs] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: 
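The multi-step output-processor change above reduces to two paths: in the async case the engine has already appended the sampled token, so only detokenization and stop checking remain, while the sync case still appends each valid sample itself. A toy sketch of that split (stand-in types, not vLLM's real Sequence/SamplingParams):

from dataclasses import dataclass, field
from typing import List


@dataclass
class ToySeq:
    token_ids: List[int] = field(default_factory=list)
    text: str = ""
    finished: bool = False


def decode_and_stop(seq: ToySeq, max_tokens: int) -> None:
    # Stand-in for detokenization + stop checking.
    seq.text = " ".join(str(t) for t in seq.token_ids)
    seq.finished = len(seq.token_ids) >= max_tokens


def process_outputs(seq: ToySeq, sampled: List[int], is_async: bool,
                    max_tokens: int = 4) -> None:
    if is_async:
        # Async path: the token was already appended upstream
        # (engine-side _advance_to_next_step); only finish postprocessing.
        decode_and_stop(seq, max_tokens)
    else:
        # Sync multi-step path: drop rejected tokens (-1), then append and
        # postprocess token by token, stopping early once finished.
        for tok in (t for t in sampled if t != -1):
            seq.token_ids.append(tok)
            decode_and_stop(seq, max_tokens)
            if seq.finished:
                break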
tensor_dict = { diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index be0c75bc00dbd..b52f2a07e344e 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -22,6 +22,7 @@ get_pythonized_sample_results) from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, Logprob, SequenceGroupMetadata, SequenceOutput) +from vllm.utils import PyObjectCache from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( @@ -37,6 +38,29 @@ logger = init_logger(__name__) +def seq_output_builder(): + return SequenceOutput( + 0, 0, + {0: Logprob(logprob=float('inf'), rank=None, decoded_token=None)}) + + +def completion_seq_group_output_builder(): + return CompletionSequenceGroupOutput([], None) + + +# Used by pythonization to reduce python object allocations +class PythonizationCache: + + def __init__(self): + self.cached_seq_output = PyObjectCache(seq_output_builder) + self.cached_completion_seq_group_output = PyObjectCache( + completion_seq_group_output_builder) + + def reset(self): + self.cached_seq_output.reset() + self.cached_completion_seq_group_output.reset() + + @dataclass class ModelOutput: """The output of a single model forward pass. @@ -59,6 +83,7 @@ class ModelOutput: pythonized: bool = False # On-device tensor containing the logprobs of each token. logprobs: Optional["torch.Tensor"] = None + pythonization_cache: Optional[PythonizationCache] = None def pythonize(self, input_metadata: "StatefulModelInput", copy_stream: torch.cuda.Stream, @@ -97,7 +122,8 @@ def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", with torch.cuda.stream(copy_stream): _pythonize_sampler_output(input_metadata, self.sampler_output, pinned_sampled_token_buffer, - self.sampled_token_ids, self.logprobs) + self.sampled_token_ids, self.logprobs, + self.pythonization_cache) # Erase the logprobs GPU-side tensor. # Note that although _pythonize_sampler_output() runs in its @@ -209,6 +235,8 @@ def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): self._copy_stream = torch.cuda.Stream() self.pinned_sampled_token_ids: Optional[torch.Tensor] = None + self.pythonization_cache = PythonizationCache() + def make_model_input_from_broadcasted_tensor_dict( self, tensor_dict: Dict[str, Any]) -> StatefulModelInput: model_input = (StatefulModelInput.from_broadcasted_tensor_dict( @@ -237,14 +265,22 @@ def _async_process_outputs(self, model_input: StatefulModelInput, output_proc_callback: Callable): # Proceed with pythonization and output_proc in order. 
# Stop on the first one that fails to pythonize + output_proc_callback() + cont = True for model_output in model_input.cached_outputs: if not model_output.pythonized: model_output.maybe_pythonize(model_input, self._copy_stream, self.pinned_sampled_token_ids) if model_output.pythonized: - output_proc_callback( - sampler_output=model_output.sampler_output) + ctx = output_proc_callback.keywords["ctx"] + is_async = False + is_last_step = False + ctx.output_queue.append( + ([model_output.sampler_output + ], ctx.seq_group_metadata_list, + ctx.scheduler_outputs, is_async, is_last_step)) + output_proc_callback() else: cont = False @@ -255,21 +291,46 @@ def _final_process_outputs(self, model_input: StatefulModelInput, output_proc_callback: Optional[Callable]): assert model_input.frozen_model_input is not None + has_async_callback = output_proc_callback is not None + outputs = [] for output_id in range(len(model_input.cached_outputs)): - is_last_output = output_id == len(model_input.cached_outputs) - 1 - output = model_input.cached_outputs[output_id] - if not output.pythonized: + is_last_step = output_id == len(model_input.cached_outputs) - 1 + + # For non-async case: + # -- We simply add the outputs + # For async case: + # -- Invoke callback, pythonize, add to callback queue and repeat + # -- For last output, just add to callback queue + if has_async_callback: + assert output_proc_callback is not None + + # Invoke callback before pythonize (to overlap with GPU) + output_proc_callback() + + # Pythonize + if not output.pythonized: + output.pythonize(model_input, self._copy_stream, + self.pinned_sampled_token_ids) + + # For non last step, add to callback queue to chain + # callbacks=>pythonize pairs (for GPU overlap) + if not is_last_step: + ctx = output_proc_callback.keywords[ # type: ignore + "ctx"] # type: ignore + is_async = False + is_last_step = False + ctx.output_queue.append( + ([output.sampler_output + ], ctx.seq_group_metadata_list, + ctx.scheduler_outputs, is_async, is_last_step)) + else: + outputs.append(output.sampler_output) + else: output.pythonize(model_input, self._copy_stream, self.pinned_sampled_token_ids) - - if model_input.frozen_model_input.use_async_and_multi_step: - assert output_proc_callback is not None - output_proc_callback(sampler_output=output.sampler_output, - is_last_output=is_last_output) - - outputs.append(output.sampler_output) + outputs.append(output.sampler_output) return outputs @@ -330,7 +391,7 @@ def execute_model( model_input, model_input.cached_outputs[-1].sampler_output) output_proc_callback = None - if frozen_model_input.use_async_and_multi_step: + if frozen_model_input.async_callback is not None: output_proc_callback = frozen_model_input.async_callback assert output_proc_callback is not None async_callback = functools.partial( @@ -367,7 +428,7 @@ def execute_model( model_input.cached_outputs.append( ModelOutput(output[0], output_ready_event, output[0].sampled_token_ids, False, - output[0].logprobs)) + output[0].logprobs, self.pythonization_cache)) # These GPU tensors are not required by multi-step; # erase them to ensure they are not pythonized or @@ -378,7 +439,7 @@ def execute_model( # Pythonize the output if CPU is ahead and the previous step is # ready. 
- if not frozen_model_input.use_async_and_multi_step: + if frozen_model_input.async_callback is None: for model_output in model_input.cached_outputs: model_output.maybe_pythonize(model_input, self._copy_stream, @@ -397,6 +458,7 @@ def execute_model( if model_input.is_last_step: outputs = self._final_process_outputs(model_input, output_proc_callback) + self.pythonization_cache.reset() return outputs # should be [SamplerOutput] @@ -537,6 +599,7 @@ def _pythonize_sampler_output( pinned_sampled_token_buffer: torch.Tensor, sampled_token_ids: torch.Tensor, logprobs_tensor: Optional[torch.Tensor], + cache: Optional[PythonizationCache], ) -> None: """ This function is only called when the output tensors are ready. See :class:`ModelOutput`. @@ -597,6 +660,9 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): + if seq_group.sampling_params.logits_processors: + assert len(seq_group.sampling_params.logits_processors) == 0, ( + "Logits Processors are not supported in multi-step decoding") if do_pythonize_logprobs: assert prompt_logprobs is not None @@ -621,23 +687,56 @@ def _pythonize_sampler_output( seq_ids = seq_group.seq_ids next_token_ids = sample_result parent_ids = [0] - seq_outputs: List[SequenceOutput] = [] - if seq_group.sampling_params.logits_processors: - assert len(seq_group.sampling_params.logits_processors) == 0, ( - "Logits Processors are not supported in multi-step decoding") + + if cache is not None: + completion_seq_group_output: CompletionSequenceGroupOutput = \ + cache.cached_completion_seq_group_output.get_object() + completion_seq_group_output.samples.clear() + seq_outputs: List[ + SequenceOutput] = completion_seq_group_output.samples + else: + seq_outputs = [] + for tdx, (parent_id, next_token_id) in enumerate(zip(parent_ids, next_token_ids)): - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - (group_sample_logprobs[tdx] - if logprobs_are_requested else { - next_token_id: - Logprob(logprob=float('inf'), - rank=None, - decoded_token=None) - }))) - output.outputs.append( - CompletionSequenceGroupOutput( - seq_outputs, - (group_prompt_logprobs if logprobs_are_requested else None))) + if cache is not None: + seq_output: SequenceOutput = cache.cached_seq_output.get_object( + ) + seq_output.parent_seq_id = seq_ids[parent_id] + seq_output.output_token = next_token_id + + if logprobs_are_requested: + seq_output.logprobs = group_sample_logprobs[tdx] + else: + logprobs = next(iter(seq_output.logprobs.values())) + seq_output.logprobs.clear() + + logprobs.logprob = float('inf') + logprobs.rank = None + logprobs.decoded_token = None + + seq_output.logprobs[next_token_id] = logprobs + + seq_outputs.append(seq_output) + + else: + seq_outputs.append( + SequenceOutput(seq_ids[parent_id], next_token_id, + (group_sample_logprobs[tdx] + if logprobs_are_requested else { + next_token_id: + Logprob(logprob=float('inf'), + rank=None, + decoded_token=None) + }))) + if cache is not None: + completion_seq_group_output.prompt_logprobs = \ + group_prompt_logprobs if logprobs_are_requested else None + output.outputs.append(completion_seq_group_output) + else: + output.outputs.append( + CompletionSequenceGroupOutput( + seq_outputs, (group_prompt_logprobs + if logprobs_are_requested else None))) + assert len(output.outputs) > 0 diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index 517b0ab78c460..562285f828cc7 100644 --- a/vllm/worker/multi_step_worker.py +++ 
b/vllm/worker/multi_step_worker.py @@ -67,9 +67,7 @@ def _get_driver_input_and_broadcast( if execute_model_req.async_callback: model_input.frozen_model_input = dataclasses.replace( # type: ignore model_input.frozen_model_input, - async_callback=execute_model_req.async_callback, - use_async_and_multi_step=execute_model_req. - use_async_and_multi_step) + async_callback=execute_model_req.async_callback) else: # on subsequent steps we reuse the worker input and model input multi_step_state = self.multi_step_states[virtual_engine] From 652c83b697ac64923fac9b253a3e09a2b653eb46 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 3 Sep 2024 12:28:25 -0700 Subject: [PATCH 27/51] [Misc] Raise a more informative exception in add/remove_logger (#7750) --- vllm/engine/llm_engine.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8c5ca81fb1905..7da4f7b25db9e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1671,11 +1671,19 @@ def _get_last_sampled_token_ids( return None def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: + if not self.log_stats: + raise RuntimeError( + "Stat logging is disabled. Set `disable_log_stats=False` " + "argument to enable.") if logger_name in self.stat_loggers: raise KeyError(f"Logger with name {logger_name} already exists.") self.stat_loggers[logger_name] = logger def remove_logger(self, logger_name: str) -> None: + if not self.log_stats: + raise RuntimeError( + "Stat logging is disabled. Set `disable_log_stats=False` " + "argument to enable.") if logger_name not in self.stat_loggers: raise KeyError(f"Logger with name {logger_name} does not exist.") del self.stat_loggers[logger_name] From c02638efb36007458b11710e0f7428cffac7cbe4 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Tue, 3 Sep 2024 22:37:08 +0300 Subject: [PATCH 28/51] [CI/Build] make pip install vllm work in macos (for import only) (#8118) --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 38d3f41663f2e..1e08a5bd70cd3 100644 --- a/setup.py +++ b/setup.py @@ -362,7 +362,8 @@ def get_vllm_version() -> str: version = find_version(get_path("vllm", "version.py")) if _no_device(): - version += "+empty" + if envs.VLLM_TARGET_DEVICE == "empty": + version += "+empty" elif _is_cuda(): cuda_version = str(get_nvcc_cuda_version()) if cuda_version != MAIN_CUDA_VERSION: From f1575dc99f68292e96bf0688c4dcd353c7d66f7f Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 3 Sep 2024 13:25:09 -0700 Subject: [PATCH 29/51] [ci] Fix GHA workflow (#8129) Signed-off-by: kevin --- .github/workflows/reminder_comment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 15c35f8d442f5..1aa538c53ac62 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -15,7 +15,7 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. 
If you don't have permission to unblock, ping @simon-mo or @khluu to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping @simon-mo or @khluu to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' }) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 0af3abe3d3225449c907d75eb3d2ae4b83bd21a1 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 3 Sep 2024 13:29:24 -0700 Subject: [PATCH 30/51] [TPU][Bugfix] Fix next_token_ids shape (#8128) --- vllm/worker/tpu_model_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index a0498315516b8..684c54b7d8139 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -601,7 +601,7 @@ def _execute_model(*args): batch_idx += 1 else: for seq_id in seq_ids: - next_token_id = next_token_ids[batch_idx][0] + next_token_id = next_token_ids[batch_idx] seq_outputs.append( SequenceOutput(seq_id, next_token_id, {next_token_id: zero_logprob})) @@ -722,6 +722,9 @@ def forward( sampled_token_ids = torch.multinomial(probs, num_samples, replacement=True) + if num_samples == 1: + argmax_token_ids = argmax_token_ids.squeeze(dim=-1) + sampled_token_ids = sampled_token_ids.squeeze(dim=-1) next_token_ids = torch.where(t != 0, sampled_token_ids, argmax_token_ids) return next_token_ids From dc0b6066ab9dcdf290286e5ad2b630b462fc87e4 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 3 Sep 2024 14:11:42 -0700 Subject: [PATCH 31/51] [CI] Change PR remainder to avoid at-mentions (#8134) --- .github/workflows/reminder_comment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 1aa538c53ac62..99827756d2066 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -15,7 +15,7 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping @simon-mo or @khluu to add you in our Buildkite org. 
\n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' }) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 2188a60c7e0e5a414a87a4f0fd798333b2e0f625 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 3 Sep 2024 17:21:44 -0400 Subject: [PATCH 32/51] [Misc] Update `GPTQ` to use `vLLMParameters` (#7976) --- tests/weight_loading/models.txt | 6 + tests/weight_loading/test_weight_loading.py | 7 +- vllm/model_executor/layers/linear.py | 25 +++-- .../layers/quantization/gptq.py | 103 ++++++++++-------- .../layers/vocab_parallel_embedding.py | 9 +- vllm/model_executor/parameter.py | 5 +- 6 files changed, 93 insertions(+), 62 deletions(-) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index cbe30305c14f6..1dc529037a98e 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -4,6 +4,12 @@ gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, main gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit--1g-actorder_True gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True gptq_marlin, TechxGenus/gemma-1.1-2b-it-GPTQ, main +gptq, robertgshaw2/zephyr-7b-beta-channelwise-gptq, main +gptq, TheBloke/Llama-2-7B-GPTQ, main +gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, main +gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit--1g-actorder_True +gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True +gptq, TechxGenus/gemma-1.1-2b-it-GPTQ, main compressed-tensors, nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change, main compressed-tensors, nm-testing/tinyllama-oneshot-w8-channel-a8-tensor, main compressed-tensors, nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2, main diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index c13313df93f66..d8bca05e204c0 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -1,5 +1,7 @@ import os +import torch + MAX_MODEL_LEN = 1024 MODEL_NAME = os.environ.get("MODEL_NAME", "robertgshaw2/zephyr-7b-beta-channelwise-gptq") @@ -8,9 +10,12 @@ def test_weight_loading(vllm_runner): + """ + Test parameter weight loading with tp>1. 
+ """ with vllm_runner(model_name=MODEL_NAME, revision=REVISION, - dtype="auto", + dtype=torch.half if QUANTIZATION == "gptq" else "auto", quantization=QUANTIZATION, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=2) as model: diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 1163cc727762d..8df1d7595f026 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -14,8 +14,10 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.parameter import (BasevLLMParameter, + PackedColumnParameter, PackedvLLMParameter, - PerTensorScaleParameter) + PerTensorScaleParameter, + RowvLLMParameter) from vllm.model_executor.utils import set_weight_attrs logger = init_logger(__name__) @@ -24,7 +26,7 @@ "CompressedTensorsLinearMethod", "AWQMarlinLinearMethod", "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", - "TPUInt8LinearMethod" + "TPUInt8LinearMethod", "GPTQLinearMethod" ] @@ -574,8 +576,8 @@ def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, # Special case for Quantization. # If quantized, we need to adjust the offset and size to account # for the packing. - if isinstance(param, PackedvLLMParameter - ) and param.packed_dim == param.output_dim: + if isinstance(param, (PackedColumnParameter, PackedvLLMParameter + )) and param.packed_dim == param.output_dim: shard_size, shard_offset = \ param.adjust_shard_indexes_for_packing( shard_size=shard_size, shard_offset=shard_offset) @@ -594,9 +596,10 @@ def weight_loader_v2(self, param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0) return - elif type(param) is BasevLLMParameter: + elif type(param) in (RowvLLMParameter, BasevLLMParameter): param.load_merged_column_weight(loaded_weight=loaded_weight) return + # TODO: @dsikka - move to parameter.py self._load_fused_module_from_checkpoint(param, loaded_weight) return @@ -724,8 +727,8 @@ def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, # Special case for Quantization. # If quantized, we need to adjust the offset and size to account # for the packing. 
- if isinstance(param, PackedvLLMParameter - ) and param.packed_dim == param.output_dim: + if isinstance(param, (PackedColumnParameter, PackedvLLMParameter + )) and param.packed_dim == param.output_dim: shard_size, shard_offset = \ param.adjust_shard_indexes_for_packing( shard_size=shard_size, shard_offset=shard_offset) @@ -741,12 +744,12 @@ def weight_loader_v2(self, loaded_shard_id: Optional[str] = None): if loaded_shard_id is None: # special case for certain models if isinstance(param, PerTensorScaleParameter): - param.load_merged_column_weight(loaded_weight=loaded_weight, - shard_id=0) + param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0) return - elif type(param) is BasevLLMParameter: - param.load_merged_column_weight(loaded_weight=loaded_weight) + elif type(param) in (RowvLLMParameter, BasevLLMParameter): + param.load_qkv_weight(loaded_weight=loaded_weight) return + # TODO: @dsikka - move to parameter.py self._load_fused_module_from_checkpoint(param, loaded_weight) return diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index f456286899a53..c067a76405df6 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -11,7 +11,11 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedColumnParameter, + PackedvLLMParameter, + RowvLLMParameter) class GPTQConfig(QuantizationConfig): @@ -108,6 +112,7 @@ def create_weights( **extra_weight_attrs, ): del output_size # Unused. + weight_loader = extra_weight_attrs.get("weight_loader") if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " @@ -138,73 +143,81 @@ def create_weights( scale_and_zero_size = input_size_per_partition // group_size scale_and_zero_input_dim = 0 - qweight = Parameter( - torch.empty( + qweight = PackedvLLMParameter( + data=torch.empty( input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, dtype=torch.int32, ), - requires_grad=False, - ) - set_weight_attrs( - qweight, { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 0, - "pack_factor": self.quant_config.pack_factor, - }) - g_idx = Parameter( - torch.tensor( - [ - i // self.quant_config.group_size - for i in range(input_size_per_partition) - ], - dtype=torch.int32, - ), - requires_grad=False, - ) - # Ignore warning from fused linear layers such as QKVParallelLinear. 
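The parameter shapes being registered in create_weights follow directly from the pack factor (32 // weight_bits) and the group size; a quick sanity check for a hypothetical 4-bit, group-size-128 projection:

def gptq_param_shapes(in_features, out_features, weight_bits=4, group_size=128):
    pack_factor = 32 // weight_bits           # low-bit values per int32
    groups = in_features // group_size
    return {
        "qweight": (in_features // pack_factor, out_features),
        "g_idx": (in_features,),
        "qzeros": (groups, out_features // pack_factor),
        "scales": (groups, out_features),
    }

# e.g. a 4096 -> 11008 layer:
# {'qweight': (512, 11008), 'g_idx': (4096,), 'qzeros': (32, 1376), 'scales': (32, 11008)}
print(gptq_param_shapes(4096, 11008))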
- set_weight_attrs(g_idx, {"input_dim": 0, "ignore_warning": True}) - qzeros = Parameter( + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader) + + g_idx = RowvLLMParameter(data=torch.tensor( + [ + i // self.quant_config.group_size + for i in range(input_size_per_partition) + ], + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader) + qzeros_args = { + "data": torch.empty( scale_and_zero_size, output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, ), - requires_grad=False, - ) - set_weight_attrs( - qzeros, { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }) - scales = Parameter( + "weight_loader": + weight_loader + } + weight_scale_args = { + "data": torch.empty( scale_and_zero_size, output_size_per_partition, dtype=params_dtype, ), - requires_grad=False, - ) - set_weight_attrs(scales, { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, - }) + "weight_loader": + weight_loader + } + if scale_and_zero_input_dim is None: + scales = ChannelQuantScaleParameter(output_dim=1, + **weight_scale_args) + qzeros = PackedColumnParameter( + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args) + + else: + scales = GroupQuantScaleParameter(output_dim=1, + input_dim=0, + **weight_scale_args) + qzeros = PackedvLLMParameter( + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args) layer.register_parameter("qweight", qweight) - set_weight_attrs(qweight, extra_weight_attrs) layer.register_parameter("g_idx", g_idx) - set_weight_attrs(g_idx, extra_weight_attrs) layer.register_parameter("qzeros", qzeros) - set_weight_attrs(qzeros, extra_weight_attrs) layer.register_parameter("scales", scales) - set_weight_attrs(scales, extra_weight_attrs) layer.exllama_state = exllama_state def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # for torch.compile + layer.qweight = Parameter(layer.qweight.data, requires_grad=False) + layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False) + layer.qweight = Parameter(layer.qweight.data, requires_grad=False) + layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False) + # exllama needs to shuffle the weight after the weight is loaded # here we do the shuffle on first forward pass if layer.exllama_state == ExllamaState.UNINITIALIZED: diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index b26a3227e6931..ef6d401be2070 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -10,6 +10,7 @@ tensor_model_parallel_all_reduce) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding) +from vllm.model_executor.parameter import BasevLLMParameter from vllm.model_executor.utils import set_weight_attrs DEFAULT_VOCAB_PADDING_SIZE = 64 @@ -370,10 +371,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # If param packed on the same dim we are sharding on, then # need to adjust offsets of loaded weight by pack_factor. 
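The same rescaling shows up once more in the vocab-parallel embedding: when a quantized ParallelLMHead parameter is packed along the vocab dimension (as qzeros is above), each tensor-parallel rank's row range is divided by the pack factor before indexing the loaded tensor. Simplified numbers for illustration only; the real loader also deals with vocab padding:

org_vocab_size = 32000
pack_factor = 8                           # 4-bit values packed into int32
tp_size, tp_rank = 2, 1

shard_size = org_vocab_size // tp_size    # 16000 logical vocab positions
start_idx = tp_rank * shard_size          # this rank covers [16000, 32000)

# The checkpoint tensor only has org_vocab_size // pack_factor entries along
# that dimension, so the slice is taken in packed units:
packed_start = start_idx // pack_factor   # 2000
packed_size = shard_size // pack_factor   # 2000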
if packed_dim is not None and packed_dim == output_dim: + packed_factor = param.packed_factor if isinstance( + param, BasevLLMParameter) else param.pack_factor assert loaded_weight.shape[output_dim] == (self.org_vocab_size // - param.pack_factor) - start_idx = start_idx // param.pack_factor - shard_size = shard_size // param.pack_factor + param.packed_factor) + start_idx = start_idx // packed_factor + shard_size = shard_size // packed_factor else: assert loaded_weight.shape[output_dim] == self.org_vocab_size diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 326b6ae8fee64..9ffb339ffeab3 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -1,3 +1,4 @@ +from fractions import Fraction from typing import Callable, Optional, Union import torch @@ -257,7 +258,7 @@ class PackedColumnParameter(_ColumnvLLMParameter): """ def __init__(self, - packed_factor: int, + packed_factor: Union[int, Fraction], packed_dim: int, marlin_tile_size: Optional[int] = None, **kwargs): @@ -298,7 +299,7 @@ class PackedvLLMParameter(ModelWeightParameter): """ def __init__(self, - packed_factor: int, + packed_factor: Union[int, Fraction], packed_dim: int, marlin_tile_size: Optional[int] = None, **kwargs): From d4db9f53c8a50a2b0788cf1e03b5b91f20de4313 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 3 Sep 2024 17:57:41 -0700 Subject: [PATCH 33/51] [Benchmark] Add `--async-engine` option to benchmark_throughput.py (#7964) --- benchmarks/benchmark_throughput.py | 113 +++++++++++++++++++++++++- vllm/entrypoints/openai/api_server.py | 45 ++++++---- vllm/entrypoints/openai/rpc/client.py | 4 + 3 files changed, 143 insertions(+), 19 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index eaf256f7cb8c2..94549d84fb4e4 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,13 +6,16 @@ from typing import List, Optional, Tuple import torch +import uvloop from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import EngineArgs +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, merge_async_iterators def sample_requests( @@ -135,6 +138,93 @@ def run_vllm( return end - start +async def run_vllm_async( + requests: List[Tuple[str, int, int]], + model: str, + tokenizer: str, + quantization: Optional[str], + tensor_parallel_size: int, + seed: int, + n: int, + use_beam_search: bool, + trust_remote_code: bool, + dtype: str, + max_model_len: Optional[int], + enforce_eager: bool, + kv_cache_dtype: str, + quantization_param_path: Optional[str], + device: str, + enable_prefix_caching: bool, + enable_chunked_prefill: bool, + max_num_batched_tokens: int, + distributed_executor_backend: Optional[str], + gpu_memory_utilization: float = 0.9, + num_scheduler_steps: int = 1, + use_v2_block_manager: bool = False, + download_dir: Optional[str] = None, + load_format: str = EngineArgs.load_format, + disable_async_output_proc: bool = False, + disable_frontend_multiprocessing: bool = False, +) -> float: + from vllm import SamplingParams + engine_args = AsyncEngineArgs( + model=model, + tokenizer=tokenizer, + quantization=quantization, + 
tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + quantization_param_path=quantization_param_path, + device=device, + enable_prefix_caching=enable_prefix_caching, + download_dir=download_dir, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + load_format=load_format, + num_scheduler_steps=num_scheduler_steps, + use_v2_block_manager=use_v2_block_manager, + disable_async_output_proc=disable_async_output_proc, + worker_use_ray=False, + engine_use_ray=False, + disable_log_requests=True, + ) + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing) as llm: + + # Add the requests to the engine. + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + for prompt, _, output_len in requests: + prompts.append(prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=0.0 if use_beam_search else 1.0, + top_p=1.0, + use_beam_search=use_beam_search, + ignore_eos=True, + max_tokens=output_len, + )) + + generators = [] + start = time.perf_counter() + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + end = time.perf_counter() + return end - start + + def run_hf( requests: List[Tuple[str, int, int]], model: str, @@ -230,7 +320,7 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm( + run_args = [ requests, args.model, args.tokenizer, args.quantization, args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, @@ -240,7 +330,14 @@ def main(args: argparse.Namespace): args.max_num_batched_tokens, args.distributed_executor_backend, args.gpu_memory_utilization, args.num_scheduler_steps, args.use_v2_block_manager, args.download_dir, args.load_format, - args.disable_async_output_proc) + args.disable_async_output_proc + ] + + if args.async_engine: + run_args.append(args.disable_frontend_multiprocessing) + elapsed_time = uvloop.run(run_vllm_async(*run_args)) + else: + elapsed_time = run_vllm(*run_args) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -426,6 +523,14 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable async output processor for vLLM backend.") + parser.add_argument("--async-engine", + action='store_true', + default=False, + help="Use vLLM async engine rather than LLM class.") + parser.add_argument("--disable-frontend-multiprocessing", + action='store_true', + default=False, + help="Disable decoupled async engine frontend.") args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 8e8371ef1559a..7632e8aa5e32e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -67,7 +67,7 @@ def model_is_embedding(model_name: str, trust_remote_code: bool, - quantization: str) -> bool: + quantization: Optional[str]) -> bool: return 
ModelConfig(model=model_name, tokenizer=model_name, tokenizer_mode="auto", @@ -96,13 +96,6 @@ async def _force_log(): @asynccontextmanager async def build_async_engine_client( args: Namespace) -> AsyncIterator[Optional[AsyncEngineClient]]: - """ - Create AsyncEngineClient, either: - - in-process using the AsyncLLMEngine Directly - - multiprocess using AsyncLLMEngine RPC - - Returns the Client or None if the creation failed. - """ # Context manager to handle async_engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit @@ -112,14 +105,37 @@ async def build_async_engine_client( # Backend itself still global for the silly lil' health handler global async_engine_client + async with build_async_engine_client_from_engine_args( + engine_args, args.disable_frontend_multiprocessing) as engine: + + async_engine_client = engine # type: ignore[assignment] + yield engine + + +@asynccontextmanager +async def build_async_engine_client_from_engine_args( + engine_args: AsyncEngineArgs, + disable_frontend_multiprocessing: bool = False, +) -> AsyncIterator[Optional[AsyncEngineClient]]: + """ + Create AsyncEngineClient, either: + - in-process using the AsyncLLMEngine Directly + - multiprocess using AsyncLLMEngine RPC + + Returns the Client or None if the creation failed. + """ + # If manually triggered or embedding model, use AsyncLLMEngine in process. # TODO: support embedding model via RPC. - if (model_is_embedding(args.model, args.trust_remote_code, - args.quantization) - or args.disable_frontend_multiprocessing): - async_engine_client = AsyncLLMEngine.from_engine_args( + if (model_is_embedding(engine_args.model, engine_args.trust_remote_code, + engine_args.quantization) + or disable_frontend_multiprocessing): + engine_client = AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER) - yield async_engine_client + try: + yield engine_client + finally: + engine_client.shutdown_background_loop() return # Otherwise, use the multiprocessing AsyncLLMEngine. @@ -148,7 +164,6 @@ async def build_async_engine_client( # NOTE: Actually, this is not true yet. We still need to support # embedding models via RPC (see TODO above) rpc_client = AsyncEngineRPCClient(rpc_path) - async_engine_client = rpc_client # type: ignore # Start RPCServer in separate process (holds the AsyncLLMEngine). context = multiprocessing.get_context("spawn") @@ -174,7 +189,7 @@ async def build_async_engine_client( yield None return - yield async_engine_client + yield rpc_client # type: ignore[misc] finally: # Ensure rpc server process was terminated rpc_server_process.terminate() diff --git a/vllm/entrypoints/openai/rpc/client.py b/vllm/entrypoints/openai/rpc/client.py index c457555c54b9c..9b88db746be5c 100644 --- a/vllm/entrypoints/openai/rpc/client.py +++ b/vllm/entrypoints/openai/rpc/client.py @@ -7,6 +7,7 @@ import cloudpickle import zmq import zmq.asyncio +from zmq import Frame # type: ignore[attr-defined] from zmq.asyncio import Socket from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, @@ -214,6 +215,7 @@ async def _send_get_data_rpc_request(self, request: RPCUtilityRequest, # Await the data from the Server. 
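Factored out like this, the same helper can be driven from scripts as well as from the API server; a hedged usage sketch along the lines of the benchmark change above (the model name and prompt are placeholders, and the yielded client can be None if startup failed):

import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)


async def main():
    engine_args = AsyncEngineArgs(model="facebook/opt-125m",
                                  disable_log_requests=True)
    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing=True) as client:
        assert client is not None
        final = None
        async for out in client.generate("Hello, my name is",
                                         SamplingParams(max_tokens=16),
                                         request_id="demo-0"):
            final = out
        print(final.outputs[0].text)


# asyncio.run(main())   # the benchmark uses uvloop.run(...) instead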
frame = await socket.recv(copy=False) + assert isinstance(frame, Frame) data = pickle.loads(frame.buffer) if isinstance(data, Exception): @@ -247,6 +249,7 @@ async def do_rpc_call(socket: Socket, request: RPC_REQUEST_TYPE): f"{self._data_timeout} ms") frame = await socket.recv(copy=False) + assert isinstance(frame, Frame) return pickle.loads(frame.buffer) # Make a new socket connection. @@ -395,6 +398,7 @@ async def generate( # Stream back the results from the RPC Server. while not finished: message = await socket.recv(copy=False) + assert isinstance(message, Frame) request_output = pickle.loads(message.buffer) if isinstance(request_output, Exception): From 61f4a93d1490f285b0dd3a536dd85a9f3f18ddd9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 3 Sep 2024 18:35:33 -0700 Subject: [PATCH 34/51] [TPU][Bugfix] Use XLA rank for persistent cache path (#8137) --- docs/source/getting_started/tpu-installation.rst | 2 +- vllm/worker/tpu_worker.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index d0c2498d8849e..217028839e347 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -59,7 +59,7 @@ First, install the dependencies: $ export DATE="20240828" $ export TORCH_VERSION="2.5.0" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - $ pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl + $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl $ # Install JAX and Pallas. $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 44fa3aed5816d..9e0c522cee453 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -102,8 +102,9 @@ def init_device(self) -> None: # NOTE(woosuk): Set per-rank cache path since different ranks # can have slightly different XLA graphs. 
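The practical effect of the change below is that each TPU worker now compiles into a cache directory keyed by its XLA global ordinal rather than the worker's own rank attribute; the resulting layout is roughly:

import os

def xla_cache_dir(cache_root, world_size, xla_rank):
    # Mirrors the per-rank path built in tpu_worker.py; cache_root comes from
    # VLLM_XLA_CACHE_PATH (typically ~/.cache/vllm/xla_cache).
    return os.path.join(cache_root, f"tp{world_size}_rank{xla_rank}")

# e.g. with tensor parallelism of 4: tp4_rank0, tp4_rank1, tp4_rank2, tp4_rank3
print(xla_cache_dir(os.path.expanduser("~/.cache/vllm/xla_cache"), 4, 0))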
world_size = self.parallel_config.world_size + rank = xr.global_ordinal() per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, - f"tp{world_size}_rank{self.rank}") + f"tp{world_size}_rank{rank}") xr.initialize_cache(per_rank_path, readonly=False) def load_model(self): From e16fa99a6ad5bae4aedfb76121d4e622d27f81c3 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 3 Sep 2024 22:12:41 -0400 Subject: [PATCH 35/51] [Misc] Update fbgemmfp8 to use `vLLMParameters` (#7972) Co-authored-by: Michael Goin --- vllm/model_executor/layers/linear.py | 2 +- .../layers/quantization/fbgemm_fp8.py | 34 ++++++++++++------- .../layers/quantization/utils/w8a8_utils.py | 27 --------------- 3 files changed, 22 insertions(+), 41 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8df1d7595f026..b997507ea738d 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -26,7 +26,7 @@ "CompressedTensorsLinearMethod", "AWQMarlinLinearMethod", "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", - "TPUInt8LinearMethod", "GPTQLinearMethod" + "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod" ] diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index e7c3859967c71..3ccf1af9eb898 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -15,8 +15,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear, create_per_channel_scale_param) -from vllm.model_executor.utils import set_weight_attrs + apply_fp8_linear) +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter) from vllm.platforms import current_platform logger = init_logger(__name__) @@ -85,6 +86,7 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ): + weight_loader = extra_weight_attrs.get("weight_loader") del input_size, output_size output_size_per_partition = sum(output_partition_sizes) @@ -95,20 +97,21 @@ def create_weights( layer.orig_dtype = params_dtype # WEIGHT - weight = Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=torch.float8_e4m3fn), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - **extra_weight_attrs, - }) # WEIGHT SCALE - weight_scale = create_per_channel_scale_param(output_partition_sizes, - **extra_weight_attrs) + weight_scale = ChannelQuantScaleParameter(data=torch.empty( + (sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + weight_scale[:] = torch.finfo(torch.float32).min layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE UPPER BOUND @@ -118,6 +121,11 @@ def create_weights( layer.input_scale_ub = input_scale_ub def process_weights_after_loading(self, layer: Module) -> None: + # required by torch.compile + layer.weight_scale = Parameter(layer.weight_scale.data, + requires_grad=False) + layer.weight = Parameter(layer.weight.data, 
requires_grad=False) + weight = layer.weight layer.weight = Parameter(weight.t(), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 6cc1c65ddfa82..a54e3cae73b14 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,10 +1,8 @@ from typing import List, Optional, Tuple, Union import torch -from torch.nn import Parameter from vllm import _custom_ops as ops -from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import is_hip @@ -38,31 +36,6 @@ def all_close_1d(x: torch.Tensor) -> bool: return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) -def create_per_tensor_scale_param( - output_partition_sizes: List[int], - **extra_weight_attrs, -) -> Parameter: - scale = Parameter(torch.empty(len(output_partition_sizes), - dtype=torch.float32), - requires_grad=False) - scale[:] = torch.finfo(torch.float32).min - set_weight_attrs(scale, { - "needs_scalar_to_array": True, - **extra_weight_attrs - }) - return scale - - -def create_per_channel_scale_param(output_partition_sizes: List[int], - **extra_weight_attrs) -> Parameter: - scale = Parameter(torch.empty((sum(output_partition_sizes), 1), - dtype=torch.float32), - requires_grad=False) - scale[:] = torch.finfo(torch.float32).min - set_weight_attrs(scale, {"output_dim": 0, **extra_weight_attrs}) - return scale - - def convert_to_channelwise( weight_scale: torch.Tensor, logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: From 2be8ec6e71473573a9732460fcde9392cf52be45 Mon Sep 17 00:00:00 2001 From: Peter Salas Date: Tue, 3 Sep 2024 21:38:21 -0700 Subject: [PATCH 36/51] [Model] Add Ultravox support for multiple audio chunks (#7963) --- examples/offline_inference_audio_language.py | 58 ++++--- tests/models/test_ultravox.py | 103 +++++++++---- vllm/model_executor/models/ultravox.py | 152 +++++++++++-------- 3 files changed, 198 insertions(+), 115 deletions(-) diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 56ce8646c20c9..1c6ac06123bbb 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -11,25 +11,33 @@ from vllm.assets.audio import AudioAsset from vllm.utils import FlexibleArgumentParser -# Input audio and question -audio_and_sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate -question = "What is recited in the audio?" +audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +question_per_audio_count = [ + "What is recited in the audio?", + "What sport and what nursery rhyme are referenced?" 
+] # Ultravox 0.3 -def run_ultravox(question): +def run_ultravox(question, audio_count): model_name = "fixie-ai/ultravox-v0_3" tokenizer = AutoTokenizer.from_pretrained(model_name) messages = [{ - 'role': 'user', - 'content': f"<|reserved_special_token_0|>\n{question}" + 'role': + 'user', + 'content': + "<|reserved_special_token_0|>\n" * audio_count + question }] prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name) + llm = LLM(model=model_name, + enforce_eager=True, + enable_chunked_prefill=False, + max_model_len=8192, + limit_mm_per_prompt={"audio": audio_count}) stop_token_ids = None return llm, prompt, stop_token_ids @@ -44,7 +52,9 @@ def main(args): if model not in model_example_map: raise ValueError(f"Model type {model} is not supported.") - llm, prompt, stop_token_ids = model_example_map[model](question) + audio_count = args.num_audios + llm, prompt, stop_token_ids = model_example_map[model]( + question_per_audio_count[audio_count - 1], audio_count) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. @@ -53,23 +63,18 @@ def main(args): stop_token_ids=stop_token_ids) assert args.num_prompts > 0 - if args.num_prompts == 1: - # Single inference - inputs = { - "prompt": prompt, - "multi_modal_data": { - "audio": audio_and_sample_rate - }, - } - - else: + inputs = { + "prompt": prompt, + "multi_modal_data": { + "audio": [ + asset.audio_and_sample_rate + for asset in audio_assets[:audio_count] + ] + }, + } + if args.num_prompts > 1: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - "audio": audio_and_sample_rate - }, - } for _ in range(args.num_prompts)] + inputs = [inputs] * args.num_prompts outputs = llm.generate(inputs, sampling_params=sampling_params) @@ -92,6 +97,11 @@ def main(args): type=int, default=1, help='Number of prompts to run.') + parser.add_argument("--num-audios", + type=int, + default=1, + choices=[1, 2], + help="Number of audio items per prompt.") args = parser.parse_args() main(args) diff --git a/tests/models/test_ultravox.py b/tests/models/test_ultravox.py index 23008f9b8b563..e98db9b65f484 100644 --- a/tests/models/test_ultravox.py +++ b/tests/models/test_ultravox.py @@ -16,37 +16,32 @@ AudioTuple = Tuple[np.ndarray, int] +VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" +HF_PLACEHOLDER = "<|audio|>" + @pytest.fixture(scope="session") -def audio_and_sample_rate(): +def audio_assets(): from vllm.assets.audio import AudioAsset - return AudioAsset("mary_had_lamb").audio_and_sample_rate + return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] -@pytest.fixture -def prompts_and_audios(audio_and_sample_rate): - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call")) +def audio(request): + from vllm.assets.audio import AudioAsset + return AudioAsset(request.param) - vllm_placeholder = "<|reserved_special_token_0|>" - hf_placeholder = "<|audio|>" - question = "What's in the audio?" 
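Put together, an offline request carrying two clips now looks roughly like the following, using the same assets and placeholder token as the example above (the sampling options are one reasonable choice, not a requirement):

from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

model_name = "fixie-ai/ultravox-v0_3"
audios = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]

tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt = tokenizer.apply_chat_template(
    [{
        "role": "user",
        "content": "<|reserved_special_token_0|>\n" * len(audios) +
                   "What sport and what nursery rhyme are referenced?",
    }],
    tokenize=False,
    add_generation_prompt=True)

llm = LLM(model=model_name,
          max_model_len=8192,
          enforce_eager=True,
          limit_mm_per_prompt={"audio": len(audios)})
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {
            "audio": [a.audio_and_sample_rate for a in audios]
        },
    },
    SamplingParams(temperature=0.2, max_tokens=64))
print(outputs[0].outputs[0].text)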
- vllm_prompt = tokenizer.apply_chat_template( - [{ - 'role': 'user', - 'content': f"{vllm_placeholder}\n{question}" - }], - tokenize=False, - add_generation_prompt=True) - hf_prompt = tokenizer.apply_chat_template( - [{ - 'role': 'user', - 'content': f"{hf_placeholder}\n{question}" - }], - tokenize=False, - add_generation_prompt=True) +def _get_prompt(audio_count, question, placeholder): + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + placeholder = f"{placeholder}\n" * audio_count - return [(vllm_prompt, hf_prompt, audio_and_sample_rate)] + return tokenizer.apply_chat_template([{ + 'role': 'user', + 'content': f"{placeholder}{question}" + }], + tokenize=False, + add_generation_prompt=True) def vllm_to_hf_output(vllm_output: Tuple[List[int], str, @@ -134,15 +129,71 @@ def process(hf_inputs: BatchEncoding): ) +def run_multi_audio_test( + vllm_runner: Type[VllmRunner], + prompts_and_audios: List[Tuple[str, List[AudioTuple]]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + limit_mm_per_prompt={ + "audio": + max((len(audio) for _, audio in prompts_and_audios)) + }) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + [prompt for prompt, _ in prompts_and_audios], + max_tokens, + num_logprobs=num_logprobs, + audios=[audios for _, audios in prompts_and_audios]) + + # The HuggingFace model doesn't support multiple audios yet, so + # just assert that some tokens were generated. + assert all(tokens for tokens, *_ in vllm_outputs) + + @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, prompts_and_audios, dtype: str, - max_tokens: int, num_logprobs: int) -> None: +def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, + num_logprobs: int) -> None: + + vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) + hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) run_test( hf_runner, vllm_runner, - prompts_and_audios, + [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)], + MODEL_NAME, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) + + +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + + vllm_prompt = _get_prompt(len(audio_assets), + "Describe each of the audios above.", + VLLM_PLACEHOLDER) + run_multi_audio_test( + vllm_runner, + [(vllm_prompt, [audio.audio_and_sample_rate + for audio in audio_assets])], MODEL_NAME, dtype=dtype, max_tokens=max_tokens, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 7994945c5ac39..416fabda831a2 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -29,12 +29,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.utils import 
(filter_weights, +from vllm.model_executor.models.utils import (filter_weights, flatten_bn, init_vllm_registered_model, merge_multimodal_embeddings) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalInputs, NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData @@ -48,13 +48,14 @@ class UltravoxAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - data: Union[torch.Tensor, List[torch.Tensor]] - """Shape: `(batch_size * num_audios, 80, M)""" + data: NestedTensors + """Shape: `(batch_size, num_audios, 80, M)""" class UltravoxAudioEmbeddingInputs(TypedDict): type: Literal["audio_embeds"] - data: torch.Tensor + data: NestedTensors + """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)""" UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, @@ -85,24 +86,33 @@ def dummy_data_for_ultravox( audio_count = mm_counts["audio"] - audio_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [ - _AUDIO_PLACEHOLDER_TOKEN - ]) * get_ultravox_max_audio_tokens(ctx) * audio_count + audio_placeholder = array( + VLLM_TOKEN_ID_ARRAY_TYPE, + [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx) + + # Add a separator between each chunk. + audio_token_ids = (audio_placeholder + + array(VLLM_TOKEN_ID_ARRAY_TYPE, [0])) * audio_count other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * (seq_len - len(audio_token_ids)) audio_and_sr = (np.array([0.0] * feature_extractor.chunk_length), 1) - mm_dict = { - "audio": - audio_and_sr if audio_count == 1 else [audio_and_sr] * audio_count - } + mm_dict = {"audio": [audio_and_sr] * audio_count} return (SequenceData(audio_token_ids + other_token_ids), mm_dict) def input_mapper_for_ultravox(ctx: InputContext, data: object): - if isinstance(data, tuple): - (audio, sr) = cast(Tuple[np.ndarray, Union[float, int]], data) + if not isinstance(data, list): + data = [data] + + audio_features = [] + for audio_input in data: + if not isinstance(audio_input, tuple): + raise NotImplementedError( + f"Unsupported data type: {type(audio_input)}") + + (audio, sr) = cast(Tuple[np.ndarray, Union[float, int]], audio_input) feature_extractor = whisper_feature_extractor(ctx) if sr != feature_extractor.sampling_rate: @@ -121,15 +131,14 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): # Not enough audio; pad it. audio = np.pad(audio, (0, minimum_audio_length - len(audio))) - return MultiModalInputs({ - "audio_features": - feature_extractor(audio, - sampling_rate=sr, - padding="longest", - return_tensors="pt")["input_features"] - }) + single_audio_features = feature_extractor( + audio, sampling_rate=sr, padding="longest", + return_tensors="pt")["input_features"] - raise NotImplementedError(f"Unsupported data type: {type(data)}") + # Remove the batch dimension because we're wrapping it in a list. 
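Each clip is therefore mapped on its own to an [80, num_frames] log-mel tensor, and the clips travel as a list rather than one padded batch since their lengths differ. A rough sanity check with silent clips (the feature-extractor checkpoint name here is an assumption; the model config supplies the real one):

import numpy as np
from transformers import WhisperFeatureExtractor

fe = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
clips = [np.zeros(16000, dtype=np.float32),    # 1 s of silence at 16 kHz
         np.zeros(48000, dtype=np.float32)]    # 3 s
features = [
    fe(clip, sampling_rate=16000, padding="longest",
       return_tensors="pt")["input_features"].squeeze(0)
    for clip in clips
]
# Two tensors with 80 mel bins each and different frame counts,
# roughly [80, 100] and [80, 300] with a 160-sample hop.
print([tuple(f.shape) for f in features])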
+ audio_features.append(single_audio_features.squeeze(0)) + + return MultiModalInputs({"audio_features": audio_features}) def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs): @@ -138,25 +147,31 @@ def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs): return llm_inputs feature_extractor = whisper_feature_extractor(ctx) - audio_data, sample_rate = multi_modal_data["audio"] - - audio_length = audio_data.shape[0] - if sample_rate != feature_extractor.sampling_rate: - # Account for resampling. - adjustment = feature_extractor.sampling_rate / sample_rate - audio_length = math.ceil(adjustment * audio_length) - - feature_extractor_output_length = math.ceil( - (audio_length - - (feature_extractor.hop_length - 1)) / feature_extractor.hop_length) - - uv_config = ctx.get_hf_config(UltravoxConfig) - audio_num_tokens = min( - max( - 1, - math.ceil(feature_extractor_output_length / - (uv_config.stack_factor * 2))), - get_ultravox_max_audio_tokens(ctx)) + audios = multi_modal_data["audio"] + if not isinstance(audios, list): + audios = [audios] + + audio_token_counts = [] + for audio_data, sample_rate in audios: + audio_length = audio_data.shape[0] + if sample_rate != feature_extractor.sampling_rate: + # Account for resampling. + adjustment = feature_extractor.sampling_rate / sample_rate + audio_length = math.ceil(adjustment * audio_length) + + feature_extractor_output_length = math.ceil( + (audio_length - (feature_extractor.hop_length - 1)) / + feature_extractor.hop_length) + + uv_config = ctx.get_hf_config(UltravoxConfig) + audio_num_tokens = min( + max( + 1, + math.ceil(feature_extractor_output_length / + (uv_config.stack_factor * 2))), + get_ultravox_max_audio_tokens(ctx)) + audio_token_counts.append(audio_num_tokens) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( @@ -164,7 +179,7 @@ def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs): llm_inputs.get("prompt"), llm_inputs["prompt_token_ids"], placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN, - repeat_count=audio_num_tokens, + repeat_count=audio_token_counts, ) # NOTE: Create a defensive copy of the original inputs @@ -338,45 +353,52 @@ def _parse_and_validate_audio_input( raise ValueError("Incorrect type of audio features. " f"Got type: {type(audio_features)}") - # Remove the N dimension until multiple audios are supported. - if isinstance(audio_features, torch.Tensor): - audio_features = audio_features.squeeze(1) - else: - audio_features = [t.squeeze(0) for t in audio_features] - return UltravoxAudioFeatureInputs(type="audio_features", data=audio_features) if audio_embeds is not None: - if not isinstance(audio_embeds, torch.Tensor): + if not isinstance(audio_embeds, (torch.Tensor, list)): raise ValueError("Incorrect type of audio embeds. " f"Got type: {type(audio_embeds)}") - # Remove the N dimension until multiple audios are supported. - audio_embeds = audio_embeds.squeeze(1) - return UltravoxAudioEmbeddingInputs(type="audio_embeds", data=audio_embeds) raise AssertionError("This line should be unreachable.") def _process_audio_input( - self, audio_input: UltravoxAudioInputs - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, audio_input: UltravoxAudioInputs) -> NestedTensors: if audio_input["type"] == "audio_embeds": return audio_input["data"] audio_features = audio_input["data"] - if isinstance(audio_features, list): - # TODO: Batch these through the encoder/projector instead of - # serializing them. 
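The per-clip placeholder count computed above boils down to a short formula; worked numbers for a 10-second clip at 16 kHz, taking hop_length = 160 and assuming stack_factor = 8 from the released Ultravox config:

import math

def ultravox_placeholder_tokens(num_samples, hop_length=160, stack_factor=8):
    frames = math.ceil((num_samples - (hop_length - 1)) / hop_length)
    # Dividing by stack_factor * 2 mirrors the formula above (encoder
    # downsampling plus frame stacking); the real code additionally clamps
    # the result to get_ultravox_max_audio_tokens(ctx).
    return max(1, math.ceil(frames / (stack_factor * 2)))

print(ultravox_placeholder_tokens(10 * 16000))   # 1000 mel frames -> 63 tokens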
- return [ - self._audio_features_to_embeddings( - features.unsqueeze(0)).squeeze(0) - for features in audio_features - ] - else: - return self._audio_features_to_embeddings(audio_features) + if isinstance(audio_features, torch.Tensor): + # Combine the B and N dimensions for the encoder/projector + flattened = flatten_bn(audio_features) + flattened_embeddings = self._audio_features_to_embeddings( + flattened) + + # Restore the original dimensions + embeddings = flattened_embeddings.unflatten( + 0, audio_features.shape[:2]) + return embeddings + + result = [] + # TODO: Batch heterogeneous tensors through the encoder/projector + for audio_features_item in audio_features: + if isinstance(audio_features_item, torch.Tensor): + result.append( + self._audio_features_to_embeddings(audio_features_item)) + else: + embeddings = [ + # Add a batch dimension to embed it, then remove it. + self._audio_features_to_embeddings(tensor.unsqueeze(0) + ).squeeze(0) + for tensor in audio_features_item + ] + result.append(embeddings) + + return result def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], @@ -393,7 +415,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, with the `input_ids`. Args: - input_features: A batch of audio inputs, [1, 80, M]. + audio_features: A batch of audio inputs [B, N, 80, M]. """ audio_input = self._parse_and_validate_audio_input(**kwargs) if audio_input is not None: From 855c262a6bcbb392a6e312caa3489648aa3f4a47 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 4 Sep 2024 13:22:17 +0800 Subject: [PATCH 37/51] [Frontend] Multimodal support in offline chat (#8098) --- tests/entrypoints/llm/test_generate.py | 34 +++ tests/entrypoints/test_chat_utils.py | 164 ++++++++++---- vllm/entrypoints/chat_utils.py | 208 +++++++++++++----- vllm/entrypoints/llm.py | 31 ++- vllm/entrypoints/openai/serving_chat.py | 9 +- .../openai/serving_tokenization.py | 7 +- vllm/multimodal/utils.py | 10 + vllm/transformers_utils/tokenizers/mistral.py | 5 +- 8 files changed, 356 insertions(+), 112 deletions(-) diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index c426e9b4ee899..ef34bebbb0f8c 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -6,6 +6,7 @@ from vllm import LLM, RequestOutput, SamplingParams from ...conftest import cleanup +from ..openai.test_vision import TEST_IMAGE_URLS MODEL_NAME = "facebook/opt-125m" @@ -159,3 +160,36 @@ def test_chat(): ] outputs = llm.chat(messages) assert len(outputs) == 1 + + +@pytest.mark.parametrize("image_urls", + [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) +def test_chat_multi_image(image_urls: List[str]): + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + dtype="bfloat16", + max_model_len=4096, + max_num_seqs=5, + enforce_eager=True, + trust_remote_code=True, + limit_mm_per_prompt={"image": 2}, + ) + + messages = [{ + "role": + "user", + "content": [ + *({ + "type": "image_url", + "image_url": { + "url": image_url + } + } for image_url in image_urls), + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + outputs = llm.chat(messages) + assert len(outputs) >= 0 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 53f99189beb1c..6ded5102c9314 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1,11 +1,14 @@ import warnings +from typing import Optional import pytest from PIL import Image from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import parse_chat_messages +from vllm.entrypoints.chat_utils import (parse_chat_messages, + parse_chat_messages_futures) +from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -42,10 +45,28 @@ def image_url(): return f"data:image/jpeg;base64,{base64}" -@pytest.mark.asyncio -async def test_parse_chat_messages_with_image_url(phi3v_model_config, - phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +def _assert_mm_data_is_image_input( + mm_data: Optional[MultiModalDataDict], + image_count: int, +) -> None: + assert mm_data is not None + assert set(mm_data.keys()) == {"image"} + + image_data = mm_data.get("image") + assert image_data is not None + + if image_count == 1: + assert isinstance(image_data, Image.Image) + else: + assert isinstance(image_data, list) and len(image_data) == image_count + + +def test_parse_chat_messages_single_image( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -63,15 +84,42 @@ async def test_parse_chat_messages_with_image_url(phi3v_model_config, "role": "user", "content": "<|image_1|>\nWhat's in the image?" }] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert isinstance(mm_data["image"], Image.Image) + _assert_mm_data_is_image_input(mm_data, 1) @pytest.mark.asyncio -async def test_parse_chat_messages_multiple_images(phi3v_model_config, - phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +async def test_parse_chat_messages_single_image_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_future = parse_chat_messages_futures([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(await mm_future, 1) + + +def test_parse_chat_messages_multiple_images( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -96,15 +144,49 @@ async def test_parse_chat_messages_multiple_images(phi3v_model_config, "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?" 
}] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert len(mm_data["image"]) == 2 + _assert_mm_data_is_image_input(mm_data, 2) @pytest.mark.asyncio -async def test_parse_chat_messages_placeholder_already_in_prompt( - phi3v_model_config, phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +async def test_parse_chat_messages_multiple_images_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_future = parse_chat_messages_futures([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?" + }] + _assert_mm_data_is_image_input(await mm_future, 2) + + +def test_parse_chat_messages_placeholder_already_in_prompt( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -131,15 +213,15 @@ async def test_parse_chat_messages_placeholder_already_in_prompt( "content": "What's in <|image_1|> and how does it compare to <|image_2|>?" }] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert len(mm_data["image"]) == 2 + _assert_mm_data_is_image_input(mm_data, 2) -@pytest.mark.asyncio -async def test_parse_chat_messages_placeholder_one_already_in_prompt( - phi3v_model_config, phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +def test_parse_chat_messages_placeholder_one_already_in_prompt( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -167,15 +249,15 @@ async def test_parse_chat_messages_placeholder_one_already_in_prompt( "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " "other one?" }] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert len(mm_data["image"]) == 2 + _assert_mm_data_is_image_input(mm_data, 2) -@pytest.mark.asyncio -async def test_parse_chat_messages_multiple_images_across_messages( - phi3v_model_config, phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +def test_parse_chat_messages_multiple_images_across_messages( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -218,14 +300,14 @@ async def test_parse_chat_messages_multiple_images_across_messages( "content": "<|image_2|>\nWhat about this one?" 
}, ] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert len(mm_data["image"]) == 2 + _assert_mm_data_is_image_input(mm_data, 2) -@pytest.mark.asyncio -async def test_parse_chat_messages_rejects_too_many_images_in_one_message( - phi3v_model_config, phi3v_tokenizer, image_url): +def test_parse_chat_messages_rejects_too_many_images_in_one_message( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -259,9 +341,11 @@ async def test_parse_chat_messages_rejects_too_many_images_in_one_message( }], phi3v_model_config, phi3v_tokenizer) -@pytest.mark.asyncio -async def test_parse_chat_messages_rejects_too_many_images_across_messages( - phi3v_model_config, phi3v_tokenizer, image_url): +def test_parse_chat_messages_rejects_too_many_images_across_messages( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c70c6d9330b10..f205a99920892 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,10 +1,11 @@ import asyncio import codecs +from abc import ABC, abstractmethod from collections import defaultdict from functools import lru_cache from pathlib import Path -from typing import (Any, Awaitable, Dict, Iterable, List, Literal, Mapping, - Optional, Tuple, Union) +from typing import (Any, Awaitable, Dict, Generic, Iterable, List, Literal, + Mapping, Optional, Tuple, TypeVar, Union) # yapf conflicts with isort for this block # yapf: disable @@ -23,7 +24,8 @@ from vllm.logger import init_logger from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import (async_get_and_parse_audio, - async_get_and_parse_image) + async_get_and_parse_image, + get_and_parse_audio, get_and_parse_image) from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) @@ -81,7 +83,11 @@ class ConversationMessage(TypedDict): content: str -class MultiModalItemTracker: +ModalityStr = Literal["image", "audio"] +_T = TypeVar("_T") + + +class BaseMultiModalItemTracker(ABC, Generic[_T]): """ Tracks multi-modal items in a given request and ensures that the number of multi-modal items in a given request does not exceed the configured @@ -89,37 +95,28 @@ class MultiModalItemTracker: """ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + super().__init__() + self._model_config = model_config self._tokenizer = tokenizer self._allowed_items = (model_config.multimodal_config.limit_per_prompt if model_config.multimodal_config else {}) self._consumed_items = {k: 0 for k in self._allowed_items} - self._futures: List[Awaitable[MultiModalDataDict]] = [] + + self._items: List[_T] = [] @staticmethod @lru_cache(maxsize=None) - def _cached_token_str(tokenizer: AnyTokenizer, token_index: int): + def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: return tokenizer.decode(token_index) - def add(self, modality: Literal["image", "audio"], - mm_future: Awaitable[MultiModalDataDict]) -> Optional[str]: - """ - Adds the multi-modal item to the current prompt and returns the - placeholder string to use, if any. 
- """ - allowed_count = self._allowed_items.get(modality, 1) - current_count = self._consumed_items.get(modality, 0) + 1 - if current_count > allowed_count: - raise ValueError( - f"At most {allowed_count} {modality}(s) may be provided in " - "one request.") - - self._consumed_items[modality] = current_count - self._futures.append(mm_future) - + def _placeholder_str(self, modality: ModalityStr, + current_count: int) -> Optional[str]: # TODO: Let user specify how to insert image tokens into prompt # (similar to chat template) - model_type = self._model_config.hf_config.model_type + hf_config = self._model_config.hf_config + model_type = hf_config.model_type + if modality == "image": if model_type == "phi3_v": # Workaround since this token is not defined in the tokenizer @@ -130,9 +127,8 @@ def add(self, modality: Literal["image", "audio"], # These models do not use image tokens in the prompt return None if model_type.startswith("llava"): - return MultiModalItemTracker._cached_token_str( - self._tokenizer, - self._model_config.hf_config.image_token_index) + return self._cached_token_str(self._tokenizer, + hf_config.image_token_index) if model_type in ("chameleon", "internvl_chat"): return "" @@ -145,11 +141,11 @@ def add(self, modality: Literal["image", "audio"], raise TypeError(f"Unknown modality: {modality}") @staticmethod - async def _combine(futures: List[Awaitable[MultiModalDataDict]]): + def _combine(items: List[MultiModalDataDict]) -> MultiModalDataDict: mm_lists: Mapping[str, List[object]] = defaultdict(list) # Merge all the multi-modal items - for single_mm_data in (await asyncio.gather(*futures)): + for single_mm_data in items: for mm_key, mm_item in single_mm_data.items(): if isinstance(mm_item, list): mm_lists[mm_key].extend(mm_item) @@ -162,9 +158,113 @@ async def _combine(futures: List[Awaitable[MultiModalDataDict]]): for mm_key, mm_list in mm_lists.items() } - def all_mm_data(self) -> Optional[Awaitable[MultiModalDataDict]]: - return MultiModalItemTracker._combine( - self._futures) if self._futures else None + def add(self, modality: ModalityStr, item: _T) -> Optional[str]: + """ + Add a multi-modal item to the current prompt and returns the + placeholder string to use, if any. 
+ """ + allowed_count = self._allowed_items.get(modality, 1) + current_count = self._consumed_items.get(modality, 0) + 1 + if current_count > allowed_count: + raise ValueError( + f"At most {allowed_count} {modality}(s) may be provided in " + "one request.") + + self._consumed_items[modality] = current_count + self._items.append(item) + + return self._placeholder_str(modality, current_count) + + @abstractmethod + def create_parser(self) -> "BaseMultiModalContentParser": + raise NotImplementedError + + +class MultiModalItemTracker(BaseMultiModalItemTracker[MultiModalDataDict]): + + def all_mm_data(self) -> Optional[MultiModalDataDict]: + return self._combine(self._items) if self._items else None + + def create_parser(self) -> "BaseMultiModalContentParser": + return MultiModalContentParser(self) + + +class AsyncMultiModalItemTracker( + BaseMultiModalItemTracker[Awaitable[MultiModalDataDict]]): + + async def all_mm_data(self) -> Optional[MultiModalDataDict]: + if self._items: + items = await asyncio.gather(*self._items) + return self._combine(items) + + return None + + def create_parser(self) -> "BaseMultiModalContentParser": + return AsyncMultiModalContentParser(self) + + +class BaseMultiModalContentParser(ABC): + + def __init__(self) -> None: + super().__init__() + + # multimodal placeholder_string : count + self._placeholder_counts: Dict[str, int] = defaultdict(lambda: 0) + + def _add_placeholder(self, placeholder: Optional[str]): + if placeholder: + self._placeholder_counts[placeholder] += 1 + + def mm_placeholder_counts(self) -> Dict[str, int]: + return dict(self._placeholder_counts) + + @abstractmethod + def parse_image(self, image_url: str) -> None: + raise NotImplementedError + + @abstractmethod + def parse_audio(self, audio_url: str) -> None: + raise NotImplementedError + + +class MultiModalContentParser(BaseMultiModalContentParser): + + def __init__(self, tracker: MultiModalItemTracker) -> None: + super().__init__() + + self._tracker = tracker + + def parse_image(self, image_url: str) -> None: + image = get_and_parse_image(image_url) + + placeholder = self._tracker.add("image", image) + self._add_placeholder(placeholder) + + def parse_audio(self, audio_url: str) -> None: + audio = get_and_parse_audio(audio_url) + + placeholder = self._tracker.add("audio", audio) + self._add_placeholder(placeholder) + + +class AsyncMultiModalContentParser(BaseMultiModalContentParser): + + def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: + super().__init__() + + self._tracker = tracker + + def parse_image(self, image_url: str) -> None: + image_coro = async_get_and_parse_image(image_url) + + placeholder = self._tracker.add("image", image_coro) + self._add_placeholder(placeholder) + + def parse_audio(self, audio_url: str) -> None: + audio_coro = async_get_and_parse_audio(audio_url) + + placeholder = self._tracker.add("audio", audio_coro) + self._add_placeholder(placeholder) def load_chat_template( @@ -197,10 +297,10 @@ def load_chat_template( # (similar to chat template) def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], text_prompt: str) -> str: - """Combine multimodal prompts for a multimodal language model""" + """Combine multimodal prompts for a multimodal language model.""" # Look through the text prompt to check for missing placeholders - missing_placeholders = [] + missing_placeholders: List[str] = [] for placeholder in placeholder_counts: # For any existing placeholder in the text prompt, we leave it as is @@ -227,12 +327,11 @@ def 
_get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], - mm_tracker: MultiModalItemTracker, + mm_tracker: BaseMultiModalItemTracker, ) -> List[ConversationMessage]: texts: List[str] = [] - # multimodal placeholder_string : count - mm_placeholder_counts: Dict[str, int] = {} + mm_parser = mm_tracker.create_parser() for part in parts: part_type = part["type"] @@ -247,22 +346,16 @@ def _parse_chat_message_content_parts( "'image_url.detail' is currently not supported and " "will be ignored.") - image_coro = async_get_and_parse_image(image_url["url"]) - placeholder = mm_tracker.add("image", image_coro) - if placeholder: - mm_placeholder_counts[placeholder] = mm_placeholder_counts.get( - placeholder, 0) + 1 + mm_parser.parse_image(image_url["url"]) elif part_type == "audio_url": audio_url = _AudioParser.validate_python(part)["audio_url"] - audio_coro = async_get_and_parse_audio(audio_url["url"]) - placeholder = mm_tracker.add("audio", audio_coro) - if placeholder: - mm_placeholder_counts[placeholder] = mm_placeholder_counts.get( - placeholder, 0) + 1 + + mm_parser.parse_audio(audio_url["url"]) else: raise NotImplementedError(f"Unknown part type: {part_type}") text_prompt = "\n".join(texts) + mm_placeholder_counts = mm_parser.mm_placeholder_counts() if mm_placeholder_counts: text_prompt = _get_full_multimodal_text_prompt(mm_placeholder_counts, text_prompt) @@ -271,8 +364,9 @@ def _parse_chat_message_content_parts( def _parse_chat_message_content( - message: ChatCompletionMessageParam, - mm_tracker: MultiModalItemTracker) -> List[ConversationMessage]: + message: ChatCompletionMessageParam, + mm_tracker: BaseMultiModalItemTracker, +) -> List[ConversationMessage]: role = message["role"] content = message.get("content") @@ -292,7 +386,7 @@ def parse_chat_messages( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, -) -> Tuple[List[ConversationMessage], Optional[Awaitable[MultiModalDataDict]]]: +) -> Tuple[List[ConversationMessage], Optional[MultiModalDataDict]]: conversation: List[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -304,6 +398,22 @@ def parse_chat_messages( return conversation, mm_tracker.all_mm_data() +def parse_chat_messages_futures( + messages: List[ChatCompletionMessageParam], + model_config: ModelConfig, + tokenizer: AnyTokenizer, +) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: + conversation: List[ConversationMessage] = [] + mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) + + for msg in messages: + sub_messages = _parse_chat_message_content(msg, mm_tracker) + + conversation.extend(sub_messages) + + return conversation, mm_tracker.all_mm_data() + + def apply_chat_template( tokenizer: AnyTokenizer, conversation: List[ConversationMessage], diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0edd4bfaecd6a..b32c90a4df1aa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -23,7 +23,7 @@ get_cached_tokenizer) from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter, deprecate_kwargs +from vllm.utils import Counter, deprecate_kwargs, is_list_of logger = init_logger(__name__) @@ -358,15 +358,18 @@ def chat( add_generation_prompt: bool = True, ) -> List[RequestOutput]: """ - Generates responses for chat messages. 
+ Generate responses for a chat conversation. - Converts the messages to prompts using the tokenizer and calls - the :meth:`generate` method to generate the responses. + The chat conversation is converted into a text prompt using the + tokenizer and calls the :meth:`generate` method to generate the + responses. + + Multi-modal inputs can be passed in the same way you would pass them + to the OpenAI API. Args: - messages: A list of messages to generate responses for. Each - message is a list of dictionaries with 'role' and 'content' - keys. + messages: A single conversation represented as a list of messages. + Each message is a dictionary with 'role' and 'content' keys. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it @@ -387,21 +390,25 @@ def chat( tokenizer = self.get_tokenizer() model_config = self.llm_engine.get_model_config() - conversations, _ = parse_chat_messages(messages, model_config, - tokenizer) + conversation, mm_data = parse_chat_messages(messages, model_config, + tokenizer) prompt = apply_chat_template( tokenizer, - conversations, + conversation, chat_template=chat_template, - add_generation_prompt=add_generation_prompt) + add_generation_prompt=add_generation_prompt, + ) inputs: PromptInputs - if isinstance(prompt, list) and isinstance(prompt[0], int): + if is_list_of(prompt, int): inputs = TokensPrompt(prompt_token_ids=prompt) else: inputs = TextPrompt(prompt=prompt) + if mm_data is not None: + inputs["multi_modal_data"] = mm_data + return self.generate( inputs, sampling_params=sampling_params, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index f7576509d06c8..a3bc0bb7b3554 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -11,7 +11,7 @@ from vllm.entrypoints.chat_utils import (ConversationMessage, apply_chat_template, load_chat_template, - parse_chat_messages) + parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -26,7 +26,6 @@ TextTokensPrompt) from vllm.inputs import TokensPrompt from vllm.logger import init_logger -from vllm.multimodal import MultiModalDataDict from vllm.outputs import RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -94,7 +93,7 @@ async def create_chat_completion( tokenizer = await self.async_engine_client.get_tokenizer( lora_request) - conversation, mm_data_future = parse_chat_messages( + conversation, mm_data_future = parse_chat_messages_futures( request.messages, model_config, tokenizer) tool_dicts = None if request.tools is None else [ @@ -114,10 +113,8 @@ async def create_chat_completion( logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) - mm_data: Optional[MultiModalDataDict] = None try: - if mm_data_future: - mm_data = await mm_data_future + mm_data = await mm_data_future except Exception as e: logger.error("Error in loading multi-modal data: %s", e) return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index fc9ca29e9cf86..c3c0d52072cd3 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -4,7 +4,7 @@ from 
vllm.engine.protocol import AsyncEngineClient from vllm.entrypoints.chat_utils import (apply_chat_template, load_chat_template, - parse_chat_messages) + parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -65,10 +65,11 @@ async def create_tokenize( if isinstance(request, TokenizeChatRequest): model_config = self.model_config - conversation, mm_data_future = parse_chat_messages( + conversation, mm_data_future = parse_chat_messages_futures( request.messages, model_config, tokenizer) - if mm_data_future: + mm_data = await mm_data_future + if mm_data: logger.warning( "Multi-modal inputs are ignored during tokenization") diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 4bed267e99637..b76b765bc677a 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -120,6 +120,16 @@ async def async_fetch_audio( return librosa.load(BytesIO(audio_bytes), sr=None) +def get_and_parse_audio(audio_url: str) -> MultiModalDataDict: + audio, sr = fetch_audio(audio_url) + return {"audio": (audio, sr)} + + +def get_and_parse_image(image_url: str) -> MultiModalDataDict: + image = fetch_image(image_url) + return {"image": image} + + async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict: audio, sr = await async_fetch_audio(audio_url) return {"audio": (audio, sr)} diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 23ecfc0af6be4..533a86b787325 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -52,12 +52,13 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None: assert isinstance(self.tokenizer, (Tekkenizer, SentencePieceTokenizer)), type( self.tokenizer) - self._is_tekken = isinstance(self.tokenizer, Tekkenizer) - if self._is_tekken: + if (is_tekken := isinstance(self.tokenizer, Tekkenizer)): # Make sure special tokens will not raise self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE + self._is_tekken = is_tekken + # the following attributes are set to fit VLLM's design self.is_fast = True self.chat_template = True From ccd72071911951a3eb73b52a1578c8e6e51130d7 Mon Sep 17 00:00:00 2001 From: TimWang <7367474+haitwang-cloud@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:17:05 +0800 Subject: [PATCH 38/51] chore: Update check-wheel-size.py to read MAX_SIZE_MB from env (#8103) --- .buildkite/check-wheel-size.py | 35 ++++++++++++++++++++-------------- Dockerfile | 13 ++++++++++--- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index b39dce2659a54..0412c5f37952d 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,36 +1,43 @@ import os +import sys import zipfile -MAX_SIZE_MB = 250 +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) def print_top_10_largest_files(zip_file): + """Print the top 10 largest files in the given zip file.""" with zipfile.ZipFile(zip_file, 'r') as z: file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] file_sizes.sort(key=lambda x: x[1], reverse=True) for f, size in file_sizes[:10]: - print(f"{f}: {size/(1024*1024)} MBs uncompressed.") + print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.") def check_wheel_size(directory): + """Check the size of .whl files in the given directory.""" for root, _, 
files in os.walk(directory): - for f in files: - if f.endswith(".whl"): - wheel_path = os.path.join(root, f) - wheel_size = os.path.getsize(wheel_path) - wheel_size_mb = wheel_size / (1024 * 1024) - if wheel_size_mb > MAX_SIZE_MB: - print( - f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) " - f"compare to the allowed size ({MAX_SIZE_MB} MB).") + for file_name in files: + if file_name.endswith(".whl"): + wheel_path = os.path.join(root, file_name) + wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) + if wheel_size_mb > VLLM_MAX_SIZE_MB: + print(f"Not allowed: Wheel {wheel_path} is larger " + f"({wheel_size_mb:.2f} MB) than the limit " + f"({VLLM_MAX_SIZE_MB} MB).") print_top_10_largest_files(wheel_path) return 1 else: print(f"Wheel {wheel_path} is within the allowed size " - f"({wheel_size_mb} MB).") + f"({wheel_size_mb:.2f} MB).") return 0 if __name__ == "__main__": - import sys - sys.exit(check_wheel_size(sys.argv[1])) + if len(sys.argv) < 2: + print("Usage: python check-wheel-size.py ") + sys.exit(1) + + directory = sys.argv[1] + sys.exit(check_wheel_size(directory)) \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index ec6069f605eb1..7f255e1d6e93e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -108,10 +108,17 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi -# check the size of the wheel, we cannot upload wheels larger than 100MB +# Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py -RUN python3 check-wheel-size.py dist - +# Default max size of the wheel is 250MB +ARG VLLM_MAX_SIZE_MB=250 +ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB +ARG RUN_WHEEL_CHECK=true +RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ + python3 check-wheel-size.py dist; \ + else \ + echo "Skipping wheel size check."; \ + fi #################### EXTENSION Build IMAGE #################### #################### DEV IMAGE #################### From d3311562fbe740a883e7f03f0b59620587cabb29 Mon Sep 17 00:00:00 2001 From: wnma Date: Wed, 4 Sep 2024 18:55:37 +0800 Subject: [PATCH 39/51] [Bugfix] remove post_layernorm in siglip (#8106) --- vllm/model_executor/models/siglip.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 114dbf09b0c53..0bee75e2f0cbb 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -443,14 +443,27 @@ def __init__( self.config = config embed_dim = config.hidden_size + if (num_hidden_layers_override is None + or num_hidden_layers_override == config.num_hidden_layers): + self.need_post_layernorm = True + elif num_hidden_layers_override > config.num_hidden_layers: + raise ValueError( + "num_hidden_layers_override cannot be greater than " + "num_hidden_layers") + else: + self.need_post_layernorm = False + self.embeddings = SiglipVisionEmbeddings(config) self.encoder = SiglipEncoder( config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, ) - self.post_layernorm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) + if self.need_post_layernorm: + self.post_layernorm = nn.LayerNorm(embed_dim, + eps=config.layer_norm_eps) + else: + self.post_layernorm = nn.Identity() self.use_head = (True if not hasattr(config, "vision_use_head") else config.vision_use_head) if self.use_head: @@ -470,7 +483,6 @@ def forward( encoder_outputs = 
self.encoder(inputs_embeds=hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) - # TODO: add this back when pooled_output is used in inference # if self.use_head: # pooled_output = self.head(last_hidden_state) @@ -499,6 +511,10 @@ def __init__( num_hidden_layers_override=num_hidden_layers_override, ) + @property + def need_post_layernorm(self): + return self.vision_model.need_post_layernorm + def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @@ -517,6 +533,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: + # post_layernorm is optional in SiglipVisionModel + if ("vision_model.post_layernorm" in name + and not self.need_post_layernorm): + continue + # omit layers when num_hidden_layers_override is set if "vision_model.encoder.layers." in name: layer_idx = int(name.split(".")[3]) From 2ad2e5608eeede10683412bbbfaf30b3a68019dc Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 4 Sep 2024 11:53:25 -0700 Subject: [PATCH 40/51] [MISC] Consolidate FP8 kv-cache tests (#8131) --- .buildkite/run-cpu-test.sh | 7 +- .../basic_correctness/test_chunked_prefill.py | 43 +---- tests/models/test_fp8.py | 181 ++++++++---------- tests/models/test_fp8kv_flashinfer.py | 96 ---------- 4 files changed, 94 insertions(+), 233 deletions(-) delete mode 100644 tests/models/test_fp8kv_flashinfer.py diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 8e4be08f3aba0..ca9cf15780e25 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,12 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \ + --ignore=tests/models/test_oot_registration.py \ + --ignore=tests/models/test_registry.py \ + --ignore=tests/models/test_fp8.py \ + --ignore=tests/models/test_jamba.py \ + --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index a63ac380e8598..9c34b2a13fd53 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -16,18 +16,6 @@ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", ] -E5M2_KV_MODELS = [ - "facebook/opt-125m", - "meta-llama/Llama-2-7b-chat-hf", -] -E4M3_KV_MODELS = [ - "meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", - "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme" -] -KV_CACHE_QUANTIZATION_PATHS = { - "meta-llama/Llama-2-7b-chat-hf": - "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json" -} @pytest.mark.parametrize("model", MODELS) @@ -78,10 +66,10 @@ def test_models( ) -@pytest.mark.parametrize("kv_cache_dtype,model", - [("fp8_e5m2", m) - for m in E5M2_KV_MODELS] + [("fp8_e4m3", m) - for m in E4M3_KV_MODELS]) +@pytest.mark.parametrize( + "kv_cache_dtype,model", + [("fp8_e4m3", + 
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("chunked_prefill_token_size", [4, 16]) @@ -104,30 +92,15 @@ def test_models_with_fp8_kv_cache( disable_async_output_proc: bool, ) -> None: """ - Only checks log probs match between chunked-prefill and - non-chunked-prefill version of vLLM model runner. - - This test is used when there is discrepancy in kernels - / numerics (e.g. when using lower-precision types like FP8). + Check output logprobs match between no_chunked_prefill and chunked_prefill + with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py, + so here we only check chunked prefill. """ NUM_LOG_PROBS = 8 - if model == "facebook/opt-125m": - pytest.skip( - "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m" - ) - if ((model, kv_cache_dtype, chunked_prefill_token_size) == ( - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", "fp8_e4m3", 4)): - pytest.skip("flakey test, see: #7874 #8051") - max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size - extra_kwargs = {} - if model in KV_CACHE_QUANTIZATION_PATHS: - extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[ - model] - with vllm_runner( model, tensor_parallel_size=tensor_parallel_size, @@ -135,7 +108,6 @@ def test_models_with_fp8_kv_cache( max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, ) as vllm_model: no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -149,7 +121,6 @@ def test_models_with_fp8_kv_cache( max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, ) as vllm_model: chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 4ab968c01da04..17acdb52322fd 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,116 +3,97 @@ Note: these tests will only pass on L4 GPU. """ import os -from typing import List +from typing import Optional import pytest -import torch -from transformers import AutoTokenizer +from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", - "meta-llama/Meta-Llama-3-8B-Instruct", -] +from ..models.utils import check_logprobs_close -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system made up of several basic components that work together to enable it to', - 'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk' - ] - }, - "meta-llama/Meta-Llama-3-8B-Instruct": { - "auto": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu' - ] - }, -} +os.environ["TOKENIZERS_PARALLELISM"] = "true" -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. 
-# Disabled to prevent it from breaking the build -@pytest.mark.skip( - reason= - "Prevent unstable test based on golden strings from breaking the build.") @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -def test_models(example_prompts, model_name, kv_cache_dtype) -> None: - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - enforce_eager=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) +@pytest.mark.parametrize( + "kv_cache_dtype,base_model,test_model,scale_path", + [ + # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. + ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + # Test FP16 checkpoint w. fp8_e5m2 kv-cache. + ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-8B-Instruct", None), + # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. + ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-7b-chat-hf", + "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") + ]) +# Due to low-precision numerical divergence, we only test logprob of 4 tokens +@pytest.mark.parametrize("max_tokens", [4]) +@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) +# NOTE: Increasing this in this suite will fail CI because we currently cannot +# reset distributed env properly. Use a value > 1 just when you test. +@pytest.mark.parametrize("tensor_parallel_size", [1]) +# Due to low-precision numerical divergence, this test is too sensitive for +# the async postprocessor +@pytest.mark.parametrize("disable_async_output_proc", [True]) +def test_models( + vllm_runner, + example_prompts, + kv_cache_dtype: str, + base_model: str, + test_model: str, + scale_path: Optional[str], + max_tokens: int, + enforce_eager: bool, + backend: str, + tensor_parallel_size: int, + disable_async_output_proc: bool, + monkeypatch, +) -> None: + """ + Only checks log probs match to cover the discrepancy in + numerical sensitive kernels. + """ + override_backend_env_variable(monkeypatch, backend) + + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 + + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - tokenizer = AutoTokenizer.from_pretrained(model_name) - formatted_prompts = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - tokenize=False, - add_generation_prompt=True) - for prompt in example_prompts - ] + extra_kwargs = {} + if scale_path is not None: + extra_kwargs["quantization_param_path"] = scale_path - params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] - # Note: these need to be run 1 at a time due to numerical precision, - # since the expected strs were generated this way. 
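# A simplified illustration of the logprob-closeness criterion that the
# consolidated FP8 test above relies on through check_logprobs_close: rather
# than matching golden strings exactly, a token sampled by one configuration
# only has to appear among the other configuration's top log-probabilities.
# This is a sketch of the idea, not the actual tests.models.utils helper.
from typing import Dict, List, Tuple

# Each output is (sampled token ids, per-position top-k logprobs keyed by id).
SampleOutput = Tuple[List[int], List[Dict[int, float]]]


def logprobs_are_close(output_a: SampleOutput,
                       output_b: SampleOutput) -> bool:
    tokens_a, logprobs_a = output_a
    tokens_b, logprobs_b = output_b
    for pos, (tok_a, tok_b) in enumerate(zip(tokens_a, tokens_b)):
        if tok_a == tok_b:
            continue
        # Diverging tokens are tolerated only if each side's choice is still
        # ranked highly by the other side.
        if tok_a not in logprobs_b[pos] or tok_b not in logprobs_a[pos]:
            return False
    return True


# Positions 0 and 1 agree; position 2 diverges, but both choices remain in
# the other run's top-k, so the two runs are considered close.
fp16_run: SampleOutput = ([1, 7, 42],
                          [{1: -0.1}, {7: -0.2}, {42: -0.3, 43: -0.9}])
fp8_run: SampleOutput = ([1, 7, 43],
                         [{1: -0.1}, {7: -0.2}, {43: -0.4, 42: -0.8}])
assert logprobs_are_close(fp16_run, fp8_run)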
- for prompt in formatted_prompts: - outputs = model.generate(prompt, params) - generations.append(outputs[0].outputs[0].text) - del model + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + **extra_kwargs, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - print(model_name, kv_cache_dtype, generations) - expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] - for i in range(len(example_prompts)): - generated_str = generations[i] - expected_str = expected_strs[i] - assert expected_str == generated_str, ( - f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}") + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="fp16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/tests/models/test_fp8kv_flashinfer.py b/tests/models/test_fp8kv_flashinfer.py deleted file mode 100644 index ff2a44162b6c3..0000000000000 --- a/tests/models/test_fp8kv_flashinfer.py +++ /dev/null @@ -1,96 +0,0 @@ -# flake8: noqa -"""Tests fp8 models against ground truth generation -This verifies the flashinfer backend with fp8 -quantization and fp8 KV Cache without scaling -factors Note: these tests will only pass on H100 GPU. -""" -import os -from typing import List - -import pytest -from transformers import AutoTokenizer - -from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams - -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8", -] - -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the sterile, metallic halls of the robotics lab, a peculiar phenomenon occurred. Zeta-5', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, mushi o', - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o', - ] - } -} - - -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. -# No assert to prevent it from breaking the build -@pytest.mark.skipif(not is_quant_method_supported("fp8"), - reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -@pytest.mark.parametrize("backend", ["XFORMERS", "FLASHINFER"]) -def test_models(example_prompts, model_name, kv_cache_dtype, backend) -> None: - # Note that the golden strings may not work for FLASHINFER Backend. - # The intention is to test the path - os.environ["VLLM_ATTENTION_BACKEND"] = backend - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - formatted_prompts = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - tokenize=False, - add_generation_prompt=True) - for prompt in example_prompts - ] - - params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] - # Note: these need to be run 1 at a time due to numerical precision, - # since the expected strs were generated this way. - for prompt in formatted_prompts: - outputs = model.generate(prompt, params) - generations.append(outputs[0].outputs[0].text) - del model - - print(f"Testing: {model_name} with kv_cache_dtype: {kv_cache_dtype}") - expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] - for i in range(len(example_prompts)): - generated_str = generations[i] - expected_str = expected_strs[i] - print(f"generated_str\n: {generated_str}") - print(f"expected_str\n: {expected_str}") From d1dec6424307a6070bf3ab1700633996f20ef248 Mon Sep 17 00:00:00 2001 From: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:57:54 -0400 Subject: [PATCH 41/51] [CI/Build][ROCm] Enabling LoRA tests on ROCm (#7369) Co-authored-by: Simon Mo --- .buildkite/run-amd-test.sh | 47 +++++++++++++++++++++++++++++----- .buildkite/test-pipeline.yaml | 3 +-- tests/lora/test_gemma.py | 4 +++ tests/lora/test_quant_model.py | 24 ++++++++++++----- 4 files changed, 64 insertions(+), 14 deletions(-) mode change 100644 => 100755 .buildkite/run-amd-test.sh diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh old mode 100644 new mode 100755 index 5548071390aff..972c62a091aea --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,5 +1,5 @@ # This script runs test inside the corresponding ROCm docker container. -set -ex +set -o pipefail # Print ROCm version echo "--- Confirming Clean Initial State" @@ -70,16 +70,51 @@ HF_CACHE="$(realpath ~)/huggingface" mkdir -p ${HF_CACHE} HF_MOUNT="/root/.cache/huggingface" -docker run \ +commands=$@ +PARALLEL_JOB_COUNT=8 +# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
+if [[ $commands == *"--shard-id="* ]]; then + for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do + #replace shard arguments + commands=${@//"--shard-id= "/"--shard-id=${GPU} "} + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + docker run \ --device /dev/kfd --device /dev/dri \ --network host \ --shm-size=16gb \ --rm \ - -e HIP_VISIBLE_DEVICES=0 \ + -e HIP_VISIBLE_DEVICES=${GPU} \ -e HF_TOKEN \ -v ${HF_CACHE}:${HF_MOUNT} \ -e HF_HOME=${HF_MOUNT} \ - --name ${container_name} \ + --name ${container_name}_${GPU} \ ${image_name} \ - /bin/bash -c "${@}" - + /bin/bash -c "${commands}" \ + |& while read -r line; do echo ">>Shard $GPU: $line"; done & + PIDS+=($!) + done + #wait for all processes to finish and collect exit codes + for pid in ${PIDS[@]}; do + wait ${pid} + STATUS+=($?) + done + for st in ${STATUS[@]}; do + if [[ ${st} -ne 0 ]]; then + echo "One of the processes failed with $st" + exit ${st} + fi + done +else + docker run \ + --device /dev/kfd --device /dev/dri \ + --network host \ + --shm-size=16gb \ + --rm \ + -e HIP_VISIBLE_DEVICES=0 \ + -e HF_TOKEN \ + -v ${HF_CACHE}:${HF_MOUNT} \ + -e HF_HOME=${HF_MOUNT} \ + --name ${container_name} \ + ${image_name} \ + /bin/bash -c "${commands}" +fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 86eddb576c42a..65e1862ce8181 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -218,9 +218,9 @@ steps: - pytest -v -s spec_decode - label: LoRA Test %N # 30min each + mirror_hardwares: [amd] source_file_dependencies: - vllm/lora - - csrc/punica - tests/lora command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py parallelism: 4 @@ -360,7 +360,6 @@ steps: num_gpus: 4 source_file_dependencies: - vllm/lora - - csrc/punica - tests/lora/test_long_context commands: # FIXIT: find out which code initialize cuda before running the test diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 709246179bfe4..58cac3156c9c1 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,7 +1,10 @@ from typing import List +import pytest + import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hip MODEL_PATH = "google/gemma-7b" @@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 2370c693e9534..133e0d4514a6d 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -7,6 +7,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hip from .conftest import cleanup @@ -17,12 +18,23 @@ class ModelWithQuantization: quantization: str -MODELS: List[ModelWithQuantization] = [ - ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - quantization="AWQ"), - ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="GPTQ"), -] +MODELS: List[ModelWithQuantization] +#AWQ quantization is currently not supported in ROCm. 
+if is_hip(): + MODELS = [ + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + quantization="GPTQ"), + ] +else: + MODELS = [ + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + quantization="AWQ"), + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + quantization="GPTQ"), + ] def do_sample(llm: vllm.LLM, From 561d6f8077c54c7af5dbf2ed92131ce9f7d9b56b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 4 Sep 2024 13:05:50 -0700 Subject: [PATCH 42/51] [CI] Change test input in Gemma LoRA test (#8163) --- tests/lora/test_gemma.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 58cac3156c9c1..f7c1d4f041c12 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -13,7 +13,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "Quote: Imagination is", "Quote: Be yourself;", - "Quote: So many books,", + "Quote: Painting is poetry that is seen rather than felt,", ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) outputs = llm.generate( @@ -41,7 +41,8 @@ def test_gemma_lora(gemma_lora_files): expected_lora_output = [ "more important than knowledge.\nAuthor: Albert Einstein\n", "everyone else is already taken.\nAuthor: Oscar Wilde\n", - "so little time\nAuthor: Frank Zappa\n", + "and poetry is painting that is felt rather than seen.\n" + "Author: Leonardo da Vinci\n", ] output1 = do_sample(llm, gemma_lora_files, lora_id=1) From e02ce498be2e11a165803d4590588ba98f129797 Mon Sep 17 00:00:00 2001 From: Kyle Mistele Date: Wed, 4 Sep 2024 15:18:13 -0500 Subject: [PATCH 43/51] [Feature] OpenAI-Compatible Tools API + Streaming for Hermes & Mistral models (#5649) Co-authored-by: constellate Co-authored-by: Kyle Mistele --- .buildkite/test-pipeline.yaml | 10 + .../serving/openai_compatible_server.md | 58 ++- ...penai_chat_completion_client_with_tools.py | 162 +++++++++ examples/tool_chat_template_hermes.jinja | 129 +++++++ examples/tool_chat_template_mistral.jinja | 86 +++++ .../tool_chat_template_mistral_parallel.jinja | 94 +++++ requirements-common.txt | 1 + tests/tool_use/__init__.py | 0 tests/tool_use/conftest.py | 32 ++ tests/tool_use/test_chat_completions.py | 143 ++++++++ tests/tool_use/test_parallel_tool_calls.py | 193 ++++++++++ tests/tool_use/test_tool_calls.py | 192 ++++++++++ tests/tool_use/utils.py | 215 +++++++++++ vllm/entrypoints/chat_utils.py | 101 ++++- vllm/entrypoints/openai/api_server.py | 8 +- vllm/entrypoints/openai/cli_args.py | 18 + vllm/entrypoints/openai/protocol.py | 125 ++++++- vllm/entrypoints/openai/serving_chat.py | 275 ++++++++++++-- .../openai/serving_tokenization.py | 6 +- .../openai/tool_parsers/__init__.py | 5 + .../tool_parsers/abstract_tool_parser.py | 58 +++ .../openai/tool_parsers/hermes_tool_parser.py | 344 ++++++++++++++++++ .../tool_parsers/mistral_tool_parser.py | 293 +++++++++++++++ vllm/entrypoints/openai/tool_parsers/utils.py | 87 +++++ .../guided_decoding/__init__.py | 5 +- .../guided_decoding/outlines_decoding.py | 31 +- 26 files changed, 2588 insertions(+), 83 deletions(-) create mode 100644 examples/openai_chat_completion_client_with_tools.py create mode 100644 examples/tool_chat_template_hermes.jinja create mode 100644 examples/tool_chat_template_mistral.jinja create mode 100644 examples/tool_chat_template_mistral_parallel.jinja create mode 100644 tests/tool_use/__init__.py create mode 100644 tests/tool_use/conftest.py 
create mode 100644 tests/tool_use/test_chat_completions.py create mode 100644 tests/tool_use/test_parallel_tool_calls.py create mode 100644 tests/tool_use/test_tool_calls.py create mode 100644 tests/tool_use/utils.py create mode 100644 vllm/entrypoints/openai/tool_parsers/__init__.py create mode 100644 vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/utils.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 65e1862ce8181..d50d8f32a816d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -92,6 +92,7 @@ steps: - pytest -v -s entrypoints/openai - pytest -v -s entrypoints/test_chat_utils.py + - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -271,6 +272,15 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - bash ./run-tests.sh -c configs/models-small.txt -t 1 +- label: OpenAI-Compatible Tool Use # 20 min + fast_check: false + mirror_hardwares: [ amd ] + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s tool_use + ##### 1 GPU test ##### ##### multi gpus test ##### diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index b2acde390083c..eb4ea0fb5655e 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -110,6 +110,14 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) :func: create_parser_for_docs :prog: vllm serve ``` +## Tool Calling in the Chat Completion API +### Named Function Calling +vLLM supports only named function calling in the chat completion API by default. It does so using Outlines, so this is +enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a +high-quality one. + +To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and +specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. ### Config file @@ -140,10 +148,52 @@ The order of priorities is `command line > config file values > defaults`. ## Tool calling in the chat completion API vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap. -To use a named function you need to define the function in the `tools` parameter and call it in the `tool_choice` parameter. - -It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. **This may change in the future.** +It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. -Please refer to the OpenAI API reference documentation for more information. + +### Automatic Function Calling +To enable this feature, you should set the following flags: +* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it +deems appropriate. 
+* `--tool-call-parser` -- Select the tool parser to use -- currently either `hermes` or `mistral`. Additional tool parsers
+will continue to be added in the future.
+* `--chat-template` -- **optional** for auto tool choice. The path to the chat template which handles `tool`-role messages and `assistant`-role messages
+that contain previously generated tool calls. Hermes and Mistral models have tool-compatible chat templates in their
+`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat
+template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates)
+from HuggingFace, and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json).
+
+If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template!
+
+#### Hermes Models
+All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported.
+* `NousResearch/Hermes-2-Pro-*`
+* `NousResearch/Hermes-2-Theta-*`
+* `NousResearch/Hermes-3-*`
+
+
+_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge
+step in their creation_.
+
+Flags: `--tool-call-parser hermes`
+
+#### Mistral Models
+Supported models:
+* `mistralai/Mistral-7B-Instruct-v0.3` (confirmed)
+* Additional Mistral function-calling models are compatible as well.
+
+Known issues:
+1. Mistral 7B struggles to generate parallel tool calls correctly.
+2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
+much shorter than what vLLM generates. Since an exception is thrown when this condition
+is not met, the following additional chat templates are provided:
+
+* `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that
+it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits)
+* `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt
+when tools are provided, which results in much better reliability when working with parallel tool calling.
+
+
+Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/openai_chat_completion_client_with_tools.py
new file mode 100644
index 0000000000000..2bbe42b6bd2ef
--- /dev/null
+++ b/examples/openai_chat_completion_client_with_tools.py
@@ -0,0 +1,162 @@
+"""
+Set up this example by starting a vLLM OpenAI-compatible server with tool call
+options enabled. For example:
+
+IMPORTANT: for Mistral, you must use one of the provided Mistral tool call
+templates, or your own - the model default doesn't work for tool calls with vLLM.
+See the vLLM docs on OpenAI server & tool calling for more details.
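The named function calling mode documented above needs no extra server flags: the request simply pins `tool_choice` to one of the supplied tools, and guided decoding guarantees the arguments parse against that tool's JSON schema. A minimal sketch, assuming a vLLM server on the default port; the weather tool here mirrors the one used throughout this patch:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string"},
                "state": {"type": "string"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["city", "state", "unit"],
        },
    },
}]

completion = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{
        "role": "user",
        "content": "What is the weather in Dallas, TX in Fahrenheit?"
    }],
    tools=tools,
    # Name one of the tools explicitly; this path works without
    # --enable-auto-tool-choice or a tool parser.
    tool_choice={"type": "function",
                 "function": {"name": "get_current_weather"}},
)

# The arguments are guaranteed to be valid JSON for the tool's schema.
print(completion.choices[0].message.tool_calls[0].function.arguments)
```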
+ +vllm serve --model mistralai/Mistral-7B-Instruct-v0.3 \ + --chat-template examples/tool_chat_template_mistral.jinja \ + --enable-auto-tool-choice --tool-call-parser mistral + +OR +vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \ + --chat-template examples/tool_chat_template_hermes.jinja \ + --enable-auto-tool-choice --tool-call-parser hermes +""" +import json + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +tools = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": + "string", + "description": + "The city to find the weather for, e.g. 'San Francisco'" + }, + "state": { + "type": + "string", + "description": + "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'" + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["city", "state", "unit"] + } + } +}] + +messages = [{ + "role": "user", + "content": "Hi! How are you doing today?" +}, { + "role": "assistant", + "content": "I'm doing well! How can I help you?" +}, { + "role": + "user", + "content": + "Can you tell me what the temperate will be in Dallas, in fahrenheit?" +}] + +chat_completion = client.chat.completions.create(messages=messages, + model=model, + tools=tools) + +print("Chat completion results:") +print(chat_completion) +print("\n\n") + +tool_calls_stream = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=True) + +chunks = [] +for chunk in tool_calls_stream: + chunks.append(chunk) + if chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls[0]) + else: + print(chunk.choices[0].delta) + +arguments = [] +tool_call_idx = -1 +for chunk in chunks: + + if chunk.choices[0].delta.tool_calls: + tool_call = chunk.choices[0].delta.tool_calls[0] + + if tool_call.index != tool_call_idx: + if tool_call_idx >= 0: + print( + f"streamed tool call arguments: {arguments[tool_call_idx]}" + ) + tool_call_idx = chunk.choices[0].delta.tool_calls[0].index + arguments.append("") + if tool_call.id: + print(f"streamed tool call id: {tool_call.id} ") + + if tool_call.function: + if tool_call.function.name: + print(f"streamed tool call name: {tool_call.function.name}") + + if tool_call.function.arguments: + arguments[tool_call_idx] += tool_call.function.arguments + +if len(arguments): + print(f"streamed tool call arguments: {arguments[-1]}") + +print("\n\n") + +messages.append({ + "role": "assistant", + "tool_calls": chat_completion.choices[0].message.tool_calls +}) + + +# Now, simulate a tool call +def get_current_weather(city: str, state: str, unit: 'str'): + return ("The weather in Dallas, Texas is 85 degrees fahrenheit. 
It is " + "partly cloudly, with highs in the 90's.") + + +available_tools = {"get_current_weather": get_current_weather} + +completion_tool_calls = chat_completion.choices[0].message.tool_calls +for call in completion_tool_calls: + tool_to_call = available_tools[call.function.name] + args = json.loads(call.function.arguments) + result = tool_to_call(**args) + print(result) + messages.append({ + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name + }) + +chat_completion_2 = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=False) +print("\n\n") +print(chat_completion_2) diff --git a/examples/tool_chat_template_hermes.jinja b/examples/tool_chat_template_hermes.jinja new file mode 100644 index 0000000000000..b18b463032d4f --- /dev/null +++ b/examples/tool_chat_template_hermes.jinja @@ -0,0 +1,129 @@ +{%- macro json_to_python_type(json_spec) %} + {%- set basic_type_map = { + "string": "str", + "number": "float", + "integer": "int", + "boolean": "bool" +} %} + + {%- if basic_type_map[json_spec.type] is defined %} + {{- basic_type_map[json_spec.type] }} + {%- elif json_spec.type == "array" %} + {{- "list[" + json_to_python_type(json_spec|items) + "]" }} + {%- elif json_spec.type == "object" %} + {%- if json_spec.additionalProperties is defined %} + {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }} + {%- else %} + {{- "dict" }} + {%- endif %} + {%- elif json_spec.type is iterable %} + {{- "Union[" }} + {%- for t in json_spec.type %} + {{- json_to_python_type({"type": t}) }} + {%- if not loop.last %} + {{- "," }} + {%- endif %} + {%- endfor %} + {{- "]" }} + {%- else %} + {{- "Any" }} + {%- endif %} +{%- endmacro %} + + +{{- bos_token }} +{{- "<|im_start|>system\nYou are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: " }} +{%- if tools is iterable and tools | length > 0 %} + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- '{"type": "function", "function": ' }} + {{- '{"name": "' + tool.name + '", ' }} + {{- '"description": "' + tool.name + '(' }} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- param_name + ": " + json_to_python_type(param_fields) }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- if tool.return is defined %} + {{- " -> " + json_to_python_type(tool.return) }} + {%- endif %} + {{- " - " + tool.description + "\n\n" }} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {%- if loop.first %} + {{- " Args:\n" }} + {%- endif %} + {{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }} + {%- endfor %} + {%- if tool.return is defined and tool.return.description is defined %} + {{- "\n Returns:\n " + tool.return.description }} + {%- endif %} + {{- '"' }} + {{- ', "parameters": ' }} + {%- if tool.parameters.properties | length == 0 %} + {{- "{}" }} + {%- else %} + {{- tool.parameters|tojson }} + {%- endif %} + {{- "}" }} + {%- if not loop.last %} + {{- "\n" }} + {%- endif %} + {%- endfor %} +{%- endif %} +{{- " " }} +{{- 'Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"], "title": "FunctionCall", "type": "object"}} +' }} +{{- "For each function call return a json object with function name and arguments within XML tags as follows: +" }} +{{- " +" }} +{{- '{"name": , "arguments": } +' }} +{{- '<|im_end|>' }} +{%- for message in messages %} + {%- if message.role == "user" or message.role == "system" or (message.role == "assistant" and message.tool_calls is not defined) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" and message.tool_calls is defined %} + {{- '<|im_start|>' + message.role }} + {%- for tool_call in message.tool_calls %} + {{- '\n\n' }} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '{' }} + {{- '"name": "' }} + {{- tool_call.name }} + {{- '"}' }} + {{- ', ' }} + {%- if tool_call.arguments is defined %} + {{- '"arguments": ' }} + {{- tool_call.arguments|tojson }} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>tool\n' }} + {%- endif %} + {{- '\n' }} + {{- message.content }} + {%- if not loop.last %} + {{- '\n\n' }} + {%- else %} + {{- '\n' }} + {%- endif %} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>' }} + {%- elif loop.last %} + {{- '<|im_end|>' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/examples/tool_chat_template_mistral.jinja b/examples/tool_chat_template_mistral.jinja new file mode 100644 index 0000000000000..49691f59c2f2c --- /dev/null +++ b/examples/tool_chat_template_mistral.jinja @@ -0,0 +1,86 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set 
loop_messages = messages %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %} + {%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} +{%- endfor %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if message["role"] == "user" %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- "[AVAILABLE_TOOLS] [" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- '{"type": "function", "function": {' }} + {%- for key, val in tool.items() if key != "return" %} + {%- if val is string %} + {{- '"' + key + '": "' + val + '"' }} + {%- else %} + {{- '"' + key + '": ' + val|tojson }} + {%- endif %} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- "}}" }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" }} + {%- endif %} + {%- endfor %} + {{- "[/AVAILABLE_TOOLS]" }} + {%- endif %} + {%- if loop.last and system_message is defined %} + {{- "[INST] " + system_message + "\n\n" + message["content"] + "[/INST]" }} + {%- else %} + {{- "[INST] " + message["content"] + "[/INST]" }} + {%- endif %} + {%- elif message["role"] == "tool_calls" or message.tool_calls is defined %} + {%- if message.tool_calls is defined %} + {%- set tool_calls = message.tool_calls %} + {%- else %} + {%- set tool_calls = message.content %} + {%- endif %} + {{- "[TOOL_CALLS] [" }} + {%- for tool_call in tool_calls %} + {%- set out = tool_call.function|tojson %} + {{- out[:-1] }} + {%- if not tool_call.id is defined or tool_call.id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + tool_call.id) }} + {%- endif %} + {{- ', "id": "' + tool_call.id[-9:] + '"}' }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" + eos_token }} + {%- endif %} + {%- endfor %} + {%- elif message["role"] == "assistant" %} + {{- " " + message["content"] + eos_token }} + {%- elif message["role"] == "tool_results" or message["role"] == "tool" %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }} + {%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! 
(2)" + message.tool_call_id) }} + {%- endif %} + {{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} diff --git a/examples/tool_chat_template_mistral_parallel.jinja b/examples/tool_chat_template_mistral_parallel.jinja new file mode 100644 index 0000000000000..a294cbfd026be --- /dev/null +++ b/examples/tool_chat_template_mistral_parallel.jinja @@ -0,0 +1,94 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} +{%- if tools is defined %} + {%- set parallel_tool_prompt = "You are a helpful assistant that can call tools. If you call one or more tools, format them in a single JSON array or objects, where each object is a tool call, not as separate objects outside of an array or multiple arrays. Use the format [{\"name\": tool call name, \"arguments\": tool call arguments}, additional tool calls] if you call more than one tool. If you call tools, do not attempt to interpret them or otherwise provide a response until you receive a tool call result that you can interpret for the user." %} + {%- if system_message is defined %} + {%- set system_message = parallel_tool_prompt + "\n\n" + system_message %} + {%- else %} + {%- set system_message = parallel_tool_prompt %} + {%- endif %} +{%- endif %} +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %} + {%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} +{%- endfor %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if message["role"] == "user" %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- "[AVAILABLE_TOOLS] [" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- '{"type": "function", "function": {' }} + {%- for key, val in tool.items() if key != "return" %} + {%- if val is string %} + {{- '"' + key + '": "' + val + '"' }} + {%- else %} + {{- '"' + key + '": ' + val|tojson }} + {%- endif %} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- "}}" }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" }} + {%- endif %} + {%- endfor %} + {{- "[/AVAILABLE_TOOLS]" }} + {%- endif %} + {%- if loop.last and system_message is defined %} + {{- "[INST] " + system_message + "\n\n" + message["content"] + "[/INST]" }} + {%- else %} + {{- "[INST] " + message["content"] + "[/INST]" }} + {%- endif %} + {%- elif message["role"] == "tool_calls" or message.tool_calls is defined %} + {%- if message.tool_calls is defined %} + {%- set tool_calls = message.tool_calls %} + {%- else %} + {%- set tool_calls = message.content %} + {%- endif %} + {{- "[TOOL_CALLS] [" }} + {%- for tool_call in tool_calls %} + {%- set out = tool_call.function|tojson %} + {{- out[:-1] }} + {%- if not tool_call.id is defined or tool_call.id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! 
(1)" + tool_call.id) }} + {%- endif %} + {{- ', "id": "' + tool_call.id[-9:] + '"}' }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" + eos_token }} + {%- endif %} + {%- endfor %} + {%- elif message["role"] == "assistant" %} + {{- " " + message["content"] + eos_token }} + {%- elif message["role"] == "tool_results" or message["role"] == "tool" %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }} + {%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (2)" + message.tool_call_id) }} + {%- endif %} + {{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} diff --git a/requirements-common.txt b/requirements-common.txt index 4c5b681a0d5ab..447fd32311c09 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -20,6 +20,7 @@ lm-format-enforcer == 0.10.6 outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +partial-json-parser # used for parsing partial JSON outputs pyzmq msgspec gguf == 0.9.1 diff --git a/tests/tool_use/__init__.py b/tests/tool_use/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py new file mode 100644 index 0000000000000..ab6a29eba1b3f --- /dev/null +++ b/tests/tool_use/conftest.py @@ -0,0 +1,32 @@ +import pytest +import pytest_asyncio +from huggingface_hub import snapshot_download + +from tests.utils import RemoteOpenAIServer + +from .utils import ARGS, CONFIGS, ServerConfig + + +# for each server config, download the model and return the config +@pytest.fixture(scope="session", params=CONFIGS.keys()) +def server_config(request): + config = CONFIGS[request.param] + # download model and tokenizer using transformers + snapshot_download(config["model"]) + yield CONFIGS[request.param] + + +# run this for each server config +@pytest.fixture(scope="session") +def server(request, server_config: ServerConfig): + model = server_config["model"] + args_for_model = server_config["arguments"] + with RemoteOpenAIServer(model, ARGS + args_for_model, + max_wait_seconds=480) as server: + yield server + + +@pytest_asyncio.fixture +async def client(server: RemoteOpenAIServer): + async with server.get_async_client() as async_client: + yield async_client diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py new file mode 100644 index 0000000000000..038ff81d2b674 --- /dev/null +++ b/tests/tool_use/test_chat_completions.py @@ -0,0 +1,143 @@ +from typing import List + +import openai +import pytest + +from .utils import MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL + + +# test: make sure chat completions without tools provided work even when tools +# are enabled. This makes sure tool call chat templates work, AND that the tool +# parser stream processing doesn't change the output of the model. 
+@pytest.mark.asyncio +async def test_chat_completion_without_tools(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_WITHOUT_TOOLS, + temperature=0, + max_tokens=150, + model=model_name, + logprobs=False) + choice = chat_completion.choices[0] + stop_reason = chat_completion.choices[0].finish_reason + output_text = chat_completion.choices[0].message.content + + # check to make sure we got text + assert output_text is not None + assert len(output_text) > 0 + assert stop_reason != "tool_calls" + + # check to make sure no tool calls were returned + assert (choice.message.tool_calls is None + or len(choice.message.tool_calls) == 0) + + # make the same request, streaming + stream = await client.chat.completions.create( + messages=MESSAGES_WITHOUT_TOOLS, + temperature=0, + max_tokens=150, + model=model_name, + logprobs=False, + stream=True, + ) + chunks: List[str] = [] + finish_reason_count = 0 + role_sent: bool = False + + # assemble streamed chunks + async for chunk in stream: + delta = chunk.choices[0].delta + + # make sure the role is assistant + if delta.role: + assert not role_sent + assert delta.role == 'assistant' + role_sent = True + + if delta.content: + chunks.append(delta.content) + + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == choice.finish_reason + + # make sure tool call chunks aren't being streamed + assert not delta.tool_calls or len(delta.tool_calls) == 0 + + # make sure the role was sent, only 1 finish reason was sent, that chunks + # were in fact sent, and that the chunks match non-streaming + assert role_sent + assert finish_reason_count == 1 + assert len(chunks) + assert "".join(chunks) == output_text + + +# test: conversation with tools enabled and provided that should not invoke +# tools, to make sure we can still get normal chat completion responses +# and that they won't be parsed as tools +@pytest.mark.asyncio +async def test_chat_completion_with_tools(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_WITHOUT_TOOLS, + temperature=0, + max_tokens=150, + model=model_name, + tools=[WEATHER_TOOL], + logprobs=False) + choice = chat_completion.choices[0] + stop_reason = chat_completion.choices[0].finish_reason + output_text = chat_completion.choices[0].message.content + + # check to make sure we got text + assert output_text is not None + assert stop_reason != 'tool_calls' + assert len(output_text) > 0 + + # check to make sure no tool calls were returned + assert (choice.message.tool_calls is None + or len(choice.message.tool_calls) == 0) + + # make the same request, streaming + stream = await client.chat.completions.create( + messages=MESSAGES_WITHOUT_TOOLS, + temperature=0, + max_tokens=150, + model=model_name, + logprobs=False, + tools=[WEATHER_TOOL], + stream=True, + ) + + chunks: List[str] = [] + finish_reason_count = 0 + role_sent: bool = False + + # assemble streamed chunks + async for chunk in stream: + delta = chunk.choices[0].delta + + # make sure the role is assistant + if delta.role: + assert delta.role == 'assistant' + role_sent = True + + if delta.content: + chunks.append(delta.content) + + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + + # make sure tool call chunks aren't being streamed + assert 
not delta.tool_calls or len(delta.tool_calls) == 0 + + # make sure the role was sent, only 1 finish reason was sent, that chunks + # were in fact sent, and that the chunks match non-streaming + assert role_sent + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert chunk.choices[0].finish_reason != 'tool_calls' + assert len(chunks) + assert "".join(chunks) == output_text diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py new file mode 100644 index 0000000000000..b03b5a2075a6c --- /dev/null +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -0,0 +1,193 @@ +import json +from typing import Dict, List, Optional + +import openai +import pytest + +from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL, + WEATHER_TOOL) + + +# test: getting the model to generate parallel tool calls (streaming/not) +# when requested. NOTE that not all models may support this, so some exclusions +# may be added in the future. e.g. llama 3.1 models are not designed to support +# parallel tool calls. +@pytest.mark.asyncio +async def test_parallel_tool_calls(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + temperature=0, + max_tokens=200, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False) + + choice = chat_completion.choices[0] + stop_reason = chat_completion.choices[0].finish_reason + non_streamed_tool_calls = chat_completion.choices[0].message.tool_calls + + # make sure 2 tool calls are present + assert choice.message.role == "assistant" + assert non_streamed_tool_calls is not None + assert len(non_streamed_tool_calls) == 2 + + for tool_call in non_streamed_tool_calls: + # make sure the tool includes a function and ID + assert tool_call.type == "function" + assert tool_call.function is not None + assert isinstance(tool_call.id, str) + assert len(tool_call.id) > 16 + + # make sure the weather tool was called correctly + assert tool_call.function.name == WEATHER_TOOL["function"]["name"] + assert isinstance(tool_call.function.arguments, str) + + parsed_arguments = json.loads(tool_call.function.arguments) + assert isinstance(parsed_arguments, Dict) + assert isinstance(parsed_arguments.get("city"), str) + assert isinstance(parsed_arguments.get("state"), str) + + assert stop_reason == "tool_calls" + + # make the same request, streaming + stream = await client.chat.completions.create( + model=model_name, + messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + temperature=0, + max_tokens=200, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False, + stream=True) + + role_name: Optional[str] = None + finish_reason_count: int = 0 + + tool_call_names: List[str] = [] + tool_call_args: List[str] = [] + tool_call_idx: int = -1 + tool_call_id_count: int = 0 + + async for chunk in stream: + + # if there's a finish reason make sure it's tools + if chunk.choices[0].finish_reason: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == 'tool_calls' + + # if a role is being streamed make sure it wasn't already set to + # something else + if chunk.choices[0].delta.role: + assert not role_name or role_name == 'assistant' + role_name = 'assistant' + + # if a tool call is streamed make sure there's exactly one + # (based on the request parameters + streamed_tool_calls = chunk.choices[0].delta.tool_calls + + 
if streamed_tool_calls and len(streamed_tool_calls) > 0: + + # make sure only one diff is present - correct even for parallel + assert len(streamed_tool_calls) == 1 + tool_call = streamed_tool_calls[0] + + # if a new tool is being called, set up empty arguments + if tool_call.index != tool_call_idx: + tool_call_idx = tool_call.index + tool_call_args.append("") + + # if a tool call ID is streamed, make sure one hasn't been already + if tool_call.id: + tool_call_id_count += 1 + assert (isinstance(tool_call.id, str) + and (len(tool_call.id) > 16)) + + # if parts of the function start being streamed + if tool_call.function: + # if the function name is defined, set it. it should be streamed + # IN ENTIRETY, exactly one time. + if tool_call.function.name: + assert isinstance(tool_call.function.name, str) + tool_call_names.append(tool_call.function.name) + + if tool_call.function.arguments: + # make sure they're a string and then add them to the list + assert isinstance(tool_call.function.arguments, str) + + tool_call_args[ + tool_call.index] += tool_call.function.arguments + + assert finish_reason_count == 1 + assert role_name == 'assistant' + + assert (len(non_streamed_tool_calls) == len(tool_call_names) == + len(tool_call_args)) + + for i in range(2): + assert non_streamed_tool_calls[i].function.name == tool_call_names[i] + streamed_args = json.loads(tool_call_args[i]) + non_streamed_args = json.loads( + non_streamed_tool_calls[i].function.arguments) + assert streamed_args == non_streamed_args + + +# test: providing parallel tool calls back to the model to get a response +# (streaming/not) +@pytest.mark.asyncio +async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, + temperature=0, + max_tokens=200, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False) + + choice = chat_completion.choices[0] + + assert choice.finish_reason != "tool_calls" # "stop" or "length" + assert choice.message.role == "assistant" + assert choice.message.tool_calls is None \ + or len(choice.message.tool_calls) == 0 + assert choice.message.content is not None + assert "98" in choice.message.content # Dallas temp in tool response + assert "78" in choice.message.content # Orlando temp in tool response + + stream = await client.chat.completions.create( + messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, + temperature=0, + max_tokens=200, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False, + stream=True) + + chunks: List[str] = [] + finish_reason_count = 0 + role_sent: bool = False + + async for chunk in stream: + delta = chunk.choices[0].delta + + if delta.role: + assert not role_sent + assert delta.role == "assistant" + role_sent = True + + if delta.content: + chunks.append(delta.content) + + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == choice.finish_reason + + assert not delta.tool_calls or len(delta.tool_calls) == 0 + + assert role_sent + assert finish_reason_count == 1 + assert len(chunks) + assert "".join(chunks) == choice.message.content diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py new file mode 100644 index 0000000000000..c3abe9e1f5060 --- /dev/null +++ b/tests/tool_use/test_tool_calls.py @@ -0,0 +1,192 @@ +import json +from typing import Dict, List, Optional + +import 
openai +import pytest + +from .utils import (MESSAGES_ASKING_FOR_TOOLS, MESSAGES_WITH_TOOL_RESPONSE, + SEARCH_TOOL, WEATHER_TOOL) + + +# test: request a chat completion that should return tool calls, so we know they +# are parsable +@pytest.mark.asyncio +async def test_tool_call_and_choice(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_ASKING_FOR_TOOLS, + temperature=0, + max_tokens=100, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False) + + choice = chat_completion.choices[0] + stop_reason = chat_completion.choices[0].finish_reason + tool_calls = chat_completion.choices[0].message.tool_calls + + # make sure a tool call is present + assert choice.message.role == 'assistant' + assert tool_calls is not None + assert len(tool_calls) == 1 + assert tool_calls[0].type == 'function' + assert tool_calls[0].function is not None + assert isinstance(tool_calls[0].id, str) + assert len(tool_calls[0].id) > 16 + + # make sure the weather tool was called (classic example) with arguments + assert tool_calls[0].function.name == WEATHER_TOOL["function"]["name"] + assert tool_calls[0].function.arguments is not None + assert isinstance(tool_calls[0].function.arguments, str) + + # make sure the arguments parse properly + parsed_arguments = json.loads(tool_calls[0].function.arguments) + assert isinstance(parsed_arguments, Dict) + assert isinstance(parsed_arguments.get("city"), str) + assert isinstance(parsed_arguments.get("state"), str) + assert parsed_arguments.get("city") == "Dallas" + assert parsed_arguments.get("state") == "TX" + + assert stop_reason == "tool_calls" + + function_name: Optional[str] = None + function_args_str: str = '' + tool_call_id: Optional[str] = None + role_name: Optional[str] = None + finish_reason_count: int = 0 + + # make the same request, streaming + stream = await client.chat.completions.create( + model=model_name, + messages=MESSAGES_ASKING_FOR_TOOLS, + temperature=0, + max_tokens=100, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False, + stream=True) + + async for chunk in stream: + assert chunk.choices[0].index == 0 + + if chunk.choices[0].finish_reason: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == 'tool_calls' + + # if a role is being streamed make sure it wasn't already set to + # something else + if chunk.choices[0].delta.role: + assert not role_name or role_name == 'assistant' + role_name = 'assistant' + + # if a tool call is streamed make sure there's exactly one + # (based on the request parameters + streamed_tool_calls = chunk.choices[0].delta.tool_calls + + if streamed_tool_calls and len(streamed_tool_calls) > 0: + assert len(streamed_tool_calls) == 1 + tool_call = streamed_tool_calls[0] + + # if a tool call ID is streamed, make sure one hasn't been already + if tool_call.id: + assert not tool_call_id + tool_call_id = tool_call.id + + # if parts of the function start being streamed + if tool_call.function: + # if the function name is defined, set it. it should be streamed + # IN ENTIRETY, exactly one time. 
+ if tool_call.function.name: + assert function_name is None + assert isinstance(tool_call.function.name, str) + function_name = tool_call.function.name + if tool_call.function.arguments: + assert isinstance(tool_call.function.arguments, str) + function_args_str += tool_call.function.arguments + + assert finish_reason_count == 1 + assert role_name == 'assistant' + assert isinstance(tool_call_id, str) and (len(tool_call_id) > 16) + + # validate the name and arguments + assert function_name == WEATHER_TOOL["function"]["name"] + assert function_name == tool_calls[0].function.name + assert isinstance(function_args_str, str) + + # validate arguments + streamed_args = json.loads(function_args_str) + assert isinstance(streamed_args, Dict) + assert isinstance(streamed_args.get("city"), str) + assert isinstance(streamed_args.get("state"), str) + assert streamed_args.get("city") == "Dallas" + assert streamed_args.get("state") == "TX" + + # make sure everything matches non-streaming except for ID + assert function_name == tool_calls[0].function.name + assert choice.message.role == role_name + assert choice.message.tool_calls[0].function.name == function_name + + # compare streamed with non-streamed args Dict-wise, not string-wise + # because character-to-character comparison might not work e.g. the tool + # call parser adding extra spaces or something like that. we care about the + # dicts matching not byte-wise match + assert parsed_arguments == streamed_args + + +# test: providing tools and results back to model to get a non-tool response +# (streaming/not) +@pytest.mark.asyncio +async def test_tool_call_with_results(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_WITH_TOOL_RESPONSE, + temperature=0, + max_tokens=100, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False) + + choice = chat_completion.choices[0] + + assert choice.finish_reason != "tool_calls" # "stop" or "length" + assert choice.message.role == "assistant" + assert choice.message.tool_calls is None \ + or len(choice.message.tool_calls) == 0 + assert choice.message.content is not None + assert "98" in choice.message.content # the temperature from the response + + stream = await client.chat.completions.create( + messages=MESSAGES_WITH_TOOL_RESPONSE, + temperature=0, + max_tokens=100, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False, + stream=True) + + chunks: List[str] = [] + finish_reason_count = 0 + role_sent: bool = False + + async for chunk in stream: + delta = chunk.choices[0].delta + + if delta.role: + assert not role_sent + assert delta.role == "assistant" + role_sent = True + + if delta.content: + chunks.append(delta.content) + + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == choice.finish_reason + + assert not delta.tool_calls or len(delta.tool_calls) == 0 + + assert role_sent + assert finish_reason_count == 1 + assert len(chunks) + assert "".join(chunks) == choice.message.content diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py new file mode 100644 index 0000000000000..8ec9b05b2c521 --- /dev/null +++ b/tests/tool_use/utils.py @@ -0,0 +1,215 @@ +from typing import Dict, List + +from openai.types.chat import (ChatCompletionMessageParam, + ChatCompletionToolParam) +from typing_extensions import TypedDict + +from tests.utils import VLLM_PATH + + +class 
ServerConfig(TypedDict):
+    model: str
+    arguments: List[str]
+
+
+# universal args for all models go here. also good if you need to test locally
+# and change type or KV cache quantization or something.
+ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"]
+
+CONFIGS: Dict[str, ServerConfig] = {
+    "hermes": {
+        "model":
+        "NousResearch/Hermes-2-Pro-Llama-3-8B",
+        "arguments": [
+            "--tool-call-parser", "hermes", "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
+        ]
+    },
+    "mistral": {
+        "model":
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        "arguments": [
+            "--tool-call-parser", "mistral", "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
+            "--ignore-patterns=\"consolidated.safetensors\""
+        ]
+    }
+}
+
+WEATHER_TOOL: ChatCompletionToolParam = {
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type":
+                    "string",
+                    "description":
+                    "The city to find the weather for, "
+                    "e.g. 'San Francisco'"
+                },
+                "state": {
+                    "type":
+                    "string",
+                    "description":
+                    "the two-letter abbreviation for the state "
+                    "that the city is in, e.g. 'CA' which would "
+                    "mean 'California'"
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"]
+                }
+            }
+        }
+    }
+}
+
+SEARCH_TOOL: ChatCompletionToolParam = {
+    "type": "function",
+    "function": {
+        "name":
+        "web_search",
+        "description":
+        "Search the internet and get a summary of the top "
+        "10 webpages. Should only be used if you don't know "
+        "the answer to a user query, and the results are likely "
+        "to be able to be found with a web search",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "search_term": {
+                    "type":
+                    "string",
+                    "description":
+                    "The term to use in the search. This should "
+                    "ideally be keywords to search for, not a "
+                    "natural-language question"
+                }
+            },
+            "required": ["search_term"]
+        }
+    }
+}
+
+MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{
+    "role":
+    "system",
+    "content":
+    "You are a helpful assistant with access to tools. If a tool"
+    " that you have would be helpful to answer a user query, "
+    "call the tool. Otherwise, answer the user's query directly "
+    "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
+    "to the user's question - just respond to it normally."
+}, {
+    "role":
+    "user",
+    "content":
+    "Hi! How are you?"
+}, {
+    "role":
+    "assistant",
+    "content":
+    "I'm doing great! How can I assist you?"
+}, {
+    "role":
+    "user",
+    "content":
+    "Can you tell me a joke please?"
+}]
+
+MESSAGES_ASKING_FOR_TOOLS: List[ChatCompletionMessageParam] = [{
+    "role":
+    "user",
+    "content":
+    "What is the weather in Dallas, Texas in Fahrenheit?"
+}]
+
+MESSAGES_WITH_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{
+    "role":
+    "user",
+    "content":
+    "What is the weather in Dallas, Texas in Fahrenheit?"
+}, {
+    "role":
+    "assistant",
+    "tool_calls": [{
+        "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
+        "type": "function",
+        "function": {
+            "name":
+            WEATHER_TOOL["function"]["name"],
+            "arguments":
+            '{"city": "Dallas", "state": "TX", '
+            '"unit": "fahrenheit"}'
+        }
+    }]
+}, {
+    "role":
+    "tool",
+    "tool_call_id":
+    "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
+    "content":
+    "The weather in Dallas is 98 degrees fahrenheit, with partly "
+    "cloudy skies and a low chance of rain."
+}] + +MESSAGES_ASKING_FOR_PARALLEL_TOOLS: List[ChatCompletionMessageParam] = [{ + "role": + "user", + "content": + "What is the weather in Dallas, Texas and Orlando, Florida in " + "Fahrenheit?" +}] + +MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{ + "role": + "user", + "content": + "What is the weather in Dallas, Texas and Orlando, Florida in " + "Fahrenheit?" +}, { + "role": + "assistant", + "tool_calls": [{ + "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295", + "type": "function", + "function": { + "name": + WEATHER_TOOL["function"]["name"], + "arguments": + '{"city": "Dallas", "state": "TX", ' + '"unit": "fahrenheit"}' + } + }, { + "id": "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b", + "type": "function", + "function": { + "name": + WEATHER_TOOL["function"]["name"], + "arguments": + '{"city": "Orlando", "state": "Fl", ' + '"unit": "fahrenheit"}' + } + }] +}, { + "role": + "tool", + "tool_call_id": + "chatcmpl-tool-03e6481b146e408e9523d9c956696295", + "content": + "The weather in Dallas TX is 98 degrees fahrenheit with mostly " + "cloudy skies and a chance of rain in the evening." +}, { + "role": + "tool", + "tool_call_id": + "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b", + "content": + "The weather in Orlando FL is 78 degrees fahrenheit with clear" + "skies." +}] diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f205a99920892..9a7493649c795 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,23 +1,28 @@ import asyncio import codecs +import json from abc import ABC, abstractmethod from collections import defaultdict -from functools import lru_cache +from functools import lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Dict, Generic, Iterable, List, Literal, - Mapping, Optional, Tuple, TypeVar, Union) + Mapping, Optional, Tuple, TypeVar, Union, cast) # yapf conflicts with isort for this block # yapf: disable -from openai.types.chat import ChatCompletionContentPartImageParam +from openai.types.chat import (ChatCompletionAssistantMessageParam, + ChatCompletionContentPartImageParam) from openai.types.chat import ( ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam) -from openai.types.chat import ChatCompletionContentPartTextParam +from openai.types.chat import (ChatCompletionContentPartRefusalParam, + ChatCompletionContentPartTextParam) from openai.types.chat import ( ChatCompletionMessageParam as OpenAIChatCompletionMessageParam) +from openai.types.chat import (ChatCompletionMessageToolCallParam, + ChatCompletionToolMessageParam) # yapf: enable # pydantic needs the TypedDict from typing_extensions -from pydantic import ConfigDict, TypeAdapter +from pydantic import ConfigDict from typing_extensions import Required, TypeAlias, TypedDict from vllm.config import ModelConfig @@ -54,7 +59,8 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, - CustomChatCompletionContentPartParam, ] + ChatCompletionContentPartRefusalParam, + CustomChatCompletionContentPartParam] class CustomChatCompletionMessageParam(TypedDict, total=False): @@ -72,15 +78,33 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): same role. 
""" + tool_call_id: Optional[str] + """Tool call that this message is responding to.""" + + tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]] + """The tool calls generated by the model, such as function calls.""" + ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam, CustomChatCompletionMessageParam] # TODO: Make fields ReadOnly once mypy supports it -class ConversationMessage(TypedDict): - role: str - content: str +class ConversationMessage(TypedDict, total=False): + role: Required[str] + """The role of the message's author.""" + + content: Optional[str] + """The contents of the message""" + + tool_call_id: Optional[str] + """Tool call that this message is responding to.""" + + name: Optional[str] + """The name of the function to call""" + + tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]] + """The tool calls generated by the model, such as function calls.""" ModalityStr = Literal["image", "audio"] @@ -319,9 +343,11 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], return "\n".join(missing_placeholders + [text_prompt]) -_TextParser = TypeAdapter(ChatCompletionContentPartTextParam) -_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam) -_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam) +# No need to validate using Pydantic again +_TextParser = partial(cast, ChatCompletionContentPartTextParam) +_ImageParser = partial(cast, ChatCompletionContentPartImageParam) +_AudioParser = partial(cast, ChatCompletionContentPartAudioParam) +_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) def _parse_chat_message_content_parts( @@ -336,10 +362,10 @@ def _parse_chat_message_content_parts( for part in parts: part_type = part["type"] if part_type == "text": - text = _TextParser.validate_python(part)["text"] + text = _TextParser(part)["text"] texts.append(text) elif part_type == "image_url": - image_url = _ImageParser.validate_python(part)["image_url"] + image_url = _ImageParser(part)["image_url"] if image_url.get("detail", "auto") != "auto": logger.warning( @@ -348,7 +374,7 @@ def _parse_chat_message_content_parts( mm_parser.parse_image(image_url["url"]) elif part_type == "audio_url": - audio_url = _AudioParser.validate_python(part)["audio_url"] + audio_url = _AudioParser(part)["audio_url"] mm_parser.parse_audio(audio_url["url"]) else: @@ -363,6 +389,11 @@ def _parse_chat_message_content_parts( return [ConversationMessage(role=role, content=text_prompt)] +# No need to validate using Pydantic again +_AssistantParser = partial(cast, ChatCompletionAssistantMessageParam) +_ToolParser = partial(cast, ChatCompletionToolMessageParam) + + def _parse_chat_message_content( message: ChatCompletionMessageParam, mm_tracker: BaseMultiModalItemTracker, @@ -371,16 +402,34 @@ def _parse_chat_message_content( content = message.get("content") if content is None: - return [] - if isinstance(content, str): - return [ConversationMessage(role=role, content=content)] + content = [] + elif isinstance(content, str): + content = [ + ChatCompletionContentPartTextParam(type="text", text=content) + ] - return _parse_chat_message_content_parts( + result = _parse_chat_message_content_parts( role, content, # type: ignore mm_tracker, ) + for result_msg in result: + if role == 'assistant': + parsed_msg = _AssistantParser(message) + + if "tool_calls" in parsed_msg: + result_msg["tool_calls"] = list(parsed_msg["tool_calls"]) + elif role == "tool": + parsed_msg = _ToolParser(message) + if "tool_call_id" in parsed_msg: + 
result_msg["tool_call_id"] = parsed_msg["tool_call_id"] + + if "name" in message and isinstance(message["name"], str): + result_msg["name"] = message["name"] + + return result + def parse_chat_messages( messages: List[ChatCompletionMessageParam], @@ -428,6 +477,20 @@ def apply_chat_template( "allowed, so you must provide a chat template if the tokenizer " "does not define one.") + # per the Transformers docs & maintainers, tool call arguments in + # assistant-role messages with tool_calls need to be dicts not JSON str - + # this is how tool-use chat templates will expect them moving forwards + # so, for messages that have tool_calls, parse the string (which we get + # from openAI format) to dict + for message in conversation: + if (message["role"] == "assistant" and "tool_calls" in message + and isinstance(message["tool_calls"], list)): + + for i in range(len(message["tool_calls"])): + args: str = message["tool_calls"][i]["function"]["arguments"] + parsed_args: Dict = json.loads(args) + message["tool_calls"][i]["function"]["arguments"] = parsed_args + prompt = tokenizer.apply_chat_template( conversation=conversation, chat_template=chat_template, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 7632e8aa5e32e..728a2e5232d9b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -233,7 +233,7 @@ def mount_metrics(app: FastAPI): metrics_route = Mount("/metrics", make_asgi_app()) # Workaround for 307 Redirect for /metrics - metrics_route.path_regex = re.compile('^/metrics(?P.*)$') + metrics_route.path_regex = re.compile("^/metrics(?P.*)$") app.routes.append(metrics_route) @@ -283,11 +283,14 @@ async def show_version(): @router.post("/v1/chat/completions") async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + generator = await openai_serving_chat.create_chat_completion( request, raw_request) + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) + elif isinstance(generator, ChatCompletionResponse): return JSONResponse(content=generator.model_dump()) @@ -422,7 +425,8 @@ async def init_app( request_logger=request_logger, chat_template=args.chat_template, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) + enable_auto_tools=args.enable_auto_tool_choice, + tool_parser=args.tool_call_parser) openai_serving_completion = OpenAIServingCompletion( async_engine_client, model_config, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 94742838b421c..7ccee0b6b55b7 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -163,6 +163,24 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="If specified, will run the OpenAI frontend server in the same " "process as the model serving engine.") + parser.add_argument( + "--enable-auto-tool-choice", + action="store_true", + default=False, + help= + "Enable auto tool choice for supported models. Use --tool-call-parser" + "to specify which parser to use") + + parser.add_argument( + "--tool-call-parser", + type=str, + choices=["mistral", "hermes"], + default=None, + help= + "Select the tool call parser depending on the model that you're using." + " This is used to parse the model-generated tool call into OpenAI API " + "format. 
Required for --enable-auto-tool-choice.") + parser = AsyncEngineArgs.add_cli_args(parser) parser.add_argument('--max-log-len', diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0954b81595ef5..ff9c3690672b6 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -5,8 +5,9 @@ from typing import Any, Dict, List, Literal, Optional, Union import torch +from openai.types.chat import ChatCompletionContentPartParam from pydantic import BaseModel, ConfigDict, Field, model_validator -from typing_extensions import Annotated +from typing_extensions import Annotated, Required, TypedDict from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.entrypoints.openai.logits_processors import get_logits_processors @@ -35,6 +36,26 @@ assert _LONG_INFO.max == _MOCK_LONG_INFO.max +class CustomChatCompletionMessageParam(TypedDict, total=False): + """Enables custom roles in the Chat Completion API.""" + role: Required[str] + """The role of the message's author.""" + + content: Union[str, List[ChatCompletionContentPartParam]] + """The contents of the message.""" + + name: str + """An optional name for the participant. + + Provides the model information to differentiate between participants of the + same role. + """ + + tool_call_id: Optional[str] + + tool_calls: Optional[List[dict]] + + class OpenAIBaseModel(BaseModel): # OpenAI API does not allow extra fields model_config = ConfigDict(extra="forbid") @@ -145,8 +166,11 @@ class ChatCompletionRequest(OpenAIBaseModel): temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 tools: Optional[List[ChatCompletionToolsParam]] = None - tool_choice: Optional[Union[Literal["none"], + tool_choice: Optional[Union[Literal["none"], Literal["auto"], ChatCompletionNamedToolChoiceParam]] = "none" + + # NOTE this will be ignored by VLLM -- the model determines the behavior + parallel_tool_calls: Optional[bool] = False user: Optional[str] = None # doc: begin-chat-completion-sampling-params @@ -328,6 +352,9 @@ def check_logprobs(cls, data): @model_validator(mode="before") @classmethod def check_guided_decoding_count(cls, data): + if isinstance(data, ValueError): + raise data + guide_count = sum([ "guided_json" in data and data["guided_json"] is not None, "guided_regex" in data and data["guided_regex"] is not None, @@ -339,21 +366,61 @@ def check_guided_decoding_count(cls, data): "You can only use one kind of guided decoding " "('guided_json', 'guided_regex' or 'guided_choice').") # you can only either use guided decoding or tools, not both - if guide_count > 1 and "tool_choice" in data and data[ - "tool_choice"] != "none": + if guide_count > 1 and data.get("tool_choice", + "none") not in ("none", "auto"): raise ValueError( "You can only either use guided decoding or tools, not both.") return data @model_validator(mode="before") @classmethod - def check_tool_choice(cls, data): - if "tool_choice" in data and data["tool_choice"] != "none": - if not isinstance(data["tool_choice"], dict): - raise ValueError("Currently only named tools are supported.") + def check_tool_usage(cls, data): + + # if "tool_choice" is not specified but tools are provided, + # default to "auto" tool_choice + if "tool_choice" not in data and "tools" in data: + data["tool_choice"] = "auto" + + # if "tool_choice" is specified -- validation + if "tool_choice" in data: + + # ensure that if "tool choice" is specified, tools are present if "tools" not in data or data["tools"] is None: raise ValueError( "When using 
`tool_choice`, `tools` must be set.") + + # make sure that tool choice is either a named tool + # OR that it's set to "auto" + if data["tool_choice"] != "auto" and not isinstance( + data["tool_choice"], dict): + raise ValueError( + "`tool_choice` must either be a named tool or \"auto\". " + "`tool_choice=\"none\" is not supported.") + + # ensure that if "tool_choice" is specified as an object, + # it matches a valid tool + if isinstance(data["tool_choice"], dict): + valid_tool = False + specified_function = data["tool_choice"]["function"] + if not specified_function: + raise ValueError( + "Incorrectly formatted `tool_choice`. Should be like " + "`{\"type\": \"function\"," + " \"function\": {\"name\": \"my_function\"}}`") + specified_function_name = specified_function["name"] + if not specified_function_name: + raise ValueError( + "Incorrectly formatted `tool_choice`. Should be like " + "`{\"type\": \"function\", " + "\"function\": {\"name\": \"my_function\"}}`") + for tool in data["tools"]: + if tool["function"]["name"] == specified_function_name: + valid_tool = True + break + if not valid_tool: + raise ValueError( + "The tool specified in `tool_choice` does not match any" + " of the specified `tools`") return data @@ -413,7 +480,7 @@ class CompletionRequest(OpenAIBaseModel): ) guided_json: Optional[Union[str, dict, BaseModel]] = Field( default=None, - description=("If specified, the output will follow the JSON schema."), + description="If specified, the output will follow the JSON schema.", ) guided_regex: Optional[str] = Field( default=None, @@ -633,9 +700,41 @@ class ToolCall(OpenAIBaseModel): function: FunctionCall +class DeltaFunctionCall(BaseModel): + name: Optional[str] = None + arguments: Optional[str] = None + + +# a tool call delta where everything is optional +class DeltaToolCall(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}") + type: Literal["function"] = "function" + index: int + function: Optional[DeltaFunctionCall] = None + + +# the initial delta that gets sent once a new tool call is started; +class InitialDeltaToolCall(DeltaToolCall): + id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}") + type: Literal["function"] = "function" + index: int + + +class ExtractedToolCallInformation(BaseModel): + # indicate if tools were called + tools_called: bool + + # extracted tool calls + tool_calls: List[ToolCall] + + # content - per OpenAI spec, content AND tool calls can be returned rarely + # But some models will do this intentionally + content: Optional[str] = None + + class ChatMessage(OpenAIBaseModel): role: str - content: str + content: Optional[str] = None tool_calls: List[ToolCall] = Field(default_factory=list) @@ -657,7 +756,9 @@ class ChatCompletionResponseChoice(OpenAIBaseModel): index: int message: ChatMessage logprobs: Optional[ChatCompletionLogProbs] = None - finish_reason: Optional[str] = None + # per OpenAI spec this is the default + finish_reason: Optional[str] = "stop" + # not part of the OpenAI spec but included in vLLM for legacy reasons stop_reason: Optional[Union[int, str]] = None @@ -674,7 +775,7 @@ class ChatCompletionResponse(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel): role: Optional[str] = None content: Optional[str] = None - tool_calls: List[ToolCall] = Field(default_factory=list) + tool_calls: List[DeltaToolCall] = Field(default_factory=list) class ChatCompletionResponseStreamChoice(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_chat.py 
b/vllm/entrypoints/openai/serving_chat.py index a3bc0bb7b3554..78f355228012f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,6 +1,8 @@ import asyncio +import json import time -from typing import AsyncGenerator, AsyncIterator, Dict, Final, List, Optional +from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, Final, List, + Optional) from typing import Sequence as GenericSequence from typing import Union @@ -18,15 +20,18 @@ ChatCompletionLogProbsContent, ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, - ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse, - FunctionCall, ToolCall, UsageInfo) + ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, + DeltaToolCall, ErrorResponse, FunctionCall, ToolCall, UsageInfo) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing, PromptAdapterPath, TextTokensPrompt) +from vllm.entrypoints.openai.tool_parsers import (Hermes2ProToolParser, + MistralToolParser, + ToolParser) from vllm.inputs import TokensPrompt from vllm.logger import init_logger -from vllm.outputs import RequestOutput +from vllm.outputs import CompletionOutput, RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) @@ -38,19 +43,19 @@ class OpenAIServingChat(OpenAIServing): - def __init__( - self, - async_engine_client: AsyncEngineClient, - model_config: ModelConfig, - served_model_names: List[str], - response_role: str, - *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], - request_logger: Optional[RequestLogger], - chat_template: Optional[str], - return_tokens_as_token_ids: bool = False, - ): + def __init__(self, + async_engine_client: AsyncEngineClient, + model_config: ModelConfig, + served_model_names: List[str], + response_role: str, + *, + lora_modules: Optional[List[LoRAModulePath]], + prompt_adapters: Optional[List[PromptAdapterPath]], + request_logger: Optional[RequestLogger], + chat_template: Optional[str], + return_tokens_as_token_ids: bool = False, + enable_auto_tools: bool = False, + tool_parser: Optional[str] = None): super().__init__(async_engine_client=async_engine_client, model_config=model_config, served_model_names=served_model_names, @@ -60,10 +65,27 @@ def __init__( return_tokens_as_token_ids=return_tokens_as_token_ids) self.response_role = response_role - - # If this is None we use the tokenizer's default chat template + self.use_tool_use_model_template = False self.chat_template = load_chat_template(chat_template) + # set up tool use + self.enable_auto_tools: bool = enable_auto_tools + if self.enable_auto_tools: + logger.info( + "\"auto\" tool choice has been enabled please note that while" + " the parallel_tool_calls client option is preset for " + "compatibility reasons, it will be ignored.") + + self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None + if self.enable_auto_tools: + if tool_parser == "mistral": + self.tool_parser = MistralToolParser + elif tool_parser == "hermes": + self.tool_parser = Hermes2ProToolParser + else: + raise TypeError("Error: --enable-auto-tool-choice requires " + "--tool-call-parser") + async def create_chat_completion( self, request: ChatCompletionRequest, @@ -76,11 +98,10 @@ async def create_chat_completion( for the API specification. 
This API mimics the OpenAI ChatCompletion API. - NOTE: Currently we do not support the following feature: - - function_call (Users should implement this by themselves) """ error_check_ret = await self._check_model(request) if error_check_ret is not None: + logger.error("Error with model %s", error_check_ret) return error_check_ret try: @@ -119,6 +140,20 @@ async def create_chat_completion( logger.error("Error in loading multi-modal data: %s", e) return self.create_error_response(str(e)) + # validation for OpenAI tools + # tool_choice = "required" is not supported + if request.tool_choice == "required": + return self.create_error_response( + "tool_choice = \"required\" is not supported!") + + # "auto" tools requires --enable-auto-tool-choice + # and --tool-call-parser + if request.tool_choice == "auto" and not ( + self.enable_auto_tools and self.tool_parser is not None): + return self.create_error_response( + "\"auto\" tool choice requires " + "--enable-auto-tool-choice and --tool-call-parser to be set") + request_id = f"chat-{random_uuid()}" try: guided_decode_logits_processor = ( @@ -187,6 +222,7 @@ async def create_chat_completion( if request.stream: return self.chat_completion_stream_generator( request, result_generator, request_id, conversation, tokenizer) + try: return await self.chat_completion_full_generator( request, result_generator, request_id, conversation, tokenizer) @@ -219,6 +255,9 @@ async def chat_completion_stream_generator( previous_num_tokens = [0] * num_choices finish_reason_sent = [False] * num_choices + tool_parser: Optional[ToolParser] = self.tool_parser( + tokenizer) if self.tool_parser else None + try: async for res in result_generator: # We need to do it here, because if there are exceptions in @@ -228,6 +267,9 @@ async def chat_completion_stream_generator( # Send first response for each request.n (index) with # the role role = self.get_chat_request_role(request) + + # NOTE num_choices defaults to 1 so this usually executes + # once per request for i in range(num_choices): choice_data = ChatCompletionResponseStreamChoice( index=i, @@ -240,14 +282,18 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + + # if usage should be included if (request.stream_options and request.stream_options.include_usage): - if (request.stream_options.continuous_usage_stats): + # if continuous usage stats are requested, add it + if request.stream_options.continuous_usage_stats: prompt_tokens = len(res.prompt_token_ids) usage = UsageInfo(prompt_tokens=prompt_tokens, completion_tokens=0, total_tokens=prompt_tokens) chunk.usage = usage + # otherwise don't else: chunk.usage = None @@ -257,7 +303,7 @@ async def chat_completion_stream_generator( # Send response to echo the input portion of the # last message if request.echo: - last_msg_content = "" + last_msg_content: Optional[str] = "" if conversation and conversation[-1].get( "content") and conversation[-1].get( "role") == role: @@ -298,6 +344,7 @@ async def chat_completion_stream_generator( first_iteration = False for output in res.outputs: + i = output.index if finish_reason_sent[i]: @@ -320,20 +367,50 @@ async def chat_completion_stream_generator( logprobs = None delta_text = output.text[len(previous_texts[i]):] - previous_texts[i] = output.text - previous_num_tokens[i] = len(output.token_ids) + delta_message: Optional[DeltaMessage] = None - if request.tool_choice and type( - request.tool_choice - ) is ChatCompletionNamedToolChoiceParam: + # handle streaming deltas for tools 
with named tool_choice + if (request.tool_choice and type(request.tool_choice) is + ChatCompletionNamedToolChoiceParam): delta_message = DeltaMessage(tool_calls=[ - ToolCall(function=FunctionCall( + DeltaToolCall(function=DeltaFunctionCall( name=request.tool_choice.function.name, - arguments=delta_text)) + arguments=delta_text), + index=i) ]) + + # handle streaming deltas for tools with "auto" tool choice + elif (self._should_stream_with_auto_tool_parsing(request) + and tool_parser): + delta_message = ( + tool_parser.extract_tool_calls_streaming( + previous_text=previous_texts[i], + current_text=output.text, + delta_text=delta_text, + previous_token_ids= \ + output.token_ids[ + :-1 * len(delta_token_ids) + ], + current_token_ids=output.token_ids, + delta_token_ids=delta_token_ids + ) + ) + + # handle streaming just a content delta else: delta_message = DeltaMessage(content=delta_text) + # set the previous values for the next iteration + previous_texts[i] = output.text + previous_num_tokens[i] = len(output.token_ids) + + # if the message delta is None (e.g. because it was a + # "control token" for tool calls or the parser otherwise + # wasn't ready to send a token, then + # get the next token without streaming a chunk + if delta_message is None: + continue + if output.finish_reason is None: # Send token-by-token response for each request.n @@ -348,6 +425,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + + # handle usage stats if requested & if continuous if (request.stream_options and request.stream_options.include_usage): if (request.stream_options.continuous_usage_stats): @@ -365,14 +444,55 @@ async def chat_completion_stream_generator( data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" + + # if the model is finished generating else: + # check to make sure we haven't "forgotten" to stream + # any tokens that were generated but previously + # matched by partial json parsing + # only happens if we are NOT using guided decoding + if tool_parser: + index = len( + tool_parser.prev_tool_call_arr) - 1 if len( + tool_parser.prev_tool_call_arr) > 0 else 0 + else: + index = 0 + + if self._should_check_for_unstreamed_tool_arg_tokens( + delta_message, output) and tool_parser: + # get the expected call based on partial JSON + # parsing which "autocompletes" the JSON + expected_call = json.dumps( + tool_parser.prev_tool_call_arr[index].get( + "arguments", {})) + + # get what we've streamed so for for arguments + # for the current tool + actual_call = tool_parser.streamed_args_for_tool[ + index] + + # check to see if there's anything left to stream + remaining_call = expected_call.replace( + actual_call, "", 1) + + # set that as a delta message + delta_message = DeltaMessage(tool_calls=[ + DeltaToolCall(index=index, + function=DeltaFunctionCall( + arguments=remaining_call). 
+ model_dump(exclude_none=True)) + ]) + # Send the finish response for each request.n only once prompt_tokens = len(res.prompt_token_ids) choice_data = ChatCompletionResponseStreamChoice( index=i, delta=delta_message, logprobs=logprobs, - finish_reason=output.finish_reason, + finish_reason=output.finish_reason + if not (tool_parser + and len(tool_parser.prev_tool_call_arr)) + else "tool_calls", stop_reason=output.stop_reason) chunk = ChatCompletionStreamResponse( id=request_id, @@ -398,6 +518,8 @@ async def chat_completion_stream_generator( yield f"data: {data}\n\n" finish_reason_sent[i] = True + # once the final token is handled, if stream_options.include_usage + # is sent, send the usage if (request.stream_options and request.stream_options.include_usage): final_usage = UsageInfo( @@ -419,6 +541,7 @@ async def chat_completion_stream_generator( except ValueError as e: # TODO: Use a vllm-specific Validation Error + logger.error("error in chat completion stream generator: %s", e) data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished @@ -463,8 +586,21 @@ async def chat_completion_full_generator( else: logprobs = None - if request.tool_choice and type( + # by default, tools are not used. + tools_called = False + + # if auto tools are not enabled, and a named tool choice using + # outlines is not being used + if not (self.enable_auto_tools + or not self.tool_parser) and not isinstance( + request.tool_choice, + ChatCompletionNamedToolChoiceParam): + message = ChatMessage(role=role, content=output.text) + + # if the request uses tools and specified a tool choice + elif request.tool_choice and type( request.tool_choice) is ChatCompletionNamedToolChoiceParam: + message = ChatMessage( role=role, content="", @@ -473,14 +609,47 @@ async def chat_completion_full_generator( name=request.tool_choice.function.name, arguments=output.text)) ]) + tools_called = True + + # if the request doesn't use tool choice + # OR specifies to not use a tool elif not request.tool_choice or request.tool_choice == "none": + + message = ChatMessage(role=role, content=output.text) + + # handle when there are tools and tool choice is auto + elif request.tools and ( + request.tool_choice == "auto" + or request.tool_choice is None) and self.enable_auto_tools \ + and self.tool_parser: + + tool_parser = self.tool_parser(tokenizer) + tool_call_info = tool_parser.extract_tool_calls(output.text) + tools_called = tool_call_info.tools_called + if tool_call_info.tools_called: + message = ChatMessage(role=role, + content=tool_call_info.content, + tool_calls=tool_call_info.tool_calls) + + else: + # FOR NOW make it a chat message; we will have to detect + # the type to make it later. + message = ChatMessage(role=role, content=output.text) + + # undetermined case that is still important to handle + else: + logger.error( + "Error in chat_completion_full_generator - cannot determine" + " if tools should be extracted. 
Returning a standard chat " + "completion.") message = ChatMessage(role=role, content=output.text) choice_data = ChatCompletionResponseChoice( index=output.index, message=message, logprobs=logprobs, - finish_reason=output.finish_reason, + finish_reason="tool_calls" if tools_called else + output.finish_reason if output.finish_reason else "stop", stop_reason=output.stop_reason) choices.append(choice_data) @@ -488,10 +657,11 @@ async def chat_completion_full_generator( last_msg_content = "" if conversation and conversation[-1].get( "content") and conversation[-1].get("role") == role: - last_msg_content = conversation[-1]["content"] + last_msg_content = conversation[-1]["content"] or "" for choice in choices: - full_message = last_msg_content + choice.message.content + full_message = last_msg_content + (choice.message.content + or "") choice.message.content = full_message num_prompt_tokens = len(final_res.prompt_token_ids) @@ -574,3 +744,38 @@ def _create_chat_logprobs( )) return ChatCompletionLogProbs(content=logprobs_content) + + def _should_stream_with_auto_tool_parsing(self, + request: ChatCompletionRequest): + """ + Utility function to check if streamed tokens should go through the tool + call parser that was configured. + + We only want to do this IF user-provided tools are set, a tool parser + is configured, "auto" tool choice is enabled, and the request's tool + choice field indicates that "auto" tool choice should be used. + """ + return (request.tools and self.tool_parser and self.enable_auto_tools + and request.tool_choice in ['auto', None]) + + def _should_check_for_unstreamed_tool_arg_tokens( + self, + delta_message: Optional[DeltaMessage], + output: CompletionOutput, + ) -> bool: + """ + Check to see if we should check for unstreamed tool arguments tokens. + This is only applicable when auto tool parsing is enabled, the delta + is a tool call with arguments. 
+ """ + + # yapf: disable + return bool( + # if there is a delta message that includes tool calls which + # include a function that has arguments + self.enable_auto_tools and self.tool_parser and delta_message + and delta_message.tool_calls and delta_message.tool_calls[0] + and delta_message.tool_calls[0].function + and delta_message.tool_calls[0].function.arguments is not None + and output.finish_reason is not None + ) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index c3c0d52072cd3..69a5ad5b62cfa 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -43,7 +43,11 @@ def __init__( request_logger=request_logger) # If this is None we use the tokenizer's default chat template - self.chat_template = load_chat_template(chat_template) + # the list of commonly-used chat template names for HF named templates + hf_chat_templates: List[str] = ['default', 'tool_use'] + self.chat_template = chat_template \ + if chat_template in hf_chat_templates \ + else load_chat_template(chat_template) async def create_tokenize( self, diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py new file mode 100644 index 0000000000000..5d5d53784fedf --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -0,0 +1,5 @@ +from .abstract_tool_parser import ToolParser +from .hermes_tool_parser import Hermes2ProToolParser +from .mistral_tool_parser import MistralToolParser + +__all__ = ["ToolParser", "Hermes2ProToolParser", "MistralToolParser"] \ No newline at end of file diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py new file mode 100644 index 0000000000000..b0807e6f1e782 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -0,0 +1,58 @@ +from typing import Dict, List, Sequence, Union + +from vllm.entrypoints.openai.protocol import (DeltaMessage, + ExtractedToolCallInformation) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class ToolParser: + """ + Abstract ToolParser class that should not be used directly. Provided + properties and methods should be used in + derived classes. + """ + + def __init__(self, tokenizer: AnyTokenizer): + self.prev_tool_call_arr: List[Dict] = [] + # the index of the tool call that is currently being parsed + self.current_tool_id: int = -1 + self.current_tool_name_sent: bool = False + self.current_tool_initial_sent: bool = False + self.streamed_args_for_tool: List[str] = [] + + self.model_tokenizer = tokenizer + + def extract_tool_calls(self, + model_output: str) -> ExtractedToolCallInformation: + """ + Static method that should be implemented for extracting tool calls from + a complete model-generated string. + Used for non-streaming responses where we have the entire model response + available before sending to the client. + Static because it's stateless. 
+ """ + raise NotImplementedError( + "AbstractToolParser.extract_tool_calls has not been implemented!") + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting tool calls + from an incomplete response; for use when handling tool calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + raise NotImplementedError( + "AbstractToolParser.extract_tool_calls_streaming has not been " + "implemented!") diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py new file mode 100644 index 0000000000000..7afbca7162edf --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -0,0 +1,344 @@ +import json +import re +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + InitialDeltaToolCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser) +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer + +logger = init_logger(__name__) + + +class Hermes2ProToolParser(ToolParser): + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if isinstance(self.model_tokenizer, MistralTokenizer): + logger.error( + "Detected Mistral tokenizer when using a Hermes model") + self.model_tokenizer = self.model_tokenizer.tokenizer + + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: List[Dict] = [] + self.current_tool_id: int = -1 + self.current_tool_name_sent = False + self.current_tool_initial_sent: bool = False + self.streamed_args_for_tool: List[str] = [ + ] # map what has been streamed for each tool so far to a list + + self.tool_call_start_token: str = "" + self.tool_call_end_token: str = "" + + self.tool_call_regex = re.compile( + r"(.*?)|(.*)", re.DOTALL) + self.scratch_pad_regex = re.compile( + r"(.*?)", re.DOTALL) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction.") + self.tool_call_start_token_id: int = self.model_tokenizer.vocab[ + self.tool_call_start_token] + self.tool_call_end_token_id: int = self.model_tokenizer.vocab[ + self.tool_call_end_token] + if not self.tool_call_start_token_id or not self.tool_call_end_token_id: + raise RuntimeError( + "Hermes 2 Pro Tool parser could not locate tool call start/end " + "tokens in the tokenizer!") + + def extract_tool_calls(self, + model_output: str) -> ExtractedToolCallInformation: + + # sanity check; avoid unnecessary processing + if self.tool_call_start_token not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + else: + + try: + # there are two possible captures - between tags, or between a + # tag and end-of-string so the result of + # findall is 
an array of tuples where one is a function call and + # the other is None + function_call_tuples = ( + self.tool_call_regex.findall(model_output)) + + # load the JSON, and then use it to build the Function and + # Tool Call + raw_function_calls = [ + json.loads(match[0] if match[0] else match[1]) + for match in function_call_tuples + ] + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"]))) + for function_call in raw_function_calls + ] + + content = model_output[:model_output. + find(self.tool_call_start_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if content else None) + + except Exception as e: + logger.error("Error in extracting tool call from response %s", + e) + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + + logger.debug("delta_text: %s", delta_text) + logger.debug("delta_token_ids: %s", delta_token_ids) + # check to see if we should be streaming a tool call - is there a + if self.tool_call_start_token_id not in current_token_ids: + logger.debug("No tool call tokens found!") + return DeltaMessage(content=delta_text) + + try: + + # figure out where we are in the parsing by counting tool call + # start & end tags + prev_tool_start_count = previous_token_ids.count( + self.tool_call_start_token_id) + prev_tool_end_count = previous_token_ids.count( + self.tool_call_end_token_id) + cur_tool_start_count = current_token_ids.count( + self.tool_call_start_token_id) + cur_tool_end_count = current_token_ids.count( + self.tool_call_end_token_id) + + # case: if we're generating text, OR rounding out a tool call + if (cur_tool_start_count == cur_tool_end_count + and prev_tool_end_count == cur_tool_end_count): + logger.debug("Generating text content! skipping tool parsing.") + if delta_text != self.tool_call_end_token: + return DeltaMessage(content=delta_text) + + # case: if tool open & close tag counts don't match, we're doing + # imaginary "else" block here + # something with tools with this diff. + # flags for partial JSON parting. 
exported constants from + # "Allow" are handled via BIT MASK + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + + # case -- we're starting a new tool call + if (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count > prev_tool_start_count): + if len(delta_token_ids) > 1: + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + else: + tool_call_portion = None + delta = None + + text_portion = None + + # set cursors and state appropriately + self.current_tool_id += 1 + self.current_tool_name_sent = False + self.current_tool_initial_sent = False + self.streamed_args_for_tool.append("") + logger.debug("Starting on a new tool %s", self.current_tool_id) + + # case -- we're updating an existing tool call + elif (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count == prev_tool_start_count): + + # get the portion of the text that's the tool call + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + text_portion = None + + # case -- the current tool call is being closed. + elif (cur_tool_start_count == cur_tool_end_count + and cur_tool_end_count > prev_tool_end_count): + diff = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments") + if diff: + diff = json.dumps(diff).replace( + self.streamed_args_for_tool[self.current_tool_id], "") + logger.debug( + "Finishing tool and found diff that had not " + "been streamed yet: %s", diff) + self.streamed_args_for_tool[self.current_tool_id] \ + += diff + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump( + exclude_none=True)) + ]) + + # case -- otherwise we're just generating text + else: + text = delta_text.replace(self.tool_call_start_token, "") + text = text.replace(self.tool_call_end_token, "") + delta = DeltaMessage(tool_calls=[], content=text) + return delta + + try: + + current_tool_call = partial_json_parser.loads( + tool_call_portion or "{}", + flags) if tool_call_portion else None + logger.debug("Parsed tool call %s", current_tool_call) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # case - we haven't sent the initial delta with the tool call ID + # (it will be sent) + if not self.current_tool_initial_sent: + self.current_tool_initial_sent = True + return DeltaMessage(tool_calls=[ + InitialDeltaToolCall( + index=self.current_tool_id).model_dump( + exclude_none=True) + ]) + + # case - we haven't sent the tool name yet. If it's available, send + # it. otherwise, wait until it's available. + elif not self.current_tool_name_sent: + function_name: Union[str, None] = current_tool_call.get("name") + if function_name: + self.current_tool_name_sent = True + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + else: + return None + # case -- otherwise, send the tool call delta + + # if the tool call portion is None, send the delta as text + if tool_call_portion is None: + # if there's text but not tool calls, send that - + # otherwise None to skip chunk + delta = DeltaMessage(content=delta_text) \ + if text_portion is not None else None + return delta + + # now, the nitty-gritty of tool calls + # now we have the portion to parse as tool call. 
+ + logger.debug("Trying to parse current tool call with ID %s", + self.current_tool_id) + + # if we're starting a new tool call, push an empty object in as + # a placeholder for the arguments + if len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + + # main logic for tool parsing here - compare prev. partially-parsed + # JSON to the current partially-parsed JSON + prev_arguments = ( + self.prev_tool_call_arr[self.current_tool_id].get("arguments")) + cur_arguments = current_tool_call.get("arguments") + + logger.debug("diffing old arguments: %s", prev_arguments) + logger.debug("against new ones: %s", cur_arguments) + + # case -- no arguments have been created yet. skip sending a delta. + if not cur_arguments and not prev_arguments: + logger.debug("Skipping text %s - no arguments", delta_text) + delta = None + + # case -- prev arguments are defined, but non are now. + # probably impossible, but not a fatal error - just keep going + elif not cur_arguments and prev_arguments: + logger.error("should be impossible to have arguments reset " + "mid-call. skipping streaming anything.") + delta = None + + # case -- we now have the first info about arguments available from + # autocompleting the JSON + elif cur_arguments and not prev_arguments: + + cur_arguments_json = json.dumps(cur_arguments) + logger.debug("finding %s in %s", delta_text, + cur_arguments_json) + + # get the location where previous args differ from current + args_delta_start_loc = cur_arguments_json.index(delta_text) \ + + len(delta_text) + + # use that to find the actual delta + arguments_delta = cur_arguments_json[:args_delta_start_loc] + logger.debug("First tokens in arguments received: %s", + arguments_delta) + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[self.current_tool_id] \ + += arguments_delta + + # last case -- we have an update to existing arguments. + elif cur_arguments and prev_arguments: + + cur_args_json = json.dumps(cur_arguments) + prev_args_json = json.dumps(prev_arguments) + logger.debug("Searching for diff between\n%s", cur_args_json) + logger.debug("and\n%s", prev_args_json) + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json) + logger.debug("got argument diff %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[self.current_tool_id] \ + += argument_diff + + # handle saving the state for the current tool into + # the "prev" list for use in diffing for the next iteration + if self.current_tool_id == len(self.prev_tool_call_arr) - 1: + self.prev_tool_call_arr[self.current_tool_id] = \ + current_tool_call + else: + self.prev_tool_call_arr.append(current_tool_call) + + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + return None # do not stream a delta. skip this token ID. 
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py new file mode 100644 index 0000000000000..d48770c792e98 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -0,0 +1,293 @@ +import json +import re +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + InitialDeltaToolCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser) +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer + +logger = init_logger(__name__) + + +class MistralToolParser(ToolParser): + """ + Tool call parser for Mistral 7B Instruct v0.3, intended for use with the + examples/tool_chat_template_mistral.jinja template. + + Used when --enable-auto-tool-choice --tool-call-parser gmistral are all set + """ + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if isinstance(self.model_tokenizer, MistralTokenizer): + self.model_tokenizer = self.model_tokenizer.tokenizer + else: + logger.info("Non-Mistral tokenizer detected when using a Mistral " + "model...") + + # initialize properties used for state when parsing tool calls in + # streaming mode + self.prev_tool_call_arr: List[Dict] = [] + self.current_tool_id: int = -1 + self.current_tool_name_sent: bool = False + self.current_tool_initial_sent: bool = False + self.streamed_args_for_tool: List[str] = [ + ] # map what has been streamed for each tool so far to a list + self.bot_token = "[TOOL_CALLS]" + self.bot_token_id = self.model_tokenizer.vocab[self.bot_token] + self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) + + def extract_tool_calls(self, + model_output: str) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. Requires + find-and-replacing single quotes with double quotes for JSON parsing, + make sure your tool call arguments don't ever include quotes! + """ + + # case -- if a tool call token is not present, return a text response + if self.bot_token not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + try: + + # use a regex to find the tool call. 
remove the BOT token + # and make sure to replace single quotes with double quotes + raw_tool_call = self.tool_call_regex.findall( + model_output.replace(self.bot_token, ""))[0] + + # load the JSON, and then use it to build the Function and + # Tool Call + function_call_arr = json.loads(raw_tool_call) + tool_calls: List[ToolCall] = [ + ToolCall( + type="function", + function=FunctionCall( + name=raw_function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(raw_function_call["arguments"]))) + for raw_function_call in function_call_arr + ] + + # get any content before the tool call + content = model_output.split(self.bot_token)[0] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if len(content) > 0 else None) + + except Exception as e: + logger.error("Error in extracting tool call from response: %s", e) + print("ERROR", e) + # return information to just treat the tool call as regular JSON + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + + # if the tool call token is not in the tokens generated so far, append + # output to contents since it's not a tool + if self.bot_token_id not in current_token_ids: + return DeltaMessage(content=delta_text) + + # if the tool call token ID IS in the tokens generated so far, that + # means we're parsing as tool calls now + + # handle if we detected the BOT token which means the start of tool + # calling + if (self.bot_token_id in delta_token_ids + and len(delta_token_ids) == 1): + # if it's the only token, return None, so we don't send a chat + # completion any don't send a control token + return None + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + try: + + # replace BOT token with empty string, and convert single quotes + # to double to allow parsing as JSON since mistral uses single + # quotes instead of double for tool calls + parsable_arr = current_text.split(self.bot_token)[1] + + # tool calls are generated in an array, so do partial JSON + # parsing on the entire array + try: + tool_call_arr: List[Dict] = partial_json_parser.loads( + parsable_arr, flags) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # select as the current tool call the one we're on the state at + + current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + if len(tool_call_arr) > 0 else {} + + # case -- if no tokens have been streamed for the tool, e.g. + # only the array brackets, stream nothing + if len(tool_call_arr) == 0: + return None + + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + elif (len(tool_call_arr) > 0 + and len(tool_call_arr) > self.current_tool_id + 1): + + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. 
+ if self.current_tool_id >= 0: + diff: Union[str, None] = current_tool_call.get("arguments") + + if diff: + diff = json.dumps(diff).replace( + self.streamed_args_for_tool[self.current_tool_id], + "") + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += diff + else: + delta = None + else: + delta = None + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.current_tool_initial_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # case: update an existing tool - this is handled below + + # if the current tool initial data incl. the id, type=function + # and idx not sent, send that + if not self.current_tool_initial_sent: + self.current_tool_initial_sent = True + delta = DeltaMessage(tool_calls=[ + InitialDeltaToolCall( + index=self.current_tool_id).model_dump( + exclude_none=True) + ]) + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + elif not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + self.current_tool_name_sent = True + else: + delta = None + + # now we know we're on the same tool call and we're streaming + # arguments + else: + + prev_arguments = self.prev_tool_call_arr[ + self.current_tool_id].get("arguments") + cur_arguments = current_tool_call.get("arguments") + + new_text = delta_text.replace("\'", "\"") + + if not cur_arguments and not prev_arguments: + + delta = None + elif not cur_arguments and prev_arguments: + logger.error( + "INVARIANT - impossible to have arguments reset " + "mid-arguments") + delta = None + elif cur_arguments and not prev_arguments: + cur_arguments_json = json.dumps(cur_arguments) + logger.debug("finding %s in %s", new_text, + cur_arguments_json) + + arguments_delta = cur_arguments_json[:cur_arguments_json. + index(new_text) + + len(new_text)] + logger.debug("First tokens in arguments received: %s", + arguments_delta) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += arguments_delta + + elif cur_arguments and prev_arguments: + cur_args_json = json.dumps(cur_arguments) + prev_args_json = json.dumps(prev_arguments) + logger.debug("Searching for diff between \n%s\n%s", + cur_args_json, prev_args_json) + + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json) + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + else: + # try parsing it with regular JSON - if it works we're + # at the end, and we need to send the difference between + # tokens streamed so far and the valid JSON + delta = None + + # check to see if the name is defined and has been sent. 
if so, + # stream the name - otherwise keep waiting + # finish by setting old and returning None as base case + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py new file mode 100644 index 0000000000000..db7fc5259fc4e --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -0,0 +1,87 @@ +def find_common_prefix(s1: str, s2: str) -> str: + """ + Finds a common prefix that is shared between two strings, if there is one. + Order of arguments is NOT important. + + This function is provided as a UTILITY for extracting information from JSON + generated by partial_json_parser, to help in ensuring that the right tokens + are returned in streaming, so that close-quotes, close-brackets and + close-braces are not returned prematurely. + + e.g. find_common_prefix('{"fruit": "ap"}', '{"fruit": "apple"}') -> + '{"fruit": "ap' + """ + prefix = '' + min_length = min(len(s1), len(s2)) + for i in range(0, min_length): + if s1[i] == s2[i]: + prefix += s1[i] + else: + break + return prefix + + +def find_common_suffix(s1: str, s2: str) -> str: + """ + Finds a common suffix shared between two strings, if there is one. Order of + arguments is NOT important. + Stops when the suffix ends OR it hits an alphanumeric character + + e.g. find_common_suffix('{"fruit": "ap"}', '{"fruit": "apple"}') -> '"}' + """ + suffix = '' + min_length = min(len(s1), len(s2)) + for i in range(1, min_length + 1): + if s1[-i] == s2[-i] and not s1[-i].isalnum(): + suffix = s1[-i] + suffix + else: + break + return suffix + + +def extract_intermediate_diff(curr: str, old: str) -> str: + """ + Given two strings, extract the difference in the middle between two strings + that are known to have a common prefix and/or suffix. + + This function is provided as a UTILITY for extracting information from JSON + generated by partial_json_parser, to help in ensuring that the right tokens + are returned in streaming, so that close-quotes, close-brackets and + close-braces are not returned prematurely. The order of arguments IS + important - the new version of the partially-parsed JSON must be the first + argument, and the secnod argument must be from the previous generation. + + What it returns, is tokens that should be streamed to the client. + + e.g. extract_intermediate_diff('{"fruit": "apple"}', '{"fruit": "ap"}') + -> 'ple' + + """ + suffix = find_common_suffix(curr, old) + + old = old[::-1].replace(suffix[::-1], '', 1)[::-1] + prefix = find_common_prefix(curr, old) + diff = curr + if len(suffix): + diff = diff[::-1].replace(suffix[::-1], '', 1)[::-1] + + if len(prefix): + # replace the prefix only once in case it's mirrored + diff = diff.replace(prefix, '', 1) + + return diff + + +def find_all_indices(string, substring): + """ + Find all (starting) indices of a substring in a given string. 
Useful for + tool call extraction + """ + indices = [] + index = -1 + while True: + index = string.find(substring, index + 1) + if index == -1: + break + indices.append(index) + return indices diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index f9fcdead980a2..7161e83952a3d 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -59,8 +59,9 @@ def _adapt_request_for_tool_use(request: Union[CompletionRequest, if type(request) is CompletionRequest: return request - # user has chosen to not use any tool - if request.tool_choice == "none": + # user has chosen to not use any tool, + # OR is allowing the model to choose a tool. + if request.tool_choice == "none" or request.tool_choice == "auto": return request # user has chosen to use a named tool diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index bfc658ef7d26b..e1f5b380120c5 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -8,8 +8,9 @@ from pydantic import BaseModel from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - CompletionRequest) +from vllm.entrypoints.openai.protocol import ( + ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, + CompletionRequest) from vllm.model_executor.guided_decoding.guided_fields import ( GuidedDecodingRequest) from vllm.model_executor.guided_decoding.outlines_logits_processors import ( @@ -101,16 +102,30 @@ def _get_guide_and_mode( request: Union[CompletionRequest, ChatCompletionRequest, GuidedDecodingRequest] ) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]: + # if the request is a chat completion request, AND the tool choice is a + # named tool choice, do guided decoding + # using that tool as the JSON schema + if isinstance(request, ChatCompletionRequest) and isinstance( + request.tool_choice, ChatCompletionNamedToolChoiceParam): + # Guided generation for tools/functions parameters + if request.tool_choice.type == "function": + for tool in request.tools: + if (tool.type == "function" and tool.function.name + == request.tool_choice.function.name): + json = json_dumps(tool.function.parameters, sort_keys=True) + return json, GuidedDecodingMode.JSON + return None, None - if request.guided_json: - json = request.guided_json - if isinstance(json, dict): + elif request.guided_json: + if isinstance(request.guided_json, dict): # turn dict into hashable string - json = json_dumps(json) - elif isinstance(json, BaseModel): + json = json_dumps(request.guided_json) + elif isinstance(request.guided_json, BaseModel): # use pydantic signature so that different model classes # with the same fields will get hashed the same - json = str(json.__signature__) + json = str(request.guided_json.__signature__) + else: + json = request.guided_json return json, GuidedDecodingMode.JSON elif request.guided_regex: return request.guided_regex, GuidedDecodingMode.REGEX From 77d9e514a2284d5d0bd34b1518b9483ae7d8a05a Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 4 Sep 2024 13:23:22 -0700 Subject: [PATCH 44/51] [MISC] Replace input token throughput with total token throughput (#8164) Co-authored-by: Michael Goin --- benchmarks/benchmark_serving.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_serving.py 
b/benchmarks/benchmark_serving.py index e38ceaa222956..84f366bdba387 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -56,8 +56,8 @@ class BenchmarkMetrics: total_input: int total_output: int request_throughput: float - input_throughput: float output_throughput: float + total_token_throughput: float mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float @@ -283,8 +283,8 @@ def calculate_metrics( total_input=total_input, total_output=sum(actual_output_lens), request_throughput=completed / dur_s, - input_throughput=total_input / dur_s, output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, @@ -426,10 +426,10 @@ async def benchmark( metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) - print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", - metrics.input_throughput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) result = { "duration": benchmark_duration, @@ -437,8 +437,8 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, - "input_throughput": metrics.input_throughput, "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], "output_lens": actual_output_lens, "ttfts": [output.ttft for output in outputs], From 008cf886c9361e696f70a15a282d72b58686468a Mon Sep 17 00:00:00 2001 From: Harsha vardhan manoj Bikki <39381063+hbikki@users.noreply.github.com> Date: Wed, 4 Sep 2024 16:33:43 -0700 Subject: [PATCH 45/51] =?UTF-8?q?[Neuron]=20Adding=20support=20for=20addin?= =?UTF-8?q?g/=20overriding=20neuron=20configuration=20a=E2=80=A6=20(#8062)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Harsha Bikki --- ...line_inference_neuron_int8_quantization.py | 50 ++++++++++++++ vllm/config.py | 69 +++++++++++-------- vllm/engine/arg_utils.py | 17 ++++- vllm/engine/llm_engine.py | 2 + .../layers/quantization/__init__.py | 3 + .../layers/quantization/neuron_quant.py | 67 ++++++++++++++++++ vllm/model_executor/model_loader/neuron.py | 65 ++++++++++++++--- vllm/worker/neuron_model_runner.py | 12 +++- 8 files changed, 243 insertions(+), 42 deletions(-) create mode 100644 examples/offline_inference_neuron_int8_quantization.py create mode 100644 vllm/model_executor/layers/quantization/neuron_quant.py diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference_neuron_int8_quantization.py new file mode 100644 index 0000000000000..8ec17e3400953 --- /dev/null +++ b/examples/offline_inference_neuron_int8_quantization.py @@ -0,0 +1,50 @@ +import os + +from vllm import LLM, SamplingParams + +# creates XLA hlo graphs for all the context length buckets. +os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" +# creates XLA hlo graphs for all the token gen buckets. +os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" +# Quantizes neuron model weight to int8 , +# The default config for quantization is int8 dtype. 
+os.environ['NEURON_QUANT_DTYPE'] = "s8" + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM( + model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. + # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=2048, + block_size=2048, + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + quantization="neuron_quant", + override_neuron_config={ + "cast_logits_dtype": "bfloat16", + }, + tensor_parallel_size=2) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm/config.py b/vllm/config.py index b84d91d402370..9b3f4f9206300 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,8 +1,8 @@ import enum import json from dataclasses import dataclass, field, fields -from typing import (TYPE_CHECKING, ClassVar, List, Mapping, Optional, Tuple, - Type, Union) +from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Mapping, + Optional, Tuple, Type, Union) import torch from transformers import PretrainedConfig @@ -115,35 +115,39 @@ class ModelConfig: the model name will be the same as `model`. limit_mm_per_prompt: Maximum number of data instances per modality per prompt. Only applicable for multimodal models. + override_neuron_config: Initialize non default neuron config or + override default neuron config that are specific to Neuron devices, + this argument will be used to configure the neuron config that + can not be gathered from the vllm arguments. 
""" def __init__( - self, - model: str, - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - dtype: Union[str, torch.dtype], - seed: int, - revision: Optional[str] = None, - code_revision: Optional[str] = None, - rope_scaling: Optional[dict] = None, - rope_theta: Optional[float] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - spec_target_max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - quantization_param_path: Optional[str] = None, - enforce_eager: Optional[bool] = None, - max_context_len_to_capture: Optional[int] = None, - max_seq_len_to_capture: Optional[int] = None, - max_logprobs: int = 20, - disable_sliding_window: bool = False, - skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, - limit_mm_per_prompt: Optional[Mapping[str, int]] = None, - use_async_output_proc: bool = True, - ) -> None: + self, + model: str, + tokenizer: str, + tokenizer_mode: str, + trust_remote_code: bool, + dtype: Union[str, torch.dtype], + seed: int, + revision: Optional[str] = None, + code_revision: Optional[str] = None, + rope_scaling: Optional[dict] = None, + rope_theta: Optional[float] = None, + tokenizer_revision: Optional[str] = None, + max_model_len: Optional[int] = None, + spec_target_max_model_len: Optional[int] = None, + quantization: Optional[str] = None, + quantization_param_path: Optional[str] = None, + enforce_eager: Optional[bool] = None, + max_context_len_to_capture: Optional[int] = None, + max_seq_len_to_capture: Optional[int] = None, + max_logprobs: int = 20, + disable_sliding_window: bool = False, + skip_tokenizer_init: bool = False, + served_model_name: Optional[Union[str, List[str]]] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, + use_async_output_proc: bool = True, + override_neuron_config: Optional[Dict[str, Any]] = None) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -227,6 +231,9 @@ def __init__( limit_mm_per_prompt) if not self.skip_tokenizer_init: self._verify_tokenizer_mode() + + self.override_neuron_config = override_neuron_config if is_neuron( + ) else None self._verify_embedding_mode() self._verify_quantization() self._verify_cuda_graph() @@ -275,6 +282,7 @@ def _verify_quantization(self) -> None: "experts_int8" ] tpu_supported_quantization = ["tpu_int8"] + neuron_supported_quantization = ["neuron_quant"] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -329,6 +337,11 @@ def _verify_quantization(self) -> None: "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" " is not set, enabling VLLM_USE_TRITON_AWQ.") envs.VLLM_USE_TRITON_AWQ = True + if is_neuron( + ) and self.quantization not in neuron_supported_quantization: + raise ValueError( + f"{self.quantization} quantization is currently not " + f"supported in Neuron Backend.") def _verify_cuda_graph(self) -> None: if self.max_seq_len_to_capture is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8dbe6504d21bd..f0b866db64324 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -2,8 +2,8 @@ import dataclasses import json from dataclasses import dataclass -from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type, - Union) +from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, + Type, Union) import torch @@ -149,6 +149,7 @@ class EngineArgs: otlp_traces_endpoint: Optional[str] = None 
collect_detailed_traces: Optional[str] = None disable_async_output_proc: bool = False + override_neuron_config: Optional[Dict[str, Any]] = None def __post_init__(self): if self.tokenizer is None: @@ -742,6 +743,16 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.disable_async_output_proc, help="Disable async output processing. This may result in " "lower performance.") + parser.add_argument( + '--override-neuron-config', + type=lambda configs: { + str(key): value + for key, value in + (config.split(':') for config in configs.split(',')) + }, + default=None, + help="override or set neuron device configuration.") + return parser @classmethod @@ -802,7 +813,7 @@ def create_engine_config(self) -> EngineConfig: served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, use_async_output_proc=not self.disable_async_output_proc, - ) + override_neuron_config=self.override_neuron_config) cache_config = CacheConfig( block_size=self.block_size if self.device != "neuron" else self.max_model_len, # neuron needs block_size = max_model_len diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7da4f7b25db9e..50dcb6937eb6f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -214,6 +214,7 @@ def __init__( "Initializing an LLM engine (v%s) with config: " "model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " + "override_neuron_config=%s, " "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, " "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " @@ -232,6 +233,7 @@ def __init__( model_config.skip_tokenizer_init, model_config.tokenizer_mode, model_config.revision, + model_config.override_neuron_config, model_config.rope_scaling, model_config.rope_theta, model_config.tokenizer_revision, diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 95b160f4287f9..c6fb6ca0d2e01 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -22,6 +22,8 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQMarlin24Config) from vllm.model_executor.layers.quantization.marlin import MarlinConfig +from vllm.model_executor.layers.quantization.neuron_quant import ( + NeuronQuantConfig) from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig @@ -46,6 +48,7 @@ "bitsandbytes": BitsAndBytesConfig, "qqq": QQQConfig, "experts_int8": ExpertsInt8Config, + "neuron_quant": NeuronQuantConfig, } diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py new file mode 100644 index 0000000000000..2624981f6a614 --- /dev/null +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -0,0 +1,67 @@ +import os +from importlib.util import find_spec +from typing import Any, Dict, List, Optional + +from torch.nn import Module + +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + +SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn'] + + +class NeuronQuantConfig(QuantizationConfig): + """Int8 Quantization Config class for Neuron Backend.""" + + def __init__( + self, + dequant_dtype: str = "f16", + 
quantize_method: str = "vector_dynamic", + ) -> None: + self.quant_dtype = os.getenv("NEURON_QUANT_DTYPE", "s8") + if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST: + raise ValueError( + f"Neuron quantization datatype {self.quant_dtype} is not valid," + f"the quantization datatype should match one of the below types" + f"{SUPPORTED_QUANT_DTYPE_LIST}") + self.dequant_dtype = dequant_dtype + self.quantize_method = quantize_method + + def get_name(self) -> str: + return "neuron_quant" + + def get_supported_act_dtypes(self) -> List[str]: + return SUPPORTED_QUANT_DTYPE_LIST + + @classmethod + def get_min_capability(cls) -> int: + raise NotImplementedError( + "This function should not be called with Neuron Backend") + + @staticmethod + def get_config_filenames() -> List[str]: + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "NeuronQuantConfig": + quantize_method = cls.get_from_keys(config, ["quantize_method"]) + dequant_dtype = cls.get_from_keys(config, ["dequant_dtype"]) + return cls(dequant_dtype=dequant_dtype, + quantize_method=quantize_method) + + def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]: + if find_spec("transformers_neuronx") is not None: + return self.get_quantization_config() + else: + raise NotImplementedError( + "Neuron Quantization is only supported through" + " transformers_neuronx.") + + def get_scaled_act_names(self) -> List[str]: + return [] + + def get_quantization_config(self): + from transformers_neuronx.config import QuantizationConfig + return QuantizationConfig(quant_dtype=self.quant_dtype, + dequant_dtype=self.dequant_dtype, + quantize_method=self.quantize_method) diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index 7396ac833e782..594ae442ef328 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -10,6 +10,7 @@ from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import get_quantization_config from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -81,8 +82,7 @@ def load_weights(self, model_name_or_path: str, **kwargs): neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) split_model_dir = f"{model_name_or_path}-split" - if os.path.isdir(os.path.join(model_name_or_path, - "pytorch_model.bin")): + if _is_pretrained_neuron_checkpoint(model_name_or_path): split_model_dir = model_name_or_path elif not os.path.exists(f"{model_name_or_path}-split"): hf_model_cls = getattr(transformers, hf_model_cls_name) @@ -97,6 +97,23 @@ def load_weights(self, model_name_or_path: str, **kwargs): self.model.to_neuron() +def _is_pretrained_neuron_checkpoint(model_name_or_path: str) -> bool: + # Checking if the neuron checkpoint is saved in the old format. + if os.path.isdir(os.path.join(model_name_or_path, "pytorch_model.bin")): + return True + # Checking if the neuron checkpoint is saved in the new format. 
+ pretrained_split_files = ["config.json", "generation_config.json"] + pretrained_split_format = ".safetensors" + for file in pretrained_split_files: + file_path = os.path.join(model_name_or_path, file) + if not os.path.isfile(file_path): + return False + for file in os.listdir(model_name_or_path): + if file.endswith(pretrained_split_format): + return True + return False + + def _get_model_architecture(config: PretrainedConfig) -> str: architectures = getattr(config, "architectures", []) for arch in architectures: @@ -119,19 +136,51 @@ def _get_buckets(env: str, default_value: List[int]) -> List[int]: return buckets_list +def _get_default_neuron_config(model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig): + from transformers_neuronx.config import ContinuousBatchingConfig + from transformers_neuronx.constants import LAYOUT_BSH + + continuous_batching_config = ContinuousBatchingConfig( + batch_size_for_shared_caches=scheduler_config.max_num_seqs) + quant_config = dict( + dequant_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], + quantize_method="vector_dynamic") + neuron_quantization_config_builder = lambda quant: get_quantization_config( + quant).from_config(quant_config).get_quant_method(None, "") + # TODO: Add Paged attention config to the default neuron arguments. + default_neuron_args = dict( + collectives_layout=LAYOUT_BSH, + attention_layout=LAYOUT_BSH, + fuse_qkv=True, + quant=neuron_quantization_config_builder(model_config.quantization) + if model_config.quantization else None, + continuous_batching=continuous_batching_config, + weight_tiling=bool(model_config.quantization)) + return default_neuron_args + + +def _get_neuron_config_after_override(default_neuron_config, + overridden_neuron_config): + from transformers_neuronx.config import NeuronConfig + overridden_neuron_config = overridden_neuron_config or {} + default_neuron_config.update(overridden_neuron_config) + return NeuronConfig(**default_neuron_config) + + def get_neuron_model(model_config: ModelConfig, parallel_config: ParallelConfig, scheduler_config: SchedulerConfig) -> nn.Module: - from transformers_neuronx.config import (ContinuousBatchingConfig, - NeuronConfig) # Create a model instance. model = NeuronCasualLM(model_config.hf_config) - continuous_batching_config = ContinuousBatchingConfig( - batch_size_for_shared_caches=scheduler_config.max_num_seqs) - neuron_config = NeuronConfig( - continuous_batching=continuous_batching_config) + default_neuron_config_args = _get_default_neuron_config( + model_config, parallel_config, scheduler_config) + + neuron_config = _get_neuron_config_after_override( + default_neuron_config_args, model_config.override_neuron_config) context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS", [scheduler_config.max_model_len]) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index f3defffdfa520..0cf7445d4388d 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from importlib.util import find_spec from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch @@ -76,9 +77,14 @@ def __init__( self.model: nn.Module # initialize after load_model. 
def load_model(self) -> None: - self.model = get_neuron_model(self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + if find_spec("transformers_neuronx") is not None: + self.model = get_neuron_model( + self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + else: + raise NotImplementedError( + "Supports only Transformer-NeuronX based models.") def _prepare_prompt( self, From 32e7db25365415841ebc7c4215851743fbb1bad1 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 4 Sep 2024 16:34:27 -0700 Subject: [PATCH 46/51] Bump version to v0.6.0 (#8166) --- vllm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/version.py b/vllm/version.py index 052eb76b5873c..039f6369b8ed5 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -9,4 +9,4 @@ stacklevel=2) __commit__ = "COMMIT_HASH_PLACEHOLDER" -__version__ = "0.5.5" +__version__ = "0.6.0" From e01c2beb7d1df1f388051f083a20ae9c0d552027 Mon Sep 17 00:00:00 2001 From: Maureen McElaney Date: Wed, 4 Sep 2024 19:50:13 -0400 Subject: [PATCH 47/51] [Doc] [Misc] Create CODE_OF_CONDUCT.md (#8161) --- CODE_OF_CONDUCT.md | 128 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000..f801b5f8f5513 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. 
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official email address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline/IRL event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement in the #code-of-conduct
+channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
+version 2.1, available at
+[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
+
+For answers to common questions about this code of conduct, see the
+[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq).
Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). + From 1afc931987d0c0e12bb3fde7908e768222916385 Mon Sep 17 00:00:00 2001 From: William Lin Date: Wed, 4 Sep 2024 17:35:36 -0700 Subject: [PATCH 48/51] [bugfix] >1.43 constraint for openai (#8169) Co-authored-by: Michael Goin --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index 447fd32311c09..e430753357ca0 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -9,7 +9,7 @@ tokenizers >= 0.19.1 # Required for Llama 3. protobuf # Required by LlamaTokenizer. fastapi aiohttp -openai >= 1.0 # Ensure modern openai package (ensure types module present) +openai >= 1.40.0 # Ensure modern openai package (ensure types module present) uvicorn[standard] pydantic >= 2.8 # Required for OpenAI server. pillow # Required for image processing From 4624d98dbdd6f29a3d8ba7a86d93bde730ef5f7d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 4 Sep 2024 20:31:48 -0700 Subject: [PATCH 49/51] [Misc] Clean up RoPE forward_native (#8076) --- .../model_executor/layers/rotary_embedding.py | 95 ++++--------------- 1 file changed, 19 insertions(+), 76 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index c5a0278e485d4..d323f6cc432a2 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -28,7 +28,6 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -48,21 +47,29 @@ def _apply_rotary_emb( x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, + is_neox_style: bool, ) -> torch.Tensor: """ Args: x: [num_tokens, num_heads, head_size] cos: [num_tokens, head_size // 2] sin: [num_tokens, head_size // 2] + is_neox_style: Whether to use the Neox-style or GPT-J-style rotary + positional embeddings. """ - orig_dtype = x.dtype - x = x.float() - x1, x2 = torch.chunk(x, 2, dim=-1) - cos = cos.unsqueeze(-2) - sin = sin.unsqueeze(-2) + cos = cos.unsqueeze(-2).to(x.dtype) + sin = sin.unsqueeze(-2).to(x.dtype) + if is_neox_style: + x1, x2 = torch.chunk(x, 2, dim=-1) + else: + x1 = x[..., ::2] + x2 = x[..., 1::2] o1 = x1 * cos - x2 * sin o2 = x2 * cos + x1 * sin - return torch.cat((o1, o2), dim=-1).to(orig_dtype) + if is_neox_style: + return torch.cat((o1, o2), dim=-1) + else: + return torch.stack((o1, o2), dim=-1).flatten(-2) class RotaryEmbedding(CustomOp): @@ -87,10 +94,9 @@ def __init__( cache = self._compute_cos_sin_cache() cache = cache.to(dtype) + self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) - self.use_native2 = current_platform.is_tpu() and is_neox_style - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: """Compute the inverse frequency.""" # NOTE(woosuk): To exactly match the HF implementation, we need to @@ -119,59 +125,7 @@ def forward_native( key: torch.Tensor, offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - """A PyTorch-native implementation equivalent to forward(). - - This method mimics the implementation of the custom CUDA kernel - used in `forward_cuda()`. 
-        """
-        query = query.view(*query.shape[:-1], -1, self.head_size)
-        key = key.view(*key.shape[:-1], -1, self.head_size)
-
-        query_rot = query[..., :self.rotary_dim]
-        key_rot = key[..., :self.rotary_dim]
-        if self.rotary_dim < self.head_size:
-            query_pass = query[..., self.rotary_dim:]
-            key_pass = key[..., self.rotary_dim:]
-
-        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(
-            positions.device, dtype=query.dtype)
-        cos_sin = self.cos_sin_cache[torch.add(positions, offsets)
-                                     if offsets is not None else positions]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-        if self.is_neox_style:
-            # NOTE(woosuk): Here we assume that the positions tensor has the
-            # shape [batch_size, seq_len].
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
-        else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
-
-        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
-        query_rot = query_rot * cos + rotate_fn(query_rot) * sin
-        key_rot = key_rot * cos + rotate_fn(key_rot) * sin
-
-        if self.rotary_dim < self.head_size:
-            query = torch.cat((query_rot, query_pass), dim=-1)
-            key = torch.cat((key_rot, key_pass), dim=-1)
-        else:
-            query = query_rot
-            key = key_rot
-        query = query.flatten(-2)
-        key = key.flatten(-2)
-        return query, key
-
-    def forward_native2(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Another PyTorch-native implementation of forward().
-
-        This method might perform better than `forward_native()` when compiled.
-        """
+        """A PyTorch-native implementation of forward()."""
         if offsets is not None:
             positions = positions + offsets
         positions = positions.flatten()
@@ -183,14 +137,14 @@ def forward_native2(
         query = query.view(num_tokens, -1, self.head_size)
         query_rot = query[..., :self.rotary_dim]
         query_pass = query[..., self.rotary_dim:]
-        query_rot = _apply_rotary_emb(query_rot, cos, sin)
+        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

         key_shape = key.shape
         key = key.view(num_tokens, -1, self.head_size)
         key_rot = key[..., :self.rotary_dim]
         key_pass = key[..., self.rotary_dim:]
-        key_rot = _apply_rotary_emb(key_rot, cos, sin)
+        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key

@@ -203,7 +157,7 @@ def forward_cuda(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         from vllm import _custom_ops as ops

-        self.cos_sin_cache = self.cos_sin_cache.to(positions.device,
+        self.cos_sin_cache = self.cos_sin_cache.to(query.device,
                                                    dtype=query.dtype)
         # ops.rotary_embedding()/batched_rotary_embedding()
         # are in-place operations that update the query and key tensors.
@@ -240,17 +194,6 @@ def forward_xpu(
                                  self.cos_sin_cache, self.is_neox_style)
         return query, key

-    def forward_tpu(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        forward_fn = (self.forward_native2
-                      if self.use_native2 else self.forward_native)
-        return forward_fn(positions, query, key, offsets)
-
     def extra_repr(self) -> str:
         s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
         s += f", max_position_embeddings={self.max_position_embeddings}"

From ba262c4e5aa9fa753c8cedfaea5c42941184a0db Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Wed, 4 Sep 2024 20:33:12 -0700
Subject: [PATCH 50/51] [ci] Mark LoRA test as soft-fail (#8160)

Signed-off-by: kevin
---
 .buildkite/test-pipeline.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d50d8f32a816d..b2874750a777e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -368,6 +368,7 @@ steps:
 - label: LoRA Long Context (Distributed) # 11min
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   num_gpus: 4
+  soft_fail: true
   source_file_dependencies:
   - vllm/lora
   - tests/lora/test_long_context

From e39ebf5cf5ec8f7449d633b6428333a99a206a1c Mon Sep 17 00:00:00 2001
From: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Date: Wed, 4 Sep 2024 22:12:26 -0700
Subject: [PATCH 51/51] [Core/Bugfix] Add query dtype as per FlashInfer API
 requirements. (#8173)

---
 tests/kernels/test_flashinfer.py      | 3 ++-
 vllm/attention/backends/flashinfer.py | 9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py
index 67f12cf1ee08e..696cc0c6cdf10 100644
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -445,7 +445,8 @@ def test_flashinfer_decode_with_paged_fp8_kv(
         head_size,
         block_size,
         "NONE",
-        data_type=dtype)
+        data_type=dtype,
+        q_data_type=dtype)
     output = wrapper.forward(query,
                              kv_cache_fp8,
                              logits_soft_cap=soft_cap,
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index aa9d4a71dbf87..7aec8203eb1e5 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -224,6 +224,7 @@ def graph_capture_get_metadata_for_batch(self, batch_size: int):
             query_start_loc=query_start_loc_host,
             device=self.runner.device,
             data_type=kv_cache_dtype,
+            q_data_type=self.runner.model_config.dtype,
             use_cuda_graph=True,
             decode_wrapper=self._graph_decode_wrapper,
             prefill_wrapper=None)
@@ -292,6 +293,8 @@ class FlashInferMetadata(AttentionMetadata):
     page_size: Optional[int] = None
     # The data type of the paged kv cache
     data_type: torch.dtype = None
+    # The data type of the query
+    q_data_type: torch.dtype = None
     device: torch.device = torch.device("cuda")
     is_profile_run: bool = False

@@ -353,7 +356,10 @@ def begin_forward(self):
             self.page_size,
             # Disable flashinfer's pos encoding and use vllm's rope.
             pos_encoding_mode="NONE",
-            data_type=self.data_type)
+            # kv-cache data type.
+            data_type=self.data_type,
+            # query data type.
+            q_data_type=self.q_data_type)

     def asdict_zerocopy(self,
                         skip_fields: Optional[Set[str]] = None
@@ -617,6 +623,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             query_start_loc=query_start_loc,
             device=device,
             data_type=kv_cache_dtype,
+            q_data_type=self.runner.model_config.dtype,
             use_cuda_graph=use_captured_graph,
             is_profile_run=self.is_profile_run)
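
A note on the Neuron changes in this series: the `--override-neuron-config` flag is parsed as a comma-separated list of `key:value` pairs. The sketch below mirrors the parsing lambda added to `vllm/engine/arg_utils.py`; the example keys are illustrative only, since the set of accepted keys (and their expected types) is ultimately defined by `transformers_neuronx`'s `NeuronConfig`, which the merged dictionary is passed to.

```python
# Standalone sketch of the --override-neuron-config parsing shown above.
# Values are kept as raw strings, exactly as in the CLI lambda; the example
# keys below are illustrative, not an exhaustive or validated list.
def parse_override_neuron_config(configs: str) -> dict:
    return {
        str(key): value
        for key, value in (config.split(":") for config in configs.split(","))
    }


overrides = parse_override_neuron_config("fuse_qkv:true,weight_tiling:false")
print(overrides)  # {'fuse_qkv': 'true', 'weight_tiling': 'false'}
```

As in `_get_neuron_config_after_override`, the parsed overrides are merged on top of the defaults from `_get_default_neuron_config` before `NeuronConfig(**...)` is constructed, so any type conversion is left to `transformers_neuronx`.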
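The consolidated PyTorch-native rotary helper from PATCH 49 can also be exercised in isolation. In the sketch below, `_apply_rotary_emb` mirrors the function in the diff; the surrounding setup (inverse frequencies following the same formula as `_compute_inv_freq`, a random query, and a norm check) is illustrative scaffolding only, not vLLM code.

```python
import torch


def _apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
                      is_neox_style: bool) -> torch.Tensor:
    # Mirrors the helper in the diff: x is [num_tokens, num_heads, rot_dim],
    # cos/sin are [num_tokens, rot_dim // 2].
    cos = cos.unsqueeze(-2).to(x.dtype)
    sin = sin.unsqueeze(-2).to(x.dtype)
    if is_neox_style:
        x1, x2 = torch.chunk(x, 2, dim=-1)
    else:
        x1, x2 = x[..., ::2], x[..., 1::2]
    o1 = x1 * cos - x2 * sin
    o2 = x2 * cos + x1 * sin
    if is_neox_style:
        return torch.cat((o1, o2), dim=-1)
    return torch.stack((o1, o2), dim=-1).flatten(-2)


# Illustrative scaffolding: build cos/sin with the usual RoPE inverse
# frequencies and check that the rotation preserves per-token, per-head
# norms for both the Neox-style and GPT-J-style layouts.
num_tokens, num_heads, head_size, base = 4, 2, 8, 10000.0
inv_freq = 1.0 / (base**(torch.arange(0, head_size, 2, dtype=torch.float) /
                         head_size))
freqs = torch.einsum("i,j->ij",
                     torch.arange(num_tokens, dtype=torch.float), inv_freq)
cos, sin = freqs.cos(), freqs.sin()

query = torch.randn(num_tokens, num_heads, head_size)
for neox in (True, False):
    out = _apply_rotary_emb(query, cos, sin, is_neox_style=neox)
    # RoPE rotates each two-element pair, so norms must be unchanged.
    assert torch.allclose(query.norm(dim=-1), out.norm(dim=-1), atol=1e-5)
```

Because the helper is a pure rotation of paired elements, this norm check is a cheap sanity test that exercises both branches the patch unifies.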