From 9855aea21b6aec48b12cef3a1614e7796b970a73 Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Fri, 13 Dec 2024 17:08:23 -0800
Subject: [PATCH] [Bugfix][V1] Re-compute an entire block when fully cache hit
 (#11186)

Signed-off-by: Cody Yu
---
 vllm/v1/core/scheduler.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index a3e85c20cc664..f055eed77c372 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -199,9 +199,13 @@ def schedule(self) -> "SchedulerOutput":
             if num_new_tokens == 0:
                 # The happens when prompt length is divisible by the block
                 # size and all blocks are cached. Now we force to recompute
-                # the last token.
-                num_computed_tokens -= 1
-                num_new_tokens = 1
+                # the last block. Note that we have to re-compute an entire
+                # block because allocate_slots() assumes num_computed_tokens
+                # is always a multiple of the block size. This limitation
+                # can potentially be removed in the future to slightly
+                # improve the performance.
+                num_computed_tokens -= self.block_size
+                num_new_tokens = self.block_size
                 computed_blocks.pop()
             num_new_tokens = min(num_new_tokens, token_budget)
             assert num_new_tokens > 0
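
For context, below is a minimal standalone sketch of the adjustment this patch makes: when every prompt block is a prefix-cache hit, the scheduler rolls back a whole block (rather than a single token) so that num_computed_tokens remains a multiple of the block size, which allocate_slots() relies on. The function and parameter names (adjust_for_full_cache_hit, num_prompt_tokens) are illustrative stand-ins, not part of the vLLM scheduler API.

# Sketch of the cache-hit rollback performed in schedule(); names are
# illustrative, not the actual vLLM scheduler API.
def adjust_for_full_cache_hit(num_prompt_tokens: int, block_size: int,
                              computed_blocks: list) -> tuple[int, int, list]:
    """Return (num_computed_tokens, num_new_tokens, computed_blocks) after
    forcing recomputation when every prompt block is already cached."""
    num_computed_tokens = len(computed_blocks) * block_size
    num_new_tokens = num_prompt_tokens - num_computed_tokens
    if num_new_tokens == 0:
        # All prompt tokens hit the prefix cache. Roll back an entire block
        # (not just the last token) so num_computed_tokens stays a multiple
        # of block_size, as allocate_slots() assumes.
        num_computed_tokens -= block_size
        num_new_tokens = block_size
        computed_blocks.pop()
    return num_computed_tokens, num_new_tokens, computed_blocks


# Example: a 32-token prompt with block_size=16 and both blocks cached now
# schedules the last 16 tokens for recomputation instead of only 1.
print(adjust_for_full_cache_hit(32, 16, ["block0", "block1"]))
# -> (16, 16, ['block0'])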