From 84563c9b979df4ae567884806f5d717337406651 Mon Sep 17 00:00:00 2001 From: Cedar Date: Tue, 28 Jan 2025 09:48:36 -0800 Subject: [PATCH] Remove a residual line, left over from when requests stored only the last token, to fix a masking issue --- shortfin/python/shortfin_apps/llm/components/service.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/shortfin/python/shortfin_apps/llm/components/service.py b/shortfin/python/shortfin_apps/llm/components/service.py index 9f08e82b1..75dc47146 100644 --- a/shortfin/python/shortfin_apps/llm/components/service.py +++ b/shortfin/python/shortfin_apps/llm/components/service.py @@ -401,10 +401,7 @@ async def run(self): m.fill( 1 # Must pad with a nonzero value because a division by 0 during softmax floods clobber page (page 0) in cache with NaN values. ) - m.items = [ - req.start_position + len(req.input_token_ids) - for req in self.exec_requests - ] + m.items = [req.start_position + 1 for req in self.exec_requests] seq_lens_host.copy_to(seq_lens) # Populate cache pages.