From c8b7f555ddbe4b7822e81dc46a937ad169020d36 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Mon, 11 Dec 2023 08:44:10 +0000
Subject: [PATCH] add missing free_request method to Dummy cache manager

---
 serve/mlc_serve/model/dummy_model.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/serve/mlc_serve/model/dummy_model.py b/serve/mlc_serve/model/dummy_model.py
index 7d28586ebf..c86c6523c4 100644
--- a/serve/mlc_serve/model/dummy_model.py
+++ b/serve/mlc_serve/model/dummy_model.py
@@ -1,28 +1,18 @@
-from typing import Optional, Union
+from typing import Union
 from mlc_serve.engine import (
     ChatMessage,
-    DebugOptions,
-    FinishReason,
-    Request,
+    RequestState,
     RequestId,
-    RequestOutput,
-    SamplingParams,
-    StoppingCriteria,
     get_engine_config
 )
 from mlc_serve.model.base import ModelArtifactConfig
 from mlc_serve.engine.model_module import (
-    ConversationTemplate,
     DecodeRequest,
     KVCache,
-    KVCacheManager,
-    ModelModule,
     PrefillRequest,
     SequenceId,
     TextGenerationResult,
-    TextGenerator,
-    Tokenizer,
 )
 
 
 class DummyTokenizer:
@@ -74,6 +64,10 @@ def free(self, sequence_id: SequenceId):
             raise RuntimeError("Multiple generated sequences not supported")
         del self.cache.cached_requests[sequence_id.request_id]
 
+    def free_request(self, state: RequestState):
+        for gen_seq in state.generation_sequences:
+            self.free(gen_seq.seq_id)
+
     def get_kv_cache_size(self) -> int:
         return self.cache.max_cached_tokens
 
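Note (not part of the patch): the added free_request method simply walks a request's generation sequences and frees each one through the existing free method. Below is a minimal, self-contained usage sketch of that delegation. It uses hypothetical stand-in classes rather than the real mlc_serve.engine types, and the guard inside free is assumed from the raise/del context lines visible in the hunk above.

# --- Usage sketch (hypothetical stand-ins, not the real mlc_serve classes) ---
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass(frozen=True)
class SequenceId:
    request_id: int
    sequence_index: int


@dataclass
class GenerationSequence:
    seq_id: SequenceId


@dataclass
class RequestState:
    generation_sequences: List[GenerationSequence] = field(default_factory=list)


class DummyCache:
    def __init__(self, max_cached_tokens: int):
        self.max_cached_tokens = max_cached_tokens
        # request_id -> number of cached tokens (made-up bookkeeping for the sketch)
        self.cached_requests: Dict[int, int] = {}


class DummyCacheManager:
    def __init__(self, cache: DummyCache):
        self.cache = cache

    def free(self, sequence_id: SequenceId):
        # Guard assumed from the visible raise/del lines in the hunk above.
        if sequence_id.sequence_index > 0:
            raise RuntimeError("Multiple generated sequences not supported")
        del self.cache.cached_requests[sequence_id.request_id]

    # The method the patch adds: free every sequence owned by the request.
    def free_request(self, state: RequestState):
        for gen_seq in state.generation_sequences:
            self.free(gen_seq.seq_id)


cache = DummyCache(max_cached_tokens=4096)
cache.cached_requests[0] = 16  # pretend request 0 holds 16 cached tokens
manager = DummyCacheManager(cache)
manager.free_request(RequestState([GenerationSequence(SequenceId(0, 0))]))
assert 0 not in cache.cached_requests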