From 1295df57e8a783b175be0ca411938f71f6f71527 Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 19 Jun 2024 07:40:07 +0000 Subject: [PATCH 1/5] add log & init --- .../api_openai_backend/query_client.py | 10 ++++---- llm_on_ray/inference/predictor_deployment.py | 6 +++++ .../inference/predictors/hpu_predictor.py | 23 ++++++++++--------- llm_on_ray/inference/serve.py | 14 +++++------ 4 files changed, 31 insertions(+), 22 deletions(-) diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py index 569be40dc..101d02d46 100644 --- a/llm_on_ray/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -54,10 +54,12 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep top_p = request_config.get("top_p", 1.0) max_new_tokens = request_config.get("max_tokens", None) gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p} - if temperature != 1.0 or top_p != 1.0: - gen_config.update({"do_sample": True}) - if request_config.get("ignore_eos", False): - gen_config.update({"ignore_eos": True}) + gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0}) + gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)}) + + print("SSSSSS3:", request_config) + print("SSSSSS4:", gen_config) # no use + # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down async for x in handle_request( model=model, diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index a1055915d..de7fca9f3 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -396,8 +396,11 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} + print("SSSSSS1:", config) + print("SSSSSS2:", input) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) + print("SSSSSS6:", prompts) # Handle streaming response if streaming_response: @@ -416,8 +419,11 @@ async def openai_call( ): self.use_openai = True + # TODO: print input inside preprocess_prompts later + print("SSSSSS5:", input) # return prompt or list of prompts preprocessed input = self.preprocess_prompts(input, tools, tool_choice) + print("SSSSSS7:", input) # Handle streaming response if streaming_response: diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 5e19c8733..74e128f2e 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -79,9 +79,16 @@ def __init__(self, infer_conf: InferenceConfig): # decide correct torch dtype for loading HF model decide_torch_dtype(infer_conf) + print("SSSSSS8:", infer_conf) + self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs + # optimize transformers for gaudi + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + if infer_conf.deepspeed: # DeepSpeed is enabled, start worker group # Prepare placement group @@ -105,13 +112,6 @@ def __init__(self, infer_conf: InferenceConfig): htcore.hpu_set_env() - # Tweak transformer to optimize 
performance on Gaudi - from optimum.habana.transformers.modeling_utils import ( - adapt_transformers_to_gaudi, - ) - - adapt_transformers_to_gaudi() - self.device = torch.device("hpu") model = AutoModelForCausalLM.from_pretrained( model_desc.model_id_or_path, **model_desc.config.dict() @@ -181,6 +181,7 @@ def _process_config(self, config): def get_streamer(self): if self.infer_conf.deepspeed: + # Q2: Why always use the first worker? return ray.get(self.deepspeed_workers[0].get_streamer.remote()) else: return TextIteratorStreamer( @@ -196,6 +197,8 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput: self._process_config(config) + # TODO: Maybe we should get realtime load info of all cards, set a heathy usage ratio and pick the usable cards for serving. + # So that some errors like OOM can be prevented, and the server will be more robust. if self.infer_conf.deepspeed: return ray.get( [worker.generate.remote(prompt, **config) for worker in self.deepspeed_workers] @@ -219,7 +222,9 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput: def streaming_generate(self, prompt, streamer, **config): self._process_config(config) + # Q1: Why it is handled here when using both deepspeed and hpu? if self.infer_conf.deepspeed: + # Q2: Why always use the first worker? self.deepspeed_workers[0].streaming_generate.remote(prompt, streamer, **config) for worker in self.deepspeed_workers[1:]: worker.streaming_generate.remote(prompt, self._create_dummy_streamer(), **config) @@ -284,10 +289,6 @@ def load_model_and_tokenizer(self): self.world_size = int(os.environ["WORLD_SIZE"]) self.local_rank = int(os.environ["LOCAL_RANK"]) self.device = torch.device("hpu") - # optimize transformers for gaudi - from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - adapt_transformers_to_gaudi() self.load_model() model_desc = self.infer_conf.model_description self.tokenizer = load_tokenizer(self.model, model_desc.tokenizer_name_or_path) diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index ecd3bdee8..c8016c745 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -41,14 +41,14 @@ def get_deployed_models(args): set(all_models_name) ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." model_list = {model: all_models[model] for model in models} + print("--config_file is not set while --models is set, serving model(s):", model_list) else: model_list = all_models + print("--config_file and --models is not set, serving all models:", model_list) else: - # config_file has precedence over others - if args.config_file: - print("Reading from config file, " + args.config_file) - with open(args.config_file, "r") as f: - infer_conf = parse_yaml_raw_as(InferenceConfig, f) + print("Reading from config file, " + args.config_file) + with open(args.config_file, "r") as f: + infer_conf = parse_yaml_raw_as(InferenceConfig, f) model_list = {} model_list[infer_conf.name] = infer_conf @@ -147,6 +147,8 @@ def main(argv=None): ray.init(address="auto") deployments, model_list = get_deployed_models(args) + print("Service is running with deployments:" + str(deployments)) + print("Service is running models:" + str(model_list)) if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files. 
@@ -156,8 +158,6 @@ def main(argv=None): # all models are served under the same URL and then accessed # through model_id, so it needs to pass in a unified URL. host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" - print("Service is running with deployments:" + str(deployments)) - print("Service is running models:" + str(model_list)) openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests) msg = "Service is deployed successfully." From 651bb4bc64f9b541e459bf44ffa67bc24977ee0a Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 20 Jun 2024 06:38:11 +0000 Subject: [PATCH 2/5] init --- .../api_server_openai/query_openai_sdk.py | 8 ++++++ .../api_openai_backend/query_client.py | 4 +-- llm_on_ray/inference/inference_config.py | 1 + llm_on_ray/inference/predictor_deployment.py | 23 ++++++++++------- .../inference/predictors/hpu_predictor.py | 5 +++- llm_on_ray/inference/serve.py | 25 +++++++++++++++---- 6 files changed, 49 insertions(+), 17 deletions(-) diff --git a/examples/inference/api_server_openai/query_openai_sdk.py b/examples/inference/api_server_openai/query_openai_sdk.py index 586a59f3a..29ce08092 100644 --- a/examples/inference/api_server_openai/query_openai_sdk.py +++ b/examples/inference/api_server_openai/query_openai_sdk.py @@ -40,6 +40,12 @@ help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation", ) +parser.add_argument( + "--debug_mode", + action="store_true", + help="If debug mode is enabled, debug logs will be printed", +) + args = parser.parse_args() if "OPENAI_API_KEY" in os.environ: @@ -65,6 +71,7 @@ def stream_chat(): max_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, + debug_mode=args.debug_mode, ): content = chunk.choices[0].delta.content if content is not None: @@ -81,6 +88,7 @@ def chunk_chat(): max_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, + debug_mode=args.debug_mode, ) for chunk in [output]: try: diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py index 101d02d46..5d664c5d7 100644 --- a/llm_on_ray/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -57,8 +57,8 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0}) gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)}) - print("SSSSSS3:", request_config) - print("SSSSSS4:", gen_config) # no use + if request_config.get("debug_mode", False): + print("DEBUG: print request_config:", request_config) # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down async for x in handle_request( diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 7d405c7c7..6579626dc 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -171,6 +171,7 @@ class InferenceConfig(BaseModel): ipex: Ipex = Ipex() hpu_model_config: HpuModelConfig = HpuModelConfig() model_description: ModelDescription = ModelDescription() + debug_mode: bool = False # prevent warning of protected namespaces # DO NOT TOUCH diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index de7fca9f3..ac8eda08e 100644 --- 
a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -396,11 +396,13 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - print("SSSSSS1:", config) - print("SSSSSS2:", input) + if config.get("debug_mode", False): + print("DEBUG:predictor_deployment.py:print config received from json:", config) + print("DEBUG:predictor_deployment.py::print inputs for prompts:", input) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) - print("SSSSSS6:", prompts) + if config.get("debug_mode", False): + print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts) # Handle streaming response if streaming_response: @@ -419,15 +421,18 @@ async def openai_call( ): self.use_openai = True - # TODO: print input inside preprocess_prompts later - print("SSSSSS5:", input) + # TODO: Pass down config into preprocess_prompts for more logs. + if config.get("debug_mode", False): + print("DEBUG:predictor_deployment.py:print config received from query_client:", config) + print("DEBUG:predictor_deployment.py::print inputs for prompts:", input) # return prompt or list of prompts preprocessed - input = self.preprocess_prompts(input, tools, tool_choice) - print("SSSSSS7:", input) + prompts = self.preprocess_prompts(input, tools, tool_choice) + if config.get("debug_mode", False): + print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts) # Handle streaming response if streaming_response: - async for result in self.handle_streaming(input, config): + async for result in self.handle_streaming(prompts, config): yield result else: - yield await self.handle_non_streaming(input, config) + yield await self.handle_non_streaming(prompts, config) diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 74e128f2e..38877051e 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -79,7 +79,10 @@ def __init__(self, infer_conf: InferenceConfig): # decide correct torch dtype for loading HF model decide_torch_dtype(infer_conf) - print("SSSSSS8:", infer_conf) + debug_mode = infer_conf.debug_mode + + if debug_mode: + print("DEBUG:hpu_predictor:print inference config:", infer_conf) self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index c8016c745..9d7427af0 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -41,12 +41,21 @@ def get_deployed_models(args): set(all_models_name) ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." 
model_list = {model: all_models[model] for model in models} - print("--config_file is not set while --models is set, serving model(s):", model_list) + if args.debug_mode: + print( + "DEBUG:serve.py: --config_file is not set while --models is set, serving model(s):", + model_list, + ) else: model_list = all_models - print("--config_file and --models is not set, serving all models:", model_list) + if args.debug_mode: + print( + "DEBUG:serve.py: --config_file and --models is not set, serving all models:", + model_list, + ) else: - print("Reading from config file, " + args.config_file) + if args.debug_mode: + print("DEBUG:serve.py: Reading from config file, " + args.config_file) with open(args.config_file, "r") as f: infer_conf = parse_yaml_raw_as(InferenceConfig, f) model_list = {} @@ -131,6 +140,11 @@ def main(argv=None): parser.add_argument( "--max_batch_size", default=None, type=int, help="The max batch size for dynamic batching." ) + parser.add_argument( + "--debug_mode", + action="store_true", + help="If debug mode is enabled, debug logs will be printed", + ) # Print help if no arguments were provided if len(sys.argv) == 1: @@ -147,8 +161,9 @@ def main(argv=None): ray.init(address="auto") deployments, model_list = get_deployed_models(args) - print("Service is running with deployments:" + str(deployments)) - print("Service is running models:" + str(model_list)) + if args.debug_mode: + print("DEBUG:serve.py: Service is running with deployments:" + str(deployments)) + print("DEBUG:serve.py: Service is running models:" + str(model_list)) if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files. From 7f4a017a490e7ad4f083d53cebaaaa92f98d3faa Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 3 Jul 2024 07:34:39 +0000 Subject: [PATCH 3/5] fix --- .../api_openai_backend/query_client.py | 5 ++++- llm_on_ray/inference/predictor_deployment.py | 12 ++++++------ .../inference/predictors/hpu_predictor.py | 4 ---- llm_on_ray/inference/serve.py | 17 +++++++++-------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py index 5d664c5d7..b9fe6023b 100644 --- a/llm_on_ray/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -37,6 +37,9 @@ from fastapi import HTTPException from llm_on_ray.inference.api_openai_backend.openai_protocol import ModelCard, Prompt from llm_on_ray.inference.api_openai_backend.request_handler import handle_request +from llm_on_ray.inference.logger import get_logger + +logger = get_logger(__name__) class RouterQueryClient: @@ -58,7 +61,7 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)}) if request_config.get("debug_mode", False): - print("DEBUG: print request_config:", request_config) + logger.debug(f"Print request_config: {request_config}") # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down async for x in handle_request( diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index c9ec43a66..43f1990aa 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -395,12 +395,12 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON 
input = request["text"] config = request["config"] if config.get("debug_mode", False): - print("DEBUG:predictor_deployment.py:print config received from json:", config) - print("DEBUG:predictor_deployment.py::print inputs for prompts:", input) + logger.debug(f"Print config received from json: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) if config.get("debug_mode", False): - print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts) + logger.debug(f"Print prompts from inputs: {prompts}") # Handle streaming response if streaming_response: @@ -421,12 +421,12 @@ async def openai_call( # TODO: Pass down config into preprocess_prompts for more logs. if config.get("debug_mode", False): - print("DEBUG:predictor_deployment.py:print config received from query_client:", config) - print("DEBUG:predictor_deployment.py::print inputs for prompts:", input) + logger.debug(f"Print config received from query_client: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input, tools, tool_choice) if config.get("debug_mode", False): - print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts) + logger.debug(f"Print prompts from inputs: {prompts}") # Handle streaming response if streaming_response: diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 38877051e..20e7bd8d8 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -184,7 +184,6 @@ def _process_config(self, config): def get_streamer(self): if self.infer_conf.deepspeed: - # Q2: Why always use the first worker? return ray.get(self.deepspeed_workers[0].get_streamer.remote()) else: return TextIteratorStreamer( @@ -200,8 +199,6 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput: self._process_config(config) - # TODO: Maybe we should get realtime load info of all cards, set a heathy usage ratio and pick the usable cards for serving. - # So that some errors like OOM can be prevented, and the server will be more robust. if self.infer_conf.deepspeed: return ray.get( [worker.generate.remote(prompt, **config) for worker in self.deepspeed_workers] @@ -227,7 +224,6 @@ def streaming_generate(self, prompt, streamer, **config): self._process_config(config) # Q1: Why it is handled here when using both deepspeed and hpu? if self.infer_conf.deepspeed: - # Q2: Why always use the first worker? self.deepspeed_workers[0].streaming_generate.remote(prompt, streamer, **config) for worker in self.deepspeed_workers[1:]: worker.streaming_generate.remote(prompt, self._create_dummy_streamer(), **config) diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index 9d7427af0..6e7e8b73b 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -25,6 +25,9 @@ InferenceConfig, all_models, ) +from llm_on_ray.inference.logger import get_logger + +logger = get_logger(__name__) def get_deployed_models(args): @@ -42,16 +45,14 @@ def get_deployed_models(args): ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." 
model_list = {model: all_models[model] for model in models} if args.debug_mode: - print( - "DEBUG:serve.py: --config_file is not set while --models is set, serving model(s):", - model_list, + logger.debug( + f"--config_file is not set while --models is set, serving model(s): {model_list}" ) else: model_list = all_models if args.debug_mode: - print( - "DEBUG:serve.py: --config_file and --models is not set, serving all models:", - model_list, + logger.debug( + f"--config_file and --models is not set, serving all models: {model_list}" ) else: if args.debug_mode: @@ -162,8 +163,8 @@ def main(argv=None): ray.init(address="auto") deployments, model_list = get_deployed_models(args) if args.debug_mode: - print("DEBUG:serve.py: Service is running with deployments:" + str(deployments)) - print("DEBUG:serve.py: Service is running models:" + str(model_list)) + logger.debug(f"Service is running with deployments: {str(deployments)}") + logger.debug(f"Service is running models: {str(model_list)}") if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files. From 21c9c7ed79ce2576e1338b0a5c76eb51f44fe03d Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 8 Jul 2024 01:28:54 +0000 Subject: [PATCH 4/5] nit --- .../api_openai_backend/query_client.py | 3 +-- llm_on_ray/inference/predictor_deployment.py | 22 ++++++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py index b9fe6023b..0e52c7960 100644 --- a/llm_on_ray/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -60,8 +60,7 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0}) gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)}) - if request_config.get("debug_mode", False): - logger.debug(f"Print request_config: {request_config}") + logger.debug(f"Print request_config: {request_config}") # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down async for x in handle_request( diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 43f1990aa..448ce5197 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -394,13 +394,11 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON streaming_response = request["stream"] input = request["text"] config = request["config"] - if config.get("debug_mode", False): - logger.debug(f"Print config received from json: {config}") - logger.debug(f"Print inputs for prompts: {input}") + logger.debug(f"Print config received from json: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) - if config.get("debug_mode", False): - logger.debug(f"Print prompts from inputs: {prompts}") + logger.debug(f"Print prompts from inputs: {prompts}") # Handle streaming response if streaming_response: @@ -420,17 +418,15 @@ async def openai_call( self.use_openai = True # TODO: Pass down config into preprocess_prompts for more logs. 
- if config.get("debug_mode", False): - logger.debug(f"Print config received from query_client: {config}") - logger.debug(f"Print inputs for prompts: {input}") + logger.debug(f"Print config received from query_client: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed - prompts = self.preprocess_prompts(input, tools, tool_choice) - if config.get("debug_mode", False): - logger.debug(f"Print prompts from inputs: {prompts}") + input = self.preprocess_prompts(input, tools, tool_choice) + logger.debug(f"Print prompts from inputs: {input}") # Handle streaming response if streaming_response: - async for result in self.handle_streaming(prompts, config): + async for result in self.handle_streaming(input, config): yield result else: - yield await self.handle_non_streaming(prompts, config) + yield await self.handle_non_streaming(input, config) From 2358e3e7b89b90befd682a70b8c3848573cb6bd7 Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 8 Jul 2024 01:41:02 +0000 Subject: [PATCH 5/5] remove --- .../api_server_openai/query_openai_sdk.py | 8 ------ llm_on_ray/inference/inference_config.py | 1 - .../inference/predictors/hpu_predictor.py | 8 +++--- llm_on_ray/inference/serve.py | 25 ++++++------------- 4 files changed, 11 insertions(+), 31 deletions(-) diff --git a/examples/inference/api_server_openai/query_openai_sdk.py b/examples/inference/api_server_openai/query_openai_sdk.py index 29ce08092..586a59f3a 100644 --- a/examples/inference/api_server_openai/query_openai_sdk.py +++ b/examples/inference/api_server_openai/query_openai_sdk.py @@ -40,12 +40,6 @@ help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation", ) -parser.add_argument( - "--debug_mode", - action="store_true", - help="If debug mode is enabled, debug logs will be printed", -) - args = parser.parse_args() if "OPENAI_API_KEY" in os.environ: @@ -71,7 +65,6 @@ def stream_chat(): max_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, - debug_mode=args.debug_mode, ): content = chunk.choices[0].delta.content if content is not None: @@ -88,7 +81,6 @@ def chunk_chat(): max_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, - debug_mode=args.debug_mode, ) for chunk in [output]: try: diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 6579626dc..7d405c7c7 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -171,7 +171,6 @@ class InferenceConfig(BaseModel): ipex: Ipex = Ipex() hpu_model_config: HpuModelConfig = HpuModelConfig() model_description: ModelDescription = ModelDescription() - debug_mode: bool = False # prevent warning of protected namespaces # DO NOT TOUCH diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 20e7bd8d8..97ec1dcb1 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -69,6 +69,9 @@ MllmPromptInput, ) from llm_on_ray.inference.utils import decide_torch_dtype +from llm_on_ray.inference.logger import get_logger + +logger = get_logger(__name__) class HPUPredictor(Predictor): @@ -79,10 +82,7 @@ def __init__(self, infer_conf: InferenceConfig): # decide correct torch dtype for loading HF model decide_torch_dtype(infer_conf) - debug_mode = infer_conf.debug_mode - - if debug_mode: - 
print("DEBUG:hpu_predictor:print inference config:", infer_conf) + logger.debug(f"Print inference config: {infer_conf}") self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index 6e7e8b73b..5304c7b3b 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -44,19 +44,14 @@ def get_deployed_models(args): set(all_models_name) ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." model_list = {model: all_models[model] for model in models} - if args.debug_mode: - logger.debug( - f"--config_file is not set while --models is set, serving model(s): {model_list}" - ) + logger.debug( + f"--config_file is not set while --models is set, serving model(s): {model_list}" + ) else: model_list = all_models - if args.debug_mode: - logger.debug( - f"--config_file and --models is not set, serving all models: {model_list}" - ) + logger.debug(f"--config_file and --models is not set, serving all models: {model_list}") else: - if args.debug_mode: - print("DEBUG:serve.py: Reading from config file, " + args.config_file) + print("DEBUG:serve.py: Reading from config file, " + args.config_file) with open(args.config_file, "r") as f: infer_conf = parse_yaml_raw_as(InferenceConfig, f) model_list = {} @@ -141,11 +136,6 @@ def main(argv=None): parser.add_argument( "--max_batch_size", default=None, type=int, help="The max batch size for dynamic batching." ) - parser.add_argument( - "--debug_mode", - action="store_true", - help="If debug mode is enabled, debug logs will be printed", - ) # Print help if no arguments were provided if len(sys.argv) == 1: @@ -162,9 +152,8 @@ def main(argv=None): ray.init(address="auto") deployments, model_list = get_deployed_models(args) - if args.debug_mode: - logger.debug(f"Service is running with deployments: {str(deployments)}") - logger.debug(f"Service is running models: {str(model_list)}") + logger.debug(f"Service is running with deployments: {str(deployments)}") + logger.debug(f"Service is running models: {str(model_list)}") if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files.