From 1295df57e8a783b175be0ca411938f71f6f71527 Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 19 Jun 2024 07:40:07 +0000 Subject: [PATCH 1/5] add log & init --- .../api_openai_backend/query_client.py | 10 ++++---- llm_on_ray/inference/predictor_deployment.py | 6 +++++ .../inference/predictors/hpu_predictor.py | 23 ++++++++++--------- llm_on_ray/inference/serve.py | 14 +++++------ 4 files changed, 31 insertions(+), 22 deletions(-) diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py index 569be40dc..101d02d46 100644 --- a/llm_on_ray/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -54,10 +54,12 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep top_p = request_config.get("top_p", 1.0) max_new_tokens = request_config.get("max_tokens", None) gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p} - if temperature != 1.0 or top_p != 1.0: - gen_config.update({"do_sample": True}) - if request_config.get("ignore_eos", False): - gen_config.update({"ignore_eos": True}) + gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0}) + gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)}) + + print("SSSSSS3:", request_config) + print("SSSSSS4:", gen_config) # no use + # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down async for x in handle_request( model=model, diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index a1055915d..de7fca9f3 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -396,8 +396,11 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} + print("SSSSSS1:", config) + print("SSSSSS2:", input) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) + print("SSSSSS6:", prompts) # Handle streaming response if streaming_response: @@ -416,8 +419,11 @@ async def openai_call( ): self.use_openai = True + # TODO: print input inside preprocess_prompts later + print("SSSSSS5:", input) # return prompt or list of prompts preprocessed input = self.preprocess_prompts(input, tools, tool_choice) + print("SSSSSS7:", input) # Handle streaming response if streaming_response: diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 5e19c8733..74e128f2e 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -79,9 +79,16 @@ def __init__(self, infer_conf: InferenceConfig): # decide correct torch dtype for loading HF model decide_torch_dtype(infer_conf) + print("SSSSSS8:", infer_conf) + self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs + # optimize transformers for gaudi + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + if infer_conf.deepspeed: # DeepSpeed is enabled, start worker group # Prepare placement group @@ -105,13 +112,6 @@ def __init__(self, infer_conf: InferenceConfig): htcore.hpu_set_env() - # Tweak transformer to optimize 
performance on Gaudi - from optimum.habana.transformers.modeling_utils import ( - adapt_transformers_to_gaudi, - ) - - adapt_transformers_to_gaudi() - self.device = torch.device("hpu") model = AutoModelForCausalLM.from_pretrained( model_desc.model_id_or_path, **model_desc.config.dict() @@ -181,6 +181,7 @@ def _process_config(self, config): def get_streamer(self): if self.infer_conf.deepspeed: + # Q2: Why always use the first worker? return ray.get(self.deepspeed_workers[0].get_streamer.remote()) else: return TextIteratorStreamer( @@ -196,6 +197,8 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput: self._process_config(config) + # TODO: Maybe we should get realtime load info of all cards, set a heathy usage ratio and pick the usable cards for serving. + # So that some errors like OOM can be prevented, and the server will be more robust. if self.infer_conf.deepspeed: return ray.get( [worker.generate.remote(prompt, **config) for worker in self.deepspeed_workers] @@ -219,7 +222,9 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput: def streaming_generate(self, prompt, streamer, **config): self._process_config(config) + # Q1: Why it is handled here when using both deepspeed and hpu? if self.infer_conf.deepspeed: + # Q2: Why always use the first worker? self.deepspeed_workers[0].streaming_generate.remote(prompt, streamer, **config) for worker in self.deepspeed_workers[1:]: worker.streaming_generate.remote(prompt, self._create_dummy_streamer(), **config) @@ -284,10 +289,6 @@ def load_model_and_tokenizer(self): self.world_size = int(os.environ["WORLD_SIZE"]) self.local_rank = int(os.environ["LOCAL_RANK"]) self.device = torch.device("hpu") - # optimize transformers for gaudi - from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - adapt_transformers_to_gaudi() self.load_model() model_desc = self.infer_conf.model_description self.tokenizer = load_tokenizer(self.model, model_desc.tokenizer_name_or_path) diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index ecd3bdee8..c8016c745 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -41,14 +41,14 @@ def get_deployed_models(args): set(all_models_name) ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." model_list = {model: all_models[model] for model in models} + print("--config_file is not set while --models is set, serving model(s):", model_list) else: model_list = all_models + print("--config_file and --models is not set, serving all models:", model_list) else: - # config_file has precedence over others - if args.config_file: - print("Reading from config file, " + args.config_file) - with open(args.config_file, "r") as f: - infer_conf = parse_yaml_raw_as(InferenceConfig, f) + print("Reading from config file, " + args.config_file) + with open(args.config_file, "r") as f: + infer_conf = parse_yaml_raw_as(InferenceConfig, f) model_list = {} model_list[infer_conf.name] = infer_conf @@ -147,6 +147,8 @@ def main(argv=None): ray.init(address="auto") deployments, model_list = get_deployed_models(args) + print("Service is running with deployments:" + str(deployments)) + print("Service is running models:" + str(model_list)) if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files. 
@@ -156,8 +158,6 @@ def main(argv=None): # all models are served under the same URL and then accessed # through model_id, so it needs to pass in a unified URL. host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" - print("Service is running with deployments:" + str(deployments)) - print("Service is running models:" + str(model_list)) openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests) msg = "Service is deployed successfully." From 651bb4bc64f9b541e459bf44ffa67bc24977ee0a Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 20 Jun 2024 06:38:11 +0000 Subject: [PATCH 2/5] init --- .../api_server_openai/query_openai_sdk.py | 8 ++++++ .../api_openai_backend/query_client.py | 4 +-- llm_on_ray/inference/inference_config.py | 1 + llm_on_ray/inference/predictor_deployment.py | 23 ++++++++++------- .../inference/predictors/hpu_predictor.py | 5 +++- llm_on_ray/inference/serve.py | 25 +++++++++++++++---- 6 files changed, 49 insertions(+), 17 deletions(-) diff --git a/examples/inference/api_server_openai/query_openai_sdk.py b/examples/inference/api_server_openai/query_openai_sdk.py index 586a59f3a..29ce08092 100644 --- a/examples/inference/api_server_openai/query_openai_sdk.py +++ b/examples/inference/api_server_openai/query_openai_sdk.py @@ -40,6 +40,12 @@ help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation", ) +parser.add_argument( + "--debug_mode", + action="store_true", + help="If debug mode is enabled, debug logs will be printed", +) + args = parser.parse_args() if "OPENAI_API_KEY" in os.environ: @@ -65,6 +71,7 @@ def stream_chat(): max_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, + debug_mode=args.debug_mode, ): content = chunk.choices[0].delta.content if content is not None: @@ -81,6 +88,7 @@ def chunk_chat(): max_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, + debug_mode=args.debug_mode, ) for chunk in [output]: try: diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py index 101d02d46..5d664c5d7 100644 --- a/llm_on_ray/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -57,8 +57,8 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0}) gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)}) - print("SSSSSS3:", request_config) - print("SSSSSS4:", gen_config) # no use + if request_config.get("debug_mode", False): + print("DEBUG: print request_config:", request_config) # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down async for x in handle_request( diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 7d405c7c7..6579626dc 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -171,6 +171,7 @@ class InferenceConfig(BaseModel): ipex: Ipex = Ipex() hpu_model_config: HpuModelConfig = HpuModelConfig() model_description: ModelDescription = ModelDescription() + debug_mode: bool = False # prevent warning of protected namespaces # DO NOT TOUCH diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index de7fca9f3..ac8eda08e 100644 --- 
a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -396,11 +396,13 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - print("SSSSSS1:", config) - print("SSSSSS2:", input) + if config.get("debug_mode", False): + print("DEBUG:predictor_deployment.py:print config received from json:", config) + print("DEBUG:predictor_deployment.py::print inputs for prompts:", input) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) - print("SSSSSS6:", prompts) + if config.get("debug_mode", False): + print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts) # Handle streaming response if streaming_response: @@ -419,15 +421,18 @@ async def openai_call( ): self.use_openai = True - # TODO: print input inside preprocess_prompts later - print("SSSSSS5:", input) + # TODO: Pass down config into preprocess_prompts for more logs. + if config.get("debug_mode", False): + print("DEBUG:predictor_deployment.py:print config received from query_client:", config) + print("DEBUG:predictor_deployment.py::print inputs for prompts:", input) # return prompt or list of prompts preprocessed - input = self.preprocess_prompts(input, tools, tool_choice) - print("SSSSSS7:", input) + prompts = self.preprocess_prompts(input, tools, tool_choice) + if config.get("debug_mode", False): + print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts) # Handle streaming response if streaming_response: - async for result in self.handle_streaming(input, config): + async for result in self.handle_streaming(prompts, config): yield result else: - yield await self.handle_non_streaming(input, config) + yield await self.handle_non_streaming(prompts, config) diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 74e128f2e..38877051e 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -79,7 +79,10 @@ def __init__(self, infer_conf: InferenceConfig): # decide correct torch dtype for loading HF model decide_torch_dtype(infer_conf) - print("SSSSSS8:", infer_conf) + debug_mode = infer_conf.debug_mode + + if debug_mode: + print("DEBUG:hpu_predictor:print inference config:", infer_conf) self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index c8016c745..9d7427af0 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -41,12 +41,21 @@ def get_deployed_models(args): set(all_models_name) ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." 
model_list = {model: all_models[model] for model in models} - print("--config_file is not set while --models is set, serving model(s):", model_list) + if args.debug_mode: + print( + "DEBUG:serve.py: --config_file is not set while --models is set, serving model(s):", + model_list, + ) else: model_list = all_models - print("--config_file and --models is not set, serving all models:", model_list) + if args.debug_mode: + print( + "DEBUG:serve.py: --config_file and --models is not set, serving all models:", + model_list, + ) else: - print("Reading from config file, " + args.config_file) + if args.debug_mode: + print("DEBUG:serve.py: Reading from config file, " + args.config_file) with open(args.config_file, "r") as f: infer_conf = parse_yaml_raw_as(InferenceConfig, f) model_list = {} @@ -131,6 +140,11 @@ def main(argv=None): parser.add_argument( "--max_batch_size", default=None, type=int, help="The max batch size for dynamic batching." ) + parser.add_argument( + "--debug_mode", + action="store_true", + help="If debug mode is enabled, debug logs will be printed", + ) # Print help if no arguments were provided if len(sys.argv) == 1: @@ -147,8 +161,9 @@ def main(argv=None): ray.init(address="auto") deployments, model_list = get_deployed_models(args) - print("Service is running with deployments:" + str(deployments)) - print("Service is running models:" + str(model_list)) + if args.debug_mode: + print("DEBUG:serve.py: Service is running with deployments:" + str(deployments)) + print("DEBUG:serve.py: Service is running models:" + str(model_list)) if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files. From 7f4a017a490e7ad4f083d53cebaaaa92f98d3faa Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 3 Jul 2024 07:34:39 +0000 Subject: [PATCH 3/5] fix --- .../api_openai_backend/query_client.py | 5 ++++- llm_on_ray/inference/predictor_deployment.py | 12 ++++++------ .../inference/predictors/hpu_predictor.py | 4 ---- llm_on_ray/inference/serve.py | 17 +++++++++-------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py index 5d664c5d7..b9fe6023b 100644 --- a/llm_on_ray/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -37,6 +37,9 @@ from fastapi import HTTPException from llm_on_ray.inference.api_openai_backend.openai_protocol import ModelCard, Prompt from llm_on_ray.inference.api_openai_backend.request_handler import handle_request +from llm_on_ray.inference.logger import get_logger + +logger = get_logger(__name__) class RouterQueryClient: @@ -58,7 +61,7 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)}) if request_config.get("debug_mode", False): - print("DEBUG: print request_config:", request_config) + logger.debug(f"Print request_config: {request_config}") # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down async for x in handle_request( diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index c9ec43a66..43f1990aa 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -395,12 +395,12 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON 
input = request["text"] config = request["config"] if config.get("debug_mode", False): - print("DEBUG:predictor_deployment.py:print config received from json:", config) - print("DEBUG:predictor_deployment.py::print inputs for prompts:", input) + logger.debug(f"Print config received from json: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) if config.get("debug_mode", False): - print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts) + logger.debug(f"Print prompts from inputs: {prompts}") # Handle streaming response if streaming_response: @@ -421,12 +421,12 @@ async def openai_call( # TODO: Pass down config into preprocess_prompts for more logs. if config.get("debug_mode", False): - print("DEBUG:predictor_deployment.py:print config received from query_client:", config) - print("DEBUG:predictor_deployment.py::print inputs for prompts:", input) + logger.debug(f"Print config received from query_client: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input, tools, tool_choice) if config.get("debug_mode", False): - print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts) + logger.debug(f"Print prompts from inputs: {prompts}") # Handle streaming response if streaming_response: diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 38877051e..20e7bd8d8 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -184,7 +184,6 @@ def _process_config(self, config): def get_streamer(self): if self.infer_conf.deepspeed: - # Q2: Why always use the first worker? return ray.get(self.deepspeed_workers[0].get_streamer.remote()) else: return TextIteratorStreamer( @@ -200,8 +199,6 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput: self._process_config(config) - # TODO: Maybe we should get realtime load info of all cards, set a heathy usage ratio and pick the usable cards for serving. - # So that some errors like OOM can be prevented, and the server will be more robust. if self.infer_conf.deepspeed: return ray.get( [worker.generate.remote(prompt, **config) for worker in self.deepspeed_workers] @@ -227,7 +224,6 @@ def streaming_generate(self, prompt, streamer, **config): self._process_config(config) # Q1: Why it is handled here when using both deepspeed and hpu? if self.infer_conf.deepspeed: - # Q2: Why always use the first worker? self.deepspeed_workers[0].streaming_generate.remote(prompt, streamer, **config) for worker in self.deepspeed_workers[1:]: worker.streaming_generate.remote(prompt, self._create_dummy_streamer(), **config) diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index 9d7427af0..6e7e8b73b 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -25,6 +25,9 @@ InferenceConfig, all_models, ) +from llm_on_ray.inference.logger import get_logger + +logger = get_logger(__name__) def get_deployed_models(args): @@ -42,16 +45,14 @@ def get_deployed_models(args): ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." 
model_list = {model: all_models[model] for model in models} if args.debug_mode: - print( - "DEBUG:serve.py: --config_file is not set while --models is set, serving model(s):", - model_list, + logger.debug( + f"--config_file is not set while --models is set, serving model(s): {model_list}" ) else: model_list = all_models if args.debug_mode: - print( - "DEBUG:serve.py: --config_file and --models is not set, serving all models:", - model_list, + logger.debug( + f"--config_file and --models is not set, serving all models: {model_list}" ) else: if args.debug_mode: @@ -162,8 +163,8 @@ def main(argv=None): ray.init(address="auto") deployments, model_list = get_deployed_models(args) if args.debug_mode: - print("DEBUG:serve.py: Service is running with deployments:" + str(deployments)) - print("DEBUG:serve.py: Service is running models:" + str(model_list)) + logger.debug(f"Service is running with deployments: {str(deployments)}") + logger.debug(f"Service is running models: {str(model_list)}") if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files. From 21c9c7ed79ce2576e1338b0a5c76eb51f44fe03d Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 8 Jul 2024 01:28:54 +0000 Subject: [PATCH 4/5] nit --- .../api_openai_backend/query_client.py | 3 +-- llm_on_ray/inference/predictor_deployment.py | 22 ++++++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py index b9fe6023b..0e52c7960 100644 --- a/llm_on_ray/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -60,8 +60,7 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0}) gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)}) - if request_config.get("debug_mode", False): - logger.debug(f"Print request_config: {request_config}") + logger.debug(f"Print request_config: {request_config}") # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down async for x in handle_request( diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 43f1990aa..448ce5197 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -394,13 +394,11 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON streaming_response = request["stream"] input = request["text"] config = request["config"] - if config.get("debug_mode", False): - logger.debug(f"Print config received from json: {config}") - logger.debug(f"Print inputs for prompts: {input}") + logger.debug(f"Print config received from json: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) - if config.get("debug_mode", False): - logger.debug(f"Print prompts from inputs: {prompts}") + logger.debug(f"Print prompts from inputs: {prompts}") # Handle streaming response if streaming_response: @@ -420,17 +418,15 @@ async def openai_call( self.use_openai = True # TODO: Pass down config into preprocess_prompts for more logs. 
- if config.get("debug_mode", False): - logger.debug(f"Print config received from query_client: {config}") - logger.debug(f"Print inputs for prompts: {input}") + logger.debug(f"Print config received from query_client: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed - prompts = self.preprocess_prompts(input, tools, tool_choice) - if config.get("debug_mode", False): - logger.debug(f"Print prompts from inputs: {prompts}") + input = self.preprocess_prompts(input, tools, tool_choice) + logger.debug(f"Print prompts from inputs: {input}") # Handle streaming response if streaming_response: - async for result in self.handle_streaming(prompts, config): + async for result in self.handle_streaming(input, config): yield result else: - yield await self.handle_non_streaming(prompts, config) + yield await self.handle_non_streaming(input, config) From 2358e3e7b89b90befd682a70b8c3848573cb6bd7 Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 8 Jul 2024 01:41:02 +0000 Subject: [PATCH 5/5] remove --- .../api_server_openai/query_openai_sdk.py | 8 ------ llm_on_ray/inference/inference_config.py | 1 - .../inference/predictors/hpu_predictor.py | 8 +++--- llm_on_ray/inference/serve.py | 25 ++++++------------- 4 files changed, 11 insertions(+), 31 deletions(-) diff --git a/examples/inference/api_server_openai/query_openai_sdk.py b/examples/inference/api_server_openai/query_openai_sdk.py index 29ce08092..586a59f3a 100644 --- a/examples/inference/api_server_openai/query_openai_sdk.py +++ b/examples/inference/api_server_openai/query_openai_sdk.py @@ -40,12 +40,6 @@ help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation", ) -parser.add_argument( - "--debug_mode", - action="store_true", - help="If debug mode is enabled, debug logs will be printed", -) - args = parser.parse_args() if "OPENAI_API_KEY" in os.environ: @@ -71,7 +65,6 @@ def stream_chat(): max_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, - debug_mode=args.debug_mode, ): content = chunk.choices[0].delta.content if content is not None: @@ -88,7 +81,6 @@ def chunk_chat(): max_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, - debug_mode=args.debug_mode, ) for chunk in [output]: try: diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 6579626dc..7d405c7c7 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -171,7 +171,6 @@ class InferenceConfig(BaseModel): ipex: Ipex = Ipex() hpu_model_config: HpuModelConfig = HpuModelConfig() model_description: ModelDescription = ModelDescription() - debug_mode: bool = False # prevent warning of protected namespaces # DO NOT TOUCH diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 20e7bd8d8..97ec1dcb1 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -69,6 +69,9 @@ MllmPromptInput, ) from llm_on_ray.inference.utils import decide_torch_dtype +from llm_on_ray.inference.logger import get_logger + +logger = get_logger(__name__) class HPUPredictor(Predictor): @@ -79,10 +82,7 @@ def __init__(self, infer_conf: InferenceConfig): # decide correct torch dtype for loading HF model decide_torch_dtype(infer_conf) - debug_mode = infer_conf.debug_mode - - if debug_mode: - 
print("DEBUG:hpu_predictor:print inference config:", infer_conf) + logger.debug(f"Print inference config: {infer_conf}") self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index 6e7e8b73b..5304c7b3b 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -44,19 +44,14 @@ def get_deployed_models(args): set(all_models_name) ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." model_list = {model: all_models[model] for model in models} - if args.debug_mode: - logger.debug( - f"--config_file is not set while --models is set, serving model(s): {model_list}" - ) + logger.debug( + f"--config_file is not set while --models is set, serving model(s): {model_list}" + ) else: model_list = all_models - if args.debug_mode: - logger.debug( - f"--config_file and --models is not set, serving all models: {model_list}" - ) + logger.debug(f"--config_file and --models is not set, serving all models: {model_list}") else: - if args.debug_mode: - print("DEBUG:serve.py: Reading from config file, " + args.config_file) + print("DEBUG:serve.py: Reading from config file, " + args.config_file) with open(args.config_file, "r") as f: infer_conf = parse_yaml_raw_as(InferenceConfig, f) model_list = {} @@ -141,11 +136,6 @@ def main(argv=None): parser.add_argument( "--max_batch_size", default=None, type=int, help="The max batch size for dynamic batching." ) - parser.add_argument( - "--debug_mode", - action="store_true", - help="If debug mode is enabled, debug logs will be printed", - ) # Print help if no arguments were provided if len(sys.argv) == 1: @@ -162,9 +152,8 @@ def main(argv=None): ray.init(address="auto") deployments, model_list = get_deployed_models(args) - if args.debug_mode: - logger.debug(f"Service is running with deployments: {str(deployments)}") - logger.debug(f"Service is running models: {str(model_list)}") + logger.debug(f"Service is running with deployments: {str(deployments)}") + logger.debug(f"Service is running models: {str(model_list)}") if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files.