[Inference] Add debug mode #257

Open · wants to merge 6 commits into base: main
12 changes: 8 additions & 4 deletions llm_on_ray/inference/api_openai_backend/query_client.py
@@ -37,6 +37,9 @@
from fastapi import HTTPException
from llm_on_ray.inference.api_openai_backend.openai_protocol import ModelCard, Prompt
from llm_on_ray.inference.api_openai_backend.request_handler import handle_request
from llm_on_ray.inference.logger import get_logger

logger = get_logger(__name__)


class RouterQueryClient:
@@ -54,10 +57,11 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep
top_p = request_config.get("top_p", 1.0)
max_new_tokens = request_config.get("max_tokens", None)
gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
if temperature != 1.0 or top_p != 1.0:
gen_config.update({"do_sample": True})
if request_config.get("ignore_eos", False):
gen_config.update({"ignore_eos": True})
gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0})
gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)})

logger.debug(f"Print request_config: {request_config}")
# TODO: Set debug mode in request_config and propagate it into gen_config, since gen_config is the config that is passed down.

async for x in handle_request(
model=model,
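The TODO above suggests carrying a debug flag from request_config into gen_config. A minimal sketch of how that could look, written as a standalone helper rather than the inline code in RouterQueryClient.query(); the `debug` key and the `build_gen_config` helper are hypothetical and not part of this diff:

```python
from typing import Any, Dict


def build_gen_config(request_config: Dict[str, Any]) -> Dict[str, Any]:
    """Hypothetical helper mirroring the inline gen_config construction above."""
    temperature = request_config.get("temperature", 1.0)
    top_p = request_config.get("top_p", 1.0)
    gen_config = {
        "max_new_tokens": request_config.get("max_tokens", None),
        "temperature": temperature,
        "top_p": top_p,
        # Sampling is only enabled when the request deviates from the defaults.
        "do_sample": temperature != 1.0 or top_p != 1.0,
        "ignore_eos": request_config.get("ignore_eos", False),
    }
    if request_config.get("debug", False):
        # Hypothetical flag; downstream code would have to strip it before
        # forwarding gen_config to model.generate().
        gen_config["debug"] = True
    return gen_config
```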
9 changes: 8 additions & 1 deletion llm_on_ray/inference/predictor_deployment.py
@@ -394,8 +394,11 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON
streaming_response = request["stream"]
input = request["text"]
config = request["config"]

logger.debug(f"Print config received from json: {config}")
logger.debug(f"Print inputs for prompts: {input}")
# return prompt or list of prompts preprocessed
prompts = self.preprocess_prompts(input)
logger.debug(f"Print prompts from inputs: {prompts}")

# Handle streaming response
if streaming_response:
@@ -414,8 +417,12 @@ async def openai_call(
):
self.use_openai = True

# TODO: Pass down config into preprocess_prompts for more logs.
logger.debug(f"Print config received from query_client: {config}")
logger.debug(f"Print inputs for prompts: {input}")
# return prompt or list of prompts preprocessed
input = self.preprocess_prompts(input, tools, tool_choice)
logger.debug(f"Print prompts from inputs: {input}")

# Handle streaming response
if streaming_response:
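These diffs depend on llm_on_ray.inference.logger.get_logger, whose implementation is not shown in this PR. A minimal sketch of what such a factory typically looks like, assuming it wraps the standard logging module:

```python
import logging
import sys


def get_logger(name: str) -> logging.Logger:
    """Sketch of a module-level logger factory; the project's real helper may differ."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
        )
        logger.addHandler(handler)
    # The level is left to global configuration; DEBUG must be enabled
    # explicitly for the logger.debug() calls added in this PR to show up.
    return logger
```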
22 changes: 11 additions & 11 deletions llm_on_ray/inference/predictors/hpu_predictor.py
@@ -69,6 +69,9 @@
MllmPromptInput,
)
from llm_on_ray.inference.utils import decide_torch_dtype
from llm_on_ray.inference.logger import get_logger

logger = get_logger(__name__)


class HPUPredictor(Predictor):
@@ -79,9 +82,16 @@ def __init__(self, infer_conf: InferenceConfig):
# decide correct torch dtype for loading HF model
decide_torch_dtype(infer_conf)

logger.debug(f"Print inference config: {infer_conf}")

self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile
self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs

# optimize transformers for gaudi
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()

Comment on lines +90 to +94
Contributor:
Why move this function here?

Contributor Author:
Moved this call out of the if block: it is executed both with and without DeepSpeed.

Contributor:
As I understand from the PR title, this PR is meant to add a debug mode, so why touch other code? Could you submit a separate PR to address the other issues?

if infer_conf.deepspeed:
# DeepSpeed is enabled, start worker group
# Prepare placement group
@@ -105,13 +115,6 @@ def __init__(self, infer_conf: InferenceConfig):

htcore.hpu_set_env()

# Tweak transformer to optimize performance on Gaudi
from optimum.habana.transformers.modeling_utils import (
adapt_transformers_to_gaudi,
)

adapt_transformers_to_gaudi()

self.device = torch.device("hpu")
model = AutoModelForCausalLM.from_pretrained(
model_desc.model_id_or_path, **model_desc.config.dict()
@@ -219,6 +222,7 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput:

def streaming_generate(self, prompt, streamer, **config):
self._process_config(config)
# Q1: Why is this handled here when using both DeepSpeed and HPU?
if self.infer_conf.deepspeed:
Contributor Author:
Here in hpu_predictor.py this is a bit confusing, since we have another predictor called deepspeed_predictor. The two predictors target HPU and CPU; maybe we should rename deepspeed_predictor to something like a CPU or base predictor.

Contributor:
There is a TODO comment to consolidate these two predictors.

self.deepspeed_workers[0].streaming_generate.remote(prompt, streamer, **config)
for worker in self.deepspeed_workers[1:]:
@@ -284,10 +288,6 @@ def load_model_and_tokenizer(self):
self.world_size = int(os.environ["WORLD_SIZE"])
self.local_rank = int(os.environ["LOCAL_RANK"])
self.device = torch.device("hpu")
# optimize transformers for gaudi
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()
Comment on lines -287 to -290
Contributor:
If this function is not executed in every worker, will it work as expected?

Contributor Author:
Same as above; this function is now executed earlier.

self.load_model()
model_desc = self.infer_conf.model_description
self.tokenizer = load_tokenizer(self.model, model_desc.tokenizer_name_or_path)
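The exchange above concerns moving adapt_transformers_to_gaudi() out of the DeepSpeed-only path. A condensed sketch of the resulting control flow, with hypothetical helper names standing in for the real worker-group and single-card setup code:

```python
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi


class HPUPredictorSketch:
    """Condensed illustration of the restructured __init__, not the real class."""

    def __init__(self, infer_conf):
        # Called once, before branching, so both the DeepSpeed worker-group
        # path and the single-card path see the Gaudi-optimized transformers.
        adapt_transformers_to_gaudi()

        if getattr(infer_conf, "deepspeed", False):
            self._start_deepspeed_workers(infer_conf)  # hypothetical helper
        else:
            self._load_single_card_model(infer_conf)  # hypothetical helper

    def _start_deepspeed_workers(self, infer_conf):
        ...

    def _load_single_card_model(self, infer_conf):
        ...
```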
19 changes: 12 additions & 7 deletions llm_on_ray/inference/serve.py
@@ -25,6 +25,9 @@
InferenceConfig,
all_models,
)
from llm_on_ray.inference.logger import get_logger

logger = get_logger(__name__)


def get_deployed_models(args):
@@ -41,14 +44,16 @@ def get_deployed_models(args):
set(all_models_name)
), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}."
model_list = {model: all_models[model] for model in models}
logger.debug(
f"--config_file is not set while --models is set, serving model(s): {model_list}"
)
else:
model_list = all_models
logger.debug(f"--config_file and --models is not set, serving all models: {model_list}")
else:
# config_file has precedence over others
if args.config_file:
print("Reading from config file, " + args.config_file)
with open(args.config_file, "r") as f:
infer_conf = parse_yaml_raw_as(InferenceConfig, f)
print("DEBUG:serve.py: Reading from config file, " + args.config_file)
with open(args.config_file, "r") as f:
infer_conf = parse_yaml_raw_as(InferenceConfig, f)
model_list = {}
model_list[infer_conf.name] = infer_conf

@@ -147,6 +152,8 @@ def main(argv=None):

ray.init(address="auto")
deployments, model_list = get_deployed_models(args)
logger.debug(f"Service is running with deployments: {str(deployments)}")
logger.debug(f"Service is running models: {str(model_list)}")
if args.simple:
# provide simple model endpoint
# models can be served to custom URLs according to configuration files.
@@ -156,8 +163,6 @@
# all models are served under the same URL and then accessed
# through model_id, so it needs to pass in a unified URL.
host = "127.0.0.1" if args.serve_local_only else "0.0.0.0"
print("Service is running with deployments:" + str(deployments))
print("Service is running models:" + str(model_list))
openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests)

msg = "Service is deployed successfully."
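Because the new messages are emitted at DEBUG level, they stay silent under the default configuration. Assuming get_logger() builds on the standard logging module and the loggers are named via __name__ (so they live under the llm_on_ray namespace), one way to surface them is:

```python
import logging

# Lower the threshold for every llm_on_ray.* logger before starting the service.
# Adjust if the project configures logging differently (e.g., via a config file).
logging.getLogger("llm_on_ray").setLevel(logging.DEBUG)
```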