[Inference] Add debug mode #257

Open · wants to merge 6 commits into base: main
12 changes: 8 additions & 4 deletions llm_on_ray/inference/api_openai_backend/query_client.py
@@ -37,6 +37,9 @@
from fastapi import HTTPException
from llm_on_ray.inference.api_openai_backend.openai_protocol import ModelCard, Prompt
from llm_on_ray.inference.api_openai_backend.request_handler import handle_request
from llm_on_ray.inference.logger import get_logger

logger = get_logger(__name__)


class RouterQueryClient:
@@ -54,10 +57,11 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep
top_p = request_config.get("top_p", 1.0)
max_new_tokens = request_config.get("max_tokens", None)
gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
if temperature != 1.0 or top_p != 1.0:
gen_config.update({"do_sample": True})
if request_config.get("ignore_eos", False):
gen_config.update({"ignore_eos": True})
gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0})
gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)})

logger.debug(f"Print request_config: {request_config}")
# TODO: Set debug mode in request_config and propagate it into gen_config, since gen_config is the config that is passed down.

async for x in handle_request(
model=model,
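The TODO above suggests carrying a debug flag from request_config into gen_config. A minimal sketch of how that could look, written as a standalone helper rather than the inline code in RouterQueryClient.query(); the `debug` key and the `build_gen_config` helper are hypothetical and not part of this diff:

```python
from typing import Any, Dict


def build_gen_config(request_config: Dict[str, Any]) -> Dict[str, Any]:
    """Hypothetical helper mirroring the inline gen_config construction above."""
    temperature = request_config.get("temperature", 1.0)
    top_p = request_config.get("top_p", 1.0)
    gen_config = {
        "max_new_tokens": request_config.get("max_tokens", None),
        "temperature": temperature,
        "top_p": top_p,
        # Sampling is only enabled when the request deviates from the defaults.
        "do_sample": temperature != 1.0 or top_p != 1.0,
        "ignore_eos": request_config.get("ignore_eos", False),
    }
    if request_config.get("debug", False):
        # Hypothetical flag; downstream code would have to strip it before
        # forwarding gen_config to model.generate().
        gen_config["debug"] = True
    return gen_config
```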
9 changes: 8 additions & 1 deletion llm_on_ray/inference/predictor_deployment.py
@@ -394,8 +394,11 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON
streaming_response = request["stream"]
input = request["text"]
config = request["config"]

logger.debug(f"Print config received from json: {config}")
logger.debug(f"Print inputs for prompts: {input}")
# return prompt or list of prompts preprocessed
prompts = self.preprocess_prompts(input)
logger.debug(f"Print prompts from inputs: {prompts}")

# Handle streaming response
if streaming_response:
@@ -414,8 +417,12 @@ async def openai_call(
):
self.use_openai = True

# TODO: Pass down config into preprocess_prompts for more logs.
logger.debug(f"Print config received from query_client: {config}")
logger.debug(f"Print inputs for prompts: {input}")
# return prompt or list of prompts preprocessed
input = self.preprocess_prompts(input, tools, tool_choice)
logger.debug(f"Print prompts from inputs: {input}")

# Handle streaming response
if streaming_response:
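These diffs depend on llm_on_ray.inference.logger.get_logger, whose implementation is not shown in this PR. A minimal sketch of what such a factory typically looks like, assuming it wraps the standard logging module:

```python
import logging
import sys


def get_logger(name: str) -> logging.Logger:
    """Sketch of a module-level logger factory; the project's real helper may differ."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
        )
        logger.addHandler(handler)
    # The level is left to global configuration; DEBUG must be enabled
    # explicitly for the logger.debug() calls added in this PR to show up.
    return logger
```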
22 changes: 11 additions & 11 deletions llm_on_ray/inference/predictors/hpu_predictor.py
@@ -69,6 +69,9 @@
MllmPromptInput,
)
from llm_on_ray.inference.utils import decide_torch_dtype
from llm_on_ray.inference.logger import get_logger

logger = get_logger(__name__)


class HPUPredictor(Predictor):
@@ -79,9 +82,16 @@ def __init__(self, infer_conf: InferenceConfig):
# decide correct torch dtype for loading HF model
decide_torch_dtype(infer_conf)

logger.debug(f"Print inference config: {infer_conf}")

self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile
self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs

# optimize transformers for gaudi
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()

Comment on lines +90 to +94
Contributor:
Why move this function here?

Contributor Author:
Moved this call out of the if block: it is executed both with and without DeepSpeed.

Contributor:
As I understand from the PR title, this PR is meant to add a debug mode, so why touch other code? Could you submit a separate PR to address the other issues?

if infer_conf.deepspeed:
# DeepSpeed is enabled, start worker group
# Prepare placement group
@@ -105,13 +115,6 @@ def __init__(self, infer_conf: InferenceConfig):

htcore.hpu_set_env()

# Tweak transformer to optimize performance on Gaudi
from optimum.habana.transformers.modeling_utils import (
adapt_transformers_to_gaudi,
)

adapt_transformers_to_gaudi()

self.device = torch.device("hpu")
model = AutoModelForCausalLM.from_pretrained(
model_desc.model_id_or_path, **model_desc.config.dict()
@@ -219,6 +222,7 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput:

def streaming_generate(self, prompt, streamer, **config):
self._process_config(config)
# Q1: Why is this handled here when using both DeepSpeed and HPU?
if self.infer_conf.deepspeed:
Contributor Author:
Here in hpu_predictor.py this is a bit confusing, since we have another predictor called deepspeed_predictor. The two predictors target HPU and CPU; maybe we should rename deepspeed_predictor to something like a CPU or base predictor.

Contributor:
There is a TODO comment to consolidate these two predictors.

self.deepspeed_workers[0].streaming_generate.remote(prompt, streamer, **config)
for worker in self.deepspeed_workers[1:]:
@@ -284,10 +288,6 @@ def load_model_and_tokenizer(self):
self.world_size = int(os.environ["WORLD_SIZE"])
self.local_rank = int(os.environ["LOCAL_RANK"])
self.device = torch.device("hpu")
# optimize transformers for gaudi
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()
Comment on lines -287 to -290
Contributor:
If this function is not executed in every worker, will it work as expected?

Contributor Author:
Same as above; this function is now executed earlier.

self.load_model()
model_desc = self.infer_conf.model_description
self.tokenizer = load_tokenizer(self.model, model_desc.tokenizer_name_or_path)
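The exchange above concerns moving adapt_transformers_to_gaudi() out of the DeepSpeed-only path. A condensed sketch of the resulting control flow, with hypothetical helper names standing in for the real worker-group and single-card setup code:

```python
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi


class HPUPredictorSketch:
    """Condensed illustration of the restructured __init__, not the real class."""

    def __init__(self, infer_conf):
        # Called once, before branching, so both the DeepSpeed worker-group
        # path and the single-card path see the Gaudi-optimized transformers.
        adapt_transformers_to_gaudi()

        if getattr(infer_conf, "deepspeed", False):
            self._start_deepspeed_workers(infer_conf)  # hypothetical helper
        else:
            self._load_single_card_model(infer_conf)  # hypothetical helper

    def _start_deepspeed_workers(self, infer_conf):
        ...

    def _load_single_card_model(self, infer_conf):
        ...
```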
19 changes: 12 additions & 7 deletions llm_on_ray/inference/serve.py
@@ -25,6 +25,9 @@
InferenceConfig,
all_models,
)
from llm_on_ray.inference.logger import get_logger

logger = get_logger(__name__)


def get_deployed_models(args):
@@ -41,14 +44,16 @@ def get_deployed_models(args):
set(all_models_name)
), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}."
model_list = {model: all_models[model] for model in models}
logger.debug(
f"--config_file is not set while --models is set, serving model(s): {model_list}"
)
else:
model_list = all_models
logger.debug(f"--config_file and --models is not set, serving all models: {model_list}")
else:
# config_file has precedence over others
if args.config_file:
print("Reading from config file, " + args.config_file)
with open(args.config_file, "r") as f:
infer_conf = parse_yaml_raw_as(InferenceConfig, f)
print("DEBUG:serve.py: Reading from config file, " + args.config_file)
with open(args.config_file, "r") as f:
infer_conf = parse_yaml_raw_as(InferenceConfig, f)
model_list = {}
model_list[infer_conf.name] = infer_conf

@@ -147,6 +152,8 @@ def main(argv=None):

ray.init(address="auto")
deployments, model_list = get_deployed_models(args)
logger.debug(f"Service is running with deployments: {str(deployments)}")
logger.debug(f"Service is running models: {str(model_list)}")
if args.simple:
# provide simple model endpoint
# models can be served to custom URLs according to configuration files.
@@ -156,8 +163,6 @@
# all models are served under the same URL and then accessed
# through model_id, so it needs to pass in a unified URL.
host = "127.0.0.1" if args.serve_local_only else "0.0.0.0"
print("Service is running with deployments:" + str(deployments))
print("Service is running models:" + str(model_list))
openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests)

msg = "Service is deployed successfully."
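Because the new messages are emitted at DEBUG level, they stay silent under the default configuration. Assuming get_logger() builds on the standard logging module and the loggers are named via __name__ (so they live under the llm_on_ray namespace), one way to surface them is:

```python
import logging

# Lower the threshold for every llm_on_ray.* logger before starting the service.
# Adjust if the project configures logging differently (e.g., via a config file).
logging.getLogger("llm_on_ray").setLevel(logging.DEBUG)
```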