diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml
index e7235a846..2357b1729 100644
--- a/.github/workflows/workflow_inference.yml
+++ b/.github/workflows/workflow_inference.yml
@@ -105,9 +105,9 @@ jobs:
         run: |
           TARGET=${{steps.target.outputs.target}}
           if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
-            docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --serve_simple"
+            docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
           else
-            docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/serve.py --serve_simple"
+            docker exec "${TARGET}" bash -c "python inference/serve.py --simple --models ${{ matrix.model }}"
           fi
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
@@ -116,7 +116,7 @@ jobs:
         if: ${{ matrix.dtuner_model }}
         run: |
           TARGET=${{steps.target.outputs.target}}
-          docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml --serve_simple"
+          docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml --simple"
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"

@@ -127,7 +127,7 @@ jobs:
             echo ${{ matrix.model }} is not supported!
           else
             docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
-            docker exec "${TARGET}" bash -c "python inference/serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed --serve_simple"
+            docker exec "${TARGET}" bash -c "python inference/serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed --simple"
             docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
             docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
           fi
@@ -139,7 +139,7 @@ jobs:
           if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then
             echo ${{ matrix.model }} is not supported!
           else
-            docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --serve_simple"
+            docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple"
             docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
             docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
           fi
@@ -150,9 +150,9 @@ jobs:
           if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
             docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml"
           else
-            docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/serve.py"
+            docker exec "${TARGET}" bash -c "python inference/serve.py --models ${{ matrix.model }}"
           fi
-          docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python examples/inference/api_server_openai/query_http_requests.py"
+          docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }}"

       - name: Stop Ray
         run: |
diff --git a/README.md b/README.md
index 718269fb8..25e7b1225 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py
 ```
 Or you can serve specific model to a simple endpoint according to the `port` and `route_prefix` parameters in configuration file,
 ```bash
-python inference/serve.py --config_file inference/models/gpt2.yaml --serve_simple
+python inference/serve.py --config_file inference/models/gpt2.yaml --simple
 ```
 After deploying the model endpoint, you can access and test it by using the script below:
 ```bash
diff --git a/docs/serve.md b/docs/serve.md
index 6bf655b19..f0f3e3519 100644
--- a/docs/serve.md
+++ b/docs/serve.md
@@ -27,6 +27,19 @@ device: HPU
 LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP and [BigDL-LLM](serve_bigdl.md) for INT4/FP4/INT8/FP8 to reduce latency. You can follow the corresponding documents to enable them.

 ## Serving
+We support three methods of specifying the models to be served, listed below in order of priority.
+1. Use the inference configuration file if --config_file is set.
+```
+python inference/serve.py --config_file inference/models/gpt2.yaml
+```
+2. Use the relevant configuration parameters if --model_id_or_path is set.
+```
+python inference/serve.py --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix ...]
+```
+3. If --config_file and --model_id_or_path are both None, all pre-defined models in inference/models/*.yaml will be served, or a subset of them if --models is set.
+```
+python inference/serve.py --models gpt2 gpt-j-6b
+```
 ### OpenAI-compatible API
 To deploy your model, execute the following command with the model's configuration file. This will create an OpenAI-compatible API ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)) for serving.
 ```bash
@@ -57,7 +70,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py
 ### Serving Model to a Simple Endpoint
 This will create a simple endpoint for serving according to the `port` and `route_prefix` parameters in conf file, for example: http://127.0.0.1:8000/gpt2.
 ```bash
-python inference/serve.py --config_file --serve_simple
+python inference/serve.py --config_file --simple
 ```
 After deploying the model endpoint, you can access and test it by using the script below:
 ```bash
diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py
index 6418a58f3..b9e01ba6c 100644
--- a/examples/inference/api_server_openai/query_http_requests.py
+++ b/examples/inference/api_server_openai/query_http_requests.py
@@ -14,49 +14,70 @@
 # limitations under the License.
 #

-import os
 import json
 import requests
+import argparse

-s = requests.Session()
+parser = argparse.ArgumentParser(
+    description="Example script to query with http requests", add_help=True
+)
+parser.add_argument(
+    "--request_api_base",
+    default="http://localhost:8000/v1",
+    type=str,
+    help="Deployed model endpoint url",
+)
+parser.add_argument("--model_name", default="gpt2", type=str, help="The name of the model to request")
+parser.add_argument(
+    "--streaming_response",
+    default=False,
+    action="store_true",
+    help="Whether to enable streaming response",
+)
+parser.add_argument(
+    "--max_new_tokens", default=None, help="The maximum number of tokens to generate"
+)
+parser.add_argument(
+    "--temperature", default=None, help="The value used to modulate the next token probabilities"
+)
+parser.add_argument(
+    "--top_p",
+    default=None,
+    help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `Top p` or higher are kept for generation",
+)
+
+args = parser.parse_args()

-api_base = os.getenv("ENDPOINT_URL")
-if api_base is None:
-    api_base = "http://localhost:8000/v1"
-url = f"{api_base}/chat/completions"
+s = requests.Session()
+url = f"{args.request_api_base}/chat/completions"

-model_name = os.getenv("MODEL_TO_SERVE", "gpt2")
 body = {
-    "model": model_name,
+    "model": args.model_name,
     "messages": [
         {"role": "assistant", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Tell me a long story with many words."},
     ],
-    "temperature": 0.7,
-    "stream": True,
+    "stream": args.streaming_response,
+    "max_tokens": args.max_new_tokens,
+    "temperature": args.temperature,
+    "top_p": args.top_p,
 }

 proxies = {"http": None, "https": None}
 response = s.post(url, json=body, proxies=proxies)  # type: ignore
 for chunk in response.iter_lines(decode_unicode=True):
     if chunk is not None:
-        try:
+        if args.streaming_response:
             # Get data from reponse chunk
             chunk_data = chunk.split("data: ")[1]
-
-            # Get message choices from data
-            choices = json.loads(chunk_data)["choices"]
-
-            # Pick content from first choice
-            content = choices[0]["delta"]["content"]
-
+            if chunk_data != "[DONE]":
+                # Get message choices from data
+                choices = json.loads(chunk_data)["choices"]
+                # Pick content from first choice
+                content = choices[0]["delta"].get("content", "")
+                print(content, end="", flush=True)
+        else:
+            choices = json.loads(chunk)["choices"]
+            content = choices[0]["message"].get("content", "")
             print(content, end="", flush=True)
-        except json.decoder.JSONDecodeError:
-            # Chunk was not formatted as expected
-            pass
-        except KeyError:
-            # No message was contained in the chunk
-            pass
-        except Exception:
-            pass

 print("")
diff --git a/examples/inference/api_server_openai/query_openai_sdk.py b/examples/inference/api_server_openai/query_openai_sdk.py
index d17e9f0bb..2ffb73438 100644
--- a/examples/inference/api_server_openai/query_openai_sdk.py
+++ b/examples/inference/api_server_openai/query_openai_sdk.py
@@ -15,21 +15,46 @@
 #

 import openai
-import os
+import argparse
+
+parser = argparse.ArgumentParser(
+    description="Example script to query with openai sdk", add_help=True
+)
+parser.add_argument("--model_name", default="gpt2", type=str, help="The name of the model to request")
+parser.add_argument(
+    "--streaming_response",
+    default=False,
+    action="store_true",
+    help="Whether to enable streaming response",
+)
+parser.add_argument(
+    "--max_new_tokens", default=None, help="The maximum number of tokens to generate"
+)
+parser.add_argument(
+    "--temperature", default=None, help="The value used to modulate the next token probabilities"
+)
+parser.add_argument(
+    "--top_p",
+    default=None,
+    help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `Top p` or higher are kept for generation",
+)
+
+args = parser.parse_args()

 # List all models.
 models = openai.Model.list()
 print(models)

 # Note: not all arguments are currently supported and will be ignored by the backend.
-model_name = os.getenv("MODEL_TO_SERVE", "gpt2")
 chat_completion = openai.ChatCompletion.create(
-    model=model_name,
+    model=args.model_name,
     messages=[
         {"role": "assistant", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Tell me a long story with many words."},
     ],
-    temperature=0.7,
-    stream=False,
+    stream=args.streaming_response,
+    max_tokens=args.max_new_tokens,
+    temperature=args.temperature,
+    top_p=args.top_p,
 )
 print(chat_completion)
diff --git a/inference/api_openai_backend/router_app.py b/inference/api_openai_backend/router_app.py
index fc4328610..269a2d422 100644
--- a/inference/api_openai_backend/router_app.py
+++ b/inference/api_openai_backend/router_app.py
@@ -108,7 +108,7 @@ async def _completions_wrapper(
                     logger.error(f"{subresult_dict['error']}")
                     all_results.pop()
                     had_error = True
-                    yield "data: " + ModelResponse(**subresult_dict).json() + "\n\n"
+                    yield "data: " + ModelResponse(**subresult_dict).json() + "\n"
                     # Return early in case of an error
                     break
                 choices = [
@@ -131,11 +131,11 @@ async def _completions_wrapper(
             model=body.model,
             choices=choices,
             usage=usage,
-        ).json() + "\n\n"
+        ).json() + "\n"
         if had_error:
             # Return early in case of an error
             break
-    yield "data: [DONE]\n\n"
+    yield "data: [DONE]\n"


 async def _chat_completions_wrapper(
@@ -160,7 +160,7 @@ async def _chat_completions_wrapper(
             model=body.model,
             choices=choices,
             usage=None,
-        ).json() + "\n\n"
+        ).json() + "\n"

     all_results = []
     async for results in generator:
@@ -175,7 +175,7 @@ async def _chat_completions_wrapper(
                     subresult_dict["finish_reason"] = None
                     all_results.pop()
                     had_error = True
-                    yield "data: " + ModelResponse(**subresult_dict).json() + "\n\n"
+                    yield "data: " + ModelResponse(**subresult_dict).json() + "\n"
                     # Return early in case of an error
                     break
                 else:
@@ -193,7 +193,7 @@ async def _chat_completions_wrapper(
                 model=body.model,
                 choices=choices,
                 usage=None,
-            ).json() + "\n\n"
+            ).json() + "\n"
         if had_error:
             # Return early in case of an error
             break
@@ -216,8 +216,8 @@ async def _chat_completions_wrapper(
         model=body.model,
         choices=choices,
         usage=usage,
-    ).json() + "\n\n"
-    yield "data: [DONE]\n\n"
+    ).json() + "\n"
+    yield "data: [DONE]\n"


 class Router:
diff --git a/inference/inference_config.py b/inference/inference_config.py
index b82d968b3..0e9fd50f7 100644
--- a/inference/inference_config.py
+++ b/inference/inference_config.py
@@ -136,12 +136,7 @@ def _check_workers_per_group(cls, v: int):
         m: InferenceConfig = parse_yaml_raw_as(InferenceConfig, f)
         _models[m.name] = m

-env_model = "MODEL_TO_SERVE"
-if env_model in os.environ:
-    all_models[os.environ[env_model]] = _models[os.environ[env_model]]
-else:
-    # all_models["gpt-j-6B-finetuned-52K"] = gpt_j_finetuned_52K
-    all_models = _models.copy()
+all_models = _models.copy()

 _gpt2_key = "gpt2"
 _gpt_j_6b = "gpt-j-6b"
diff --git a/inference/serve.py b/inference/serve.py
index 3fd59180c..e73397a79 100644
--- a/inference/serve.py
+++ b/inference/serve.py
@@ -25,21 +25,36 @@


 def get_deployed_models(args):
-    # serve all pre-defined models, or model from MODEL_TO_SERVE env, if no model argument specified
-    if args.model is None and args.config_file is None:
-        model_list = all_models
+    """
+    The priority of how to choose models to deploy based on passed parameters:
+    1. Use inference configuration file if config_file is set,
+    2. Use relevant configuration parameters to generate `InferenceConfig` if model_id_or_path is set,
+    3. Serve all pre-defined models in inference/models/*.yaml, or part of them if models is set.
+    """
+    if args.model_id_or_path is None and args.config_file is None:
+        if args.models:
+            models = args.models
+            all_models_name = list(all_models.keys())
+            assert set(models).issubset(
+                set(all_models_name)
+            ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}."
+            model_list = {model: all_models[model] for model in models}
+        else:
+            model_list = all_models
     else:
         # config_file has precedence over others
         if args.config_file:
             print("reading from config file, " + args.config_file)
             with open(args.config_file, "r") as f:
                 infer_conf = parse_yaml_raw_as(InferenceConfig, f)
-        else:  # args.model should be set
-            print("reading from command line, " + args.model)
+        else:  # args.model_id_or_path should be set
+            print("reading from command line, " + args.model_id_or_path)
             model_desc = ModelDescription()
-            model_desc.model_id_or_path = args.model
+            model_desc.model_id_or_path = args.model_id_or_path
             model_desc.tokenizer_name_or_path = (
-                args.tokenizer if args.tokenizer is not None else args.model
+                args.tokenizer_id_or_path
+                if args.tokenizer_id_or_path is not None
+                else args.model_id_or_path
             )
             infer_conf = InferenceConfig(model_description=model_desc)
             infer_conf.host = "127.0.0.1" if args.serve_local_only else "0.0.0.0"
@@ -71,8 +86,17 @@ def main(argv=None):
         type=str,
         help="Inference configuration file in YAML. If specified, all other arguments will be ignored.",
     )
-    parser.add_argument("--model", default=None, type=str, help="Model name or path.")
-    parser.add_argument("--tokenizer", default=None, type=str, help="Tokenizer name or path.")
+    parser.add_argument("--model_id_or_path", default=None, type=str, help="Model name or path.")
+    parser.add_argument(
+        "--tokenizer_id_or_path", default=None, type=str, help="Tokenizer name or path."
+    )
+    parser.add_argument(
+        "--models",
+        nargs="*",
+        default=["gpt2"],
+        type=str,
+        help=f"Only used when config_file and model_id_or_path are both None; valid values are any items in {list(all_models.keys())}.",
+    )
     parser.add_argument("--port", default=8000, type=int, help="The port of deployment address.")
     parser.add_argument(
         "--route_prefix",
@@ -108,7 +132,7 @@ def main(argv=None):
         help="Only support local access to url.",
     )
     parser.add_argument(
-        "--serve_simple",
+        "--simple",
        action="store_true",
         help="Whether to serve OpenAI-compatible API for all models or serve simple endpoint based on model conf files.",
     )
@@ -122,7 +146,7 @@ def main(argv=None):
     ray.init(address="auto")

     deployments, model_list = get_deployed_models(args)
-    if args.serve_simple:
+    if args.simple:
         # provide simple model endpoint
         # models can be served to customed URLs according to configuration files.
         serve_run(deployments, model_list)
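
Usage sketch (not part of the patch above): assuming a Ray cluster is already running and the default port 8000 is used, the renamed flags introduced here would be exercised roughly as follows; the model name gpt2 is only illustrative.

    # OpenAI-compatible API: serve one predefined model, then query it with the argparse-based example script
    python inference/serve.py --models gpt2
    python examples/inference/api_server_openai/query_http_requests.py --model_name gpt2 --streaming_response

    # Simple endpoint: serve from a model config file, then query the per-model route
    python inference/serve.py --config_file inference/models/gpt2.yaml --simple
    python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/gpt2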