From 81b431c613c55fdfa199bb0482b972ffb26fd474 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 10 Jan 2024 17:19:42 +0800 Subject: [PATCH 1/7] move MODEL_TO_SERVE to command line parameter --- .github/workflows/workflow_inference.yml | 6 ++-- .../api_server_openai/query_http_requests.py | 27 ++++++++++------ .../api_server_openai/query_openai_sdk.py | 20 +++++++++--- inference/inference_config.py | 7 +---- inference/serve.py | 31 +++++++++++++------ 5 files changed, 58 insertions(+), 33 deletions(-) diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index 82b0fbc68..9b0ae3a1b 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -107,7 +107,7 @@ jobs: if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --serve_simple" else - docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/serve.py --serve_simple" + docker exec "${TARGET}" bash -c "python inference/serve.py --serve_simple --model_to_serve ${{ matrix.model }}" fi docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" @@ -150,9 +150,9 @@ jobs: if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml" else - docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/serve.py" + docker exec "${TARGET}" bash -c "python inference/serve.py --model_to_serve ${{ matrix.model }}" fi - docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python examples/inference/api_server_openai/query_http_requests.py" + docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_to_serve ${{ matrix.model }}" - name: Stop Ray run: | diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py index d7e57021e..fac76190a 100644 --- a/examples/inference/api_server_openai/query_http_requests.py +++ b/examples/inference/api_server_openai/query_http_requests.py @@ -14,26 +14,33 @@ # limitations under the License. 
# -import os import json import requests +import argparse -s = requests.Session() +parser = argparse.ArgumentParser(description="Example script to query with http requests", add_help=True) +parser.add_argument("--request_api_base", default="http://localhost:8000/v1", type=str, help="Deployed model endpoint url") +parser.add_argument("--request_model", default="gpt2", type=str, help="The name of model to request") +parser.add_argument("--streaming_response", default=False, action="store_true", help="Whether to enable streaming response") +parser.add_argument("--max_new_tokens", default=None, help="The maximum numbers of tokens to generate") +parser.add_argument("--temperature", default=None, help="The value used to modulate the next token probabilities") +parser.add_argument("--top_p", default=None, help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation") + +args = parser.parse_args() -api_base = os.getenv("ENDPOINT_URL") -if api_base is None: - api_base = "http://localhost:8000/v1" -url = f"{api_base}/chat/completions" +s = requests.Session() +url = f"{args.api_base}/chat/completions" -model_name = os.getenv("MODEL_TO_SERVE", "gpt2") body = { - "model": model_name, + "model": args.request_model, "messages": [ {"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a long story with many words."} ], - "temperature": 0.7, - "stream": True, + "stream": args.streaming_response, + "max_tokens": args.max_new_tokens, + "temperature": args.temperature, + "top_p": args.top_p, } proxies = { "http": None, "https": None} diff --git a/examples/inference/api_server_openai/query_openai_sdk.py b/examples/inference/api_server_openai/query_openai_sdk.py index 48e0974a4..f7f7436b5 100644 --- a/examples/inference/api_server_openai/query_openai_sdk.py +++ b/examples/inference/api_server_openai/query_openai_sdk.py @@ -15,21 +15,31 @@ # import openai -import os +import argparse + +parser = argparse.ArgumentParser(description="Example script to query with openai sdk", add_help=True) +parser.add_argument("--request_model", default="gpt2", type=str, help="The name of model to request") +parser.add_argument("--streaming_response", default=False, action="store_true", help="Whether to enable streaming response") +parser.add_argument("--max_new_tokens", default=None, help="The maximum numbers of tokens to generate") +parser.add_argument("--temperature", default=None, help="The value used to modulate the next token probabilities") +parser.add_argument("--top_p", default=None, help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation") + +args = parser.parse_args() # List all models. models = openai.Model.list() print(models) # Note: not all arguments are currently supported and will be ignored by the backend. 
-model_name = os.getenv("MODEL_TO_SERVE", "gpt2") chat_completion = openai.ChatCompletion.create( - model=model_name, + model=args.request_model, messages=[ {"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a long story with many words."} ], - temperature=0.7, - stream=False, + stream=args.streaming_response, + max_tokens=args.max_new_tokens, + temperature=args.temperature, + top_p=args.top_p, ) print(chat_completion) \ No newline at end of file diff --git a/inference/inference_config.py b/inference/inference_config.py index 812d579f4..502091058 100644 --- a/inference/inference_config.py +++ b/inference/inference_config.py @@ -129,12 +129,7 @@ def _check_workers_per_group(cls, v: int): m: InferenceConfig = parse_yaml_raw_as(InferenceConfig, f) _models[m.name] = m -env_model = "MODEL_TO_SERVE" -if env_model in os.environ: - all_models[os.environ[env_model]] = _models[os.environ[env_model]] -else: - # all_models["gpt-j-6B-finetuned-52K"] = gpt_j_finetuned_52K - all_models = _models.copy() +all_models = _models.copy() _gpt2_key = "gpt2" _gpt_j_6b = "gpt-j-6b" diff --git a/inference/serve.py b/inference/serve.py index 9da0b2662..c30cee0ae 100644 --- a/inference/serve.py +++ b/inference/serve.py @@ -24,20 +24,31 @@ from predictor_deployment import PredictorDeployment def get_deployed_models(args): - # serve all pre-defined models, or model from MODEL_TO_SERVE env, if no model argument specified - if args.model is None and args.config_file is None: - model_list = all_models + """ + The priority of how to choose models to deploy based on passed parameters: + 1. Use inference configuration file if config_file is set, + 2. Use relevant configuration parameters to generate `InferenceConfig` if model_id_or_path is set, + 3. Serve all pre-defined models in inference/models/*.yaml, or part of them if model_to_serve is set. + """ + if args.model_id_or_path is None and args.config_file is None: + model_to_serve = args.model_to_serve + if model_to_serve: + all_models_name = list(all_models.keys()) + assert set(model_to_serve).issubset(set(all_models_name)), f"model_to_serve must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {model_to_serve}." + model_list = {model: all_models[model] for model in model_to_serve} + else: + model_list = all_models else: # config_file has precedence over others if args.config_file: print("reading from config file, " + args.config_file) with open(args.config_file, "r") as f: infer_conf = parse_yaml_raw_as(InferenceConfig, f) - else: # args.model should be set - print("reading from command line, " + args.model) + else: # args.model_id_or_path should be set + print("reading from command line, " + args.model_id_or_path) model_desc = ModelDescription() - model_desc.model_id_or_path = args.model - model_desc.tokenizer_name_or_path = args.tokenizer if args.tokenizer is not None else args.model + model_desc.model_id_or_path = args.model_id_or_path + model_desc.tokenizer_name_or_path = args.tokenizer_id_or_path if args.tokenizer_id_or_path is not None else args.model_id_or_path infer_conf = InferenceConfig(model_description=model_desc) infer_conf.host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" infer_conf.port = args.port @@ -60,8 +71,9 @@ def main(argv=None): import argparse parser = argparse.ArgumentParser(description="Model Serve Script", add_help=False) parser.add_argument("--config_file", type=str, help="inference configuration file in YAML. 
If specified, all other arguments are ignored") - parser.add_argument("--model", default=None, type=str, help="model name or path") - parser.add_argument("--tokenizer", default=None, type=str, help="tokenizer name or path") + parser.add_argument("--model_id_or_path", default=None, type=str, help="model name or path") + parser.add_argument("--tokenizer_id_or_path", default=None, type=str, help="tokenizer name or path") + parser.add_argument("--model_to_serve", nargs='*', default=["gpt2"], type=str, help="Only used when config_file and model_id_or_path are both None, it needs to be a subset of the values of key 'name' in inference/models/*.yaml") parser.add_argument("--port", default=8000, type=int, help="the port of deployment address") parser.add_argument("--route_prefix", default=None, type=str, help="the route prefix for HTTP requests.") parser.add_argument("--cpus_per_worker", default="24", type=int, help="cpus per worker") @@ -76,6 +88,7 @@ def main(argv=None): parser.add_argument("--keep_serve_terminal", action="store_true", help="whether to keep serve terminal.") args = parser.parse_args(argv) + print("type: ", type(args.model_to_serve), " content: ", args.model_to_serve) ray.init(address="auto") deployments, model_list = get_deployed_models(args) From 27c97de7703d68b01fa5ed9371c40ef24968f5ff Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 10 Jan 2024 17:26:35 +0800 Subject: [PATCH 2/7] update --- inference/serve.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inference/serve.py b/inference/serve.py index c30cee0ae..e0172f4f4 100644 --- a/inference/serve.py +++ b/inference/serve.py @@ -31,8 +31,8 @@ def get_deployed_models(args): 3. Serve all pre-defined models in inference/models/*.yaml, or part of them if model_to_serve is set. """ if args.model_id_or_path is None and args.config_file is None: - model_to_serve = args.model_to_serve - if model_to_serve: + if args.model_to_serve: + model_to_serve = args.model_to_serve all_models_name = list(all_models.keys()) assert set(model_to_serve).issubset(set(all_models_name)), f"model_to_serve must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {model_to_serve}." 
model_list = {model: all_models[model] for model in model_to_serve} @@ -88,7 +88,6 @@ def main(argv=None): parser.add_argument("--keep_serve_terminal", action="store_true", help="whether to keep serve terminal.") args = parser.parse_args(argv) - print("type: ", type(args.model_to_serve), " content: ", args.model_to_serve) ray.init(address="auto") deployments, model_list = get_deployed_models(args) From 30ab5bcf9ab022f2bc9433d0bb6fb8f4a37dee22 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 10 Jan 2024 22:37:59 +0800 Subject: [PATCH 3/7] fix --- .github/workflows/workflow_inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index 9b0ae3a1b..04623b6da 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -152,7 +152,7 @@ jobs: else docker exec "${TARGET}" bash -c "python inference/serve.py --model_to_serve ${{ matrix.model }}" fi - docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_to_serve ${{ matrix.model }}" + docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --request_model ${{ matrix.model }}" - name: Stop Ray run: | From c0f532c023fff5e5e09f31386d38fd3c4cd826f4 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 10 Jan 2024 23:39:29 +0800 Subject: [PATCH 4/7] fix --- examples/inference/api_server_openai/query_http_requests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py index fac76190a..695bcbf09 100644 --- a/examples/inference/api_server_openai/query_http_requests.py +++ b/examples/inference/api_server_openai/query_http_requests.py @@ -29,7 +29,7 @@ args = parser.parse_args() s = requests.Session() -url = f"{args.api_base}/chat/completions" +url = f"{args.request_api_base}/chat/completions" body = { "model": args.request_model, From e7a64848c9e3309d83f1d4c1225bbed1dc90c91e Mon Sep 17 00:00:00 2001 From: KepingYan Date: Thu, 11 Jan 2024 14:16:02 +0800 Subject: [PATCH 5/7] modify param name, update docs --- .github/workflows/workflow_inference.yml | 14 +++++++------- README.md | 2 +- docs/serve.md | 15 ++++++++++++++- .../api_server_openai/query_http_requests.py | 4 ++-- .../api_server_openai/query_openai_sdk.py | 4 ++-- inference/serve.py | 18 +++++++++--------- 6 files changed, 35 insertions(+), 22 deletions(-) diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index 04623b6da..d7faa5700 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -105,9 +105,9 @@ jobs: run: | TARGET=${{steps.target.outputs.target}} if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --serve_simple" + docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple" else - docker exec "${TARGET}" bash -c "python inference/serve.py --serve_simple --model_to_serve ${{ matrix.model }}" + docker exec "${TARGET}" bash -c "python inference/serve.py --simple --models ${{ matrix.model }}" fi docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ 
matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" @@ -116,7 +116,7 @@ jobs: if: ${{ matrix.dtuner_model }} run: | TARGET=${{steps.target.outputs.target}} - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml --serve_simple" + docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml --simple" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" @@ -127,7 +127,7 @@ jobs: echo ${{ matrix.model }} is not supported! else docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed" - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed --serve_simple" + docker exec "${TARGET}" bash -c "python inference/serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed --simple" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" fi @@ -139,7 +139,7 @@ jobs: if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then echo ${{ matrix.model }} is not supported! 
else - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --serve_simple" + docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" fi @@ -150,9 +150,9 @@ jobs: if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml" else - docker exec "${TARGET}" bash -c "python inference/serve.py --model_to_serve ${{ matrix.model }}" + docker exec "${TARGET}" bash -c "python inference/serve.py --models ${{ matrix.model }}" fi - docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --request_model ${{ matrix.model }}" + docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }}" - name: Stop Ray run: | diff --git a/README.md b/README.md index 718269fb8..25e7b1225 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py ``` Or you can serve specific model to a simple endpoint according to the `port` and `route_prefix` parameters in configuration file, ```bash -python inference/serve.py --config_file inference/models/gpt2.yaml --serve_simple +python inference/serve.py --config_file inference/models/gpt2.yaml --simple ``` After deploying the model endpoint, you can access and test it by using the script below: ```bash diff --git a/docs/serve.md b/docs/serve.md index 6bf655b19..56f10619c 100644 --- a/docs/serve.md +++ b/docs/serve.md @@ -27,6 +27,19 @@ device: HPU LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP and [BigDL-LLM](serve_bigdl.md) for INT4/FP4/INT8/FP8 to reduce latency. You can follow the corresponding documents to enable them. ## Serving +We support three methods to specify the models to be served, and they have the following priorities. +1. Use inference configuration file if config_file is set. +``` +python inference/serve.py --config_file inference/models/gpt2.yaml +``` +2. Use relevant configuration parameters if model_id_or_path is set. +``` +python inference/serve.py --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix...] +``` +3. If --config_file and --model_id_or_path are both None, it will serve all pre-defined models in inference/models/*.yaml, or part of them if models is set. +``` +python inference/serve.py --models gpt2 gpt-j-6b +``` ### OpenAI-compatible API To deploy your model, execute the following command with the model's configuration file. This will create an OpenAI-compatible API ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)) for serving. ```bash @@ -57,7 +70,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py ### Serving Model to a Simple Endpoint This will create a simple endpoint for serving according to the `port` and `route_prefix` parameters in conf file, for example: http://127.0.0.1:8000/gpt2. 
```bash -python inference/serve.py --config_file --serve_simple +python inference/serve.py --config_file --simple ``` After deploying the model endpoint, you can access and test it by using the script below: ```bash diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py index 695bcbf09..fa1302d6d 100644 --- a/examples/inference/api_server_openai/query_http_requests.py +++ b/examples/inference/api_server_openai/query_http_requests.py @@ -20,7 +20,7 @@ parser = argparse.ArgumentParser(description="Example script to query with http requests", add_help=True) parser.add_argument("--request_api_base", default="http://localhost:8000/v1", type=str, help="Deployed model endpoint url") -parser.add_argument("--request_model", default="gpt2", type=str, help="The name of model to request") +parser.add_argument("--model_name", default="gpt2", type=str, help="The name of model to request") parser.add_argument("--streaming_response", default=False, action="store_true", help="Whether to enable streaming response") parser.add_argument("--max_new_tokens", default=None, help="The maximum numbers of tokens to generate") parser.add_argument("--temperature", default=None, help="The value used to modulate the next token probabilities") @@ -32,7 +32,7 @@ url = f"{args.request_api_base}/chat/completions" body = { - "model": args.request_model, + "model": args.model_name, "messages": [ {"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a long story with many words."} diff --git a/examples/inference/api_server_openai/query_openai_sdk.py b/examples/inference/api_server_openai/query_openai_sdk.py index f7f7436b5..fe25f30da 100644 --- a/examples/inference/api_server_openai/query_openai_sdk.py +++ b/examples/inference/api_server_openai/query_openai_sdk.py @@ -18,7 +18,7 @@ import argparse parser = argparse.ArgumentParser(description="Example script to query with openai sdk", add_help=True) -parser.add_argument("--request_model", default="gpt2", type=str, help="The name of model to request") +parser.add_argument("--model_name", default="gpt2", type=str, help="The name of model to request") parser.add_argument("--streaming_response", default=False, action="store_true", help="Whether to enable streaming response") parser.add_argument("--max_new_tokens", default=None, help="The maximum numbers of tokens to generate") parser.add_argument("--temperature", default=None, help="The value used to modulate the next token probabilities") @@ -32,7 +32,7 @@ # Note: not all arguments are currently supported and will be ignored by the backend. chat_completion = openai.ChatCompletion.create( - model=args.request_model, + model=args.model_name, messages=[ {"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a long story with many words."} diff --git a/inference/serve.py b/inference/serve.py index e0172f4f4..a84ffdf16 100644 --- a/inference/serve.py +++ b/inference/serve.py @@ -28,14 +28,14 @@ def get_deployed_models(args): The priority of how to choose models to deploy based on passed parameters: 1. Use inference configuration file if config_file is set, 2. Use relevant configuration parameters to generate `InferenceConfig` if model_id_or_path is set, - 3. Serve all pre-defined models in inference/models/*.yaml, or part of them if model_to_serve is set. + 3. Serve all pre-defined models in inference/models/*.yaml, or part of them if models is set. 
""" if args.model_id_or_path is None and args.config_file is None: - if args.model_to_serve: - model_to_serve = args.model_to_serve + if args.models: + models = args.models all_models_name = list(all_models.keys()) - assert set(model_to_serve).issubset(set(all_models_name)), f"model_to_serve must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {model_to_serve}." - model_list = {model: all_models[model] for model in model_to_serve} + assert set(models).issubset(set(all_models_name)), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." + model_list = {model: all_models[model] for model in models} else: model_list = all_models else: @@ -69,11 +69,11 @@ def get_deployed_models(args): def main(argv=None): # args import argparse - parser = argparse.ArgumentParser(description="Model Serve Script", add_help=False) + parser = argparse.ArgumentParser(description="Model Serve Script", add_help=True) parser.add_argument("--config_file", type=str, help="inference configuration file in YAML. If specified, all other arguments are ignored") parser.add_argument("--model_id_or_path", default=None, type=str, help="model name or path") parser.add_argument("--tokenizer_id_or_path", default=None, type=str, help="tokenizer name or path") - parser.add_argument("--model_to_serve", nargs='*', default=["gpt2"], type=str, help="Only used when config_file and model_id_or_path are both None, it needs to be a subset of the values of key 'name' in inference/models/*.yaml") + parser.add_argument("--models", nargs='*', default=["gpt2"], type=str, help=f"Only used when config_file and model_id_or_path are both None, valid values can be any items in {list(all_models.keys())}") parser.add_argument("--port", default=8000, type=int, help="the port of deployment address") parser.add_argument("--route_prefix", default=None, type=str, help="the route prefix for HTTP requests.") parser.add_argument("--cpus_per_worker", default="24", type=int, help="cpus per worker") @@ -84,14 +84,14 @@ def main(argv=None): parser.add_argument("--ipex", action='store_true', help="enable ipex optimization") parser.add_argument("--device", default="cpu", type=str, help="cpu, xpu, hpu or cuda") parser.add_argument("--serve_local_only", action="store_true", help="only support local access to url") - parser.add_argument("--serve_simple", action="store_true", help="whether to serve OpenAI-compatible API for all models or serve simple endpoint based on model conf files.") + parser.add_argument("--simple", action="store_true", help="whether to serve OpenAI-compatible API for all models or serve simple endpoint based on model conf files.") parser.add_argument("--keep_serve_terminal", action="store_true", help="whether to keep serve terminal.") args = parser.parse_args(argv) ray.init(address="auto") deployments, model_list = get_deployed_models(args) - if args.serve_simple: + if args.simple: # provide simple model endpoint # models can be served to customed URLs according to configuration files. serve_run(deployments, model_list) From 6cedffae3524aa658ab668ec7a6ac04a5e4b5375 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Mon, 15 Jan 2024 14:37:40 +0800 Subject: [PATCH 6/7] add blank --- docs/serve.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/serve.md b/docs/serve.md index 56f10619c..f0f3e3519 100644 --- a/docs/serve.md +++ b/docs/serve.md @@ -34,7 +34,7 @@ python inference/serve.py --config_file inference/models/gpt2.yaml ``` 2. 
Use relevant configuration parameters if model_id_or_path is set. ``` -python inference/serve.py --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix...] +python inference/serve.py --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix ...] ``` 3. If --config_file and --model_id_or_path are both None, it will serve all pre-defined models in inference/models/*.yaml, or part of them if models is set. ``` From 29d5a5222678cc2fc99b6a19b80d9892c0cee8c2 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Mon, 15 Jan 2024 17:13:15 +0800 Subject: [PATCH 7/7] report error --- .../api_server_openai/query_http_requests.py | 26 +++++++------------ inference/api_openai_backend/router_app.py | 16 ++++++------ 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py index e1b1a4ac8..b9e01ba6c 100644 --- a/examples/inference/api_server_openai/query_http_requests.py +++ b/examples/inference/api_server_openai/query_http_requests.py @@ -67,23 +67,17 @@ response = s.post(url, json=body, proxies=proxies) # type: ignore for chunk in response.iter_lines(decode_unicode=True): if chunk is not None: - try: + if args.streaming_response: # Get data from reponse chunk chunk_data = chunk.split("data: ")[1] - - # Get message choices from data - choices = json.loads(chunk_data)["choices"] - - # Pick content from first choice - content = choices[0]["delta"]["content"] - + if chunk_data != "[DONE]": + # Get message choices from data + choices = json.loads(chunk_data)["choices"] + # Pick content from first choice + content = choices[0]["delta"].get("content", "") + print(content, end="", flush=True) + else: + choices = json.loads(chunk)["choices"] + content = choices[0]["message"].get("content", "") print(content, end="", flush=True) - except json.decoder.JSONDecodeError: - # Chunk was not formatted as expected - pass - except KeyError: - # No message was contained in the chunk - pass - except Exception: - pass print("") diff --git a/inference/api_openai_backend/router_app.py b/inference/api_openai_backend/router_app.py index fc4328610..269a2d422 100644 --- a/inference/api_openai_backend/router_app.py +++ b/inference/api_openai_backend/router_app.py @@ -108,7 +108,7 @@ async def _completions_wrapper( logger.error(f"{subresult_dict['error']}") all_results.pop() had_error = True - yield "data: " + ModelResponse(**subresult_dict).json() + "\n\n" + yield "data: " + ModelResponse(**subresult_dict).json() + "\n" # Return early in case of an error break choices = [ @@ -131,11 +131,11 @@ async def _completions_wrapper( model=body.model, choices=choices, usage=usage, - ).json() + "\n\n" + ).json() + "\n" if had_error: # Return early in case of an error break - yield "data: [DONE]\n\n" + yield "data: [DONE]\n" async def _chat_completions_wrapper( @@ -160,7 +160,7 @@ async def _chat_completions_wrapper( model=body.model, choices=choices, usage=None, - ).json() + "\n\n" + ).json() + "\n" all_results = [] async for results in generator: @@ -175,7 +175,7 @@ async def _chat_completions_wrapper( subresult_dict["finish_reason"] = None all_results.pop() had_error = True - yield "data: " + ModelResponse(**subresult_dict).json() + "\n\n" + yield "data: " + ModelResponse(**subresult_dict).json() + "\n" # Return early in case of an error break else: @@ -193,7 +193,7 @@ async def _chat_completions_wrapper( model=body.model, choices=choices, usage=None, - ).json() + "\n\n" + 
).json() + "\n" if had_error: # Return early in case of an error break @@ -216,8 +216,8 @@ async def _chat_completions_wrapper( model=body.model, choices=choices, usage=usage, - ).json() + "\n\n" - yield "data: [DONE]\n\n" + ).json() + "\n" + yield "data: [DONE]\n" class Router: