[Inference] Move MODEL_TO_SERVE env to command line parameter #49

Merged · 9 commits · Jan 16, 2024
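In practice, the change replaces environment-variable model selection with command-line flags. A minimal before/after sketch of the serving invocation, using the commands that appear in the workflow changes below (the model name is illustrative):

```bash
# Before this PR: model chosen via environment variable, simple endpoint via --serve_simple
MODEL_TO_SERVE="gpt2" python inference/serve.py --serve_simple

# After this PR: model chosen via --models, simple endpoint via --simple
python inference/serve.py --simple --models gpt2
```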
14 changes: 7 additions & 7 deletions .github/workflows/workflow_inference.yml
@@ -105,9 +105,9 @@ jobs:
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --serve_simple"
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
else
docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/serve.py --serve_simple"
docker exec "${TARGET}" bash -c "python inference/serve.py --simple --models ${{ matrix.model }}"
fi
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
@@ -116,7 +116,7 @@ jobs:
if: ${{ matrix.dtuner_model }}
run: |
TARGET=${{steps.target.outputs.target}}
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml --serve_simple"
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml --simple"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"

@@ -127,7 +127,7 @@ jobs:
echo ${{ matrix.model }} is not supported!
else
docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed --serve_simple"
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed --simple"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
fi
@@ -139,7 +139,7 @@ jobs:
if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
else
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --serve_simple"
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
fi
@@ -150,9 +150,9 @@ jobs:
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml"
else
docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/serve.py"
docker exec "${TARGET}" bash -c "python inference/serve.py --models ${{ matrix.model }}"
fi
docker exec "${TARGET}" bash -c "MODEL_TO_SERVE=\"${{ matrix.model }}\" python examples/inference/api_server_openai/query_http_requests.py"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }}"

- name: Stop Ray
run: |
2 changes: 1 addition & 1 deletion README.md
@@ -89,7 +89,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py
```
Or you can serve a specific model to a simple endpoint according to the `port` and `route_prefix` parameters in the configuration file,
```bash
python inference/serve.py --config_file inference/models/gpt2.yaml --serve_simple
python inference/serve.py --config_file inference/models/gpt2.yaml --simple
```
After deploying the model endpoint, you can access and test it by using the script below:
```bash
15 changes: 14 additions & 1 deletion docs/serve.md
@@ -27,6 +27,19 @@ device: HPU
LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP and [BigDL-LLM](serve_bigdl.md) for INT4/FP4/INT8/FP8 to reduce latency. You can follow the corresponding documents to enable them.

## Serving
We support three ways to specify the models to be served; they are applied with the following priority.
1. Use the inference configuration file if `--config_file` is set.
```bash
python inference/serve.py --config_file inference/models/gpt2.yaml
```
2. Use the relevant configuration parameters if `--model_id_or_path` is set.
```bash
python inference/serve.py --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix ...]
```
3. If neither `--config_file` nor `--model_id_or_path` is set, serve all pre-defined models in `inference/models/*.yaml`, or a subset of them if `--models` is set.
```bash
python inference/serve.py --models gpt2 gpt-j-6b
```
### OpenAI-compatible API
To deploy your model, execute the following command with the model's configuration file. This will create an OpenAI-compatible API ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)) for serving.
```bash
@@ -57,7 +70,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py
### Serving Model to a Simple Endpoint
This will create a simple endpoint for serving according to the `port` and `route_prefix` parameters in the conf file, for example: http://127.0.0.1:8000/gpt2.
```bash
python inference/serve.py --config_file <path to the conf file> --serve_simple
python inference/serve.py --config_file <path to the conf file> --simple
```
After deploying the model endpoint, you can access and test it by using the script below:
```bash
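Taken together with the workflow changes in this PR, a typical end-to-end flow for the OpenAI-compatible path now looks roughly like this (the model name is illustrative; both commands mirror the updated CI workflow):

```bash
# Serve one of the pre-defined models through the OpenAI-compatible API
python inference/serve.py --models gpt2

# Query it with the example client; the model is now passed as a flag instead of MODEL_TO_SERVE
python examples/inference/api_server_openai/query_http_requests.py --model_name gpt2
```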
73 changes: 47 additions & 26 deletions examples/inference/api_server_openai/query_http_requests.py
@@ -14,49 +14,70 @@
# limitations under the License.
#

import os
import json
import requests
import argparse

s = requests.Session()
parser = argparse.ArgumentParser(
description="Example script to query with http requests", add_help=True
)
parser.add_argument(
"--request_api_base",
default="http://localhost:8000/v1",
type=str,
help="Deployed model endpoint url",
)
parser.add_argument("--model_name", default="gpt2", type=str, help="The name of model to request")
parser.add_argument(
"--streaming_response",
default=False,
action="store_true",
help="Whether to enable streaming response",
)
parser.add_argument(
"--max_new_tokens", default=None, help="The maximum numbers of tokens to generate"
)
parser.add_argument(
"--temperature", default=None, help="The value used to modulate the next token probabilities"
)
parser.add_argument(
"--top_p",
default=None,
help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation",
)

args = parser.parse_args()

api_base = os.getenv("ENDPOINT_URL")
if api_base is None:
api_base = "http://localhost:8000/v1"
url = f"{api_base}/chat/completions"
s = requests.Session()
url = f"{args.request_api_base}/chat/completions"

model_name = os.getenv("MODEL_TO_SERVE", "gpt2")
body = {
"model": model_name,
"model": args.model_name,
"messages": [
{"role": "assistant", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a long story with many words."},
],
"temperature": 0.7,
"stream": True,
"stream": args.streaming_response,
"max_tokens": args.max_new_tokens,
"temperature": args.temperature,
"top_p": args.top_p,
}

proxies = {"http": None, "https": None}
response = s.post(url, json=body, proxies=proxies) # type: ignore
for chunk in response.iter_lines(decode_unicode=True):
if chunk is not None:
try:
if args.streaming_response:
# Get data from response chunk
chunk_data = chunk.split("data: ")[1]

# Get message choices from data
choices = json.loads(chunk_data)["choices"]

# Pick content from first choice
content = choices[0]["delta"]["content"]

if chunk_data != "[DONE]":
# Get message choices from data
choices = json.loads(chunk_data)["choices"]
# Pick content from first choice
content = choices[0]["delta"].get("content", "")
print(content, end="", flush=True)
else:
choices = json.loads(chunk)["choices"]
content = choices[0]["message"].get("content", "")
print(content, end="", flush=True)
except json.decoder.JSONDecodeError:
# Chunk was not formatted as expected
pass
except KeyError:
# No message was contained in the chunk
pass
except Exception:
pass
print("")
35 changes: 30 additions & 5 deletions examples/inference/api_server_openai/query_openai_sdk.py
@@ -15,21 +15,46 @@
#

import openai
import os
import argparse

parser = argparse.ArgumentParser(
description="Example script to query with openai sdk", add_help=True
)
parser.add_argument("--model_name", default="gpt2", type=str, help="The name of model to request")
parser.add_argument(
"--streaming_response",
default=False,
action="store_true",
help="Whether to enable streaming response",
)
parser.add_argument(
"--max_new_tokens", default=None, help="The maximum numbers of tokens to generate"
)
parser.add_argument(
"--temperature", default=None, help="The value used to modulate the next token probabilities"
)
parser.add_argument(
"--top_p",
default=None,
help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation",
)

args = parser.parse_args()

# List all models.
models = openai.Model.list()
print(models)

# Note: not all arguments are currently supported; unsupported ones will be ignored by the backend.
model_name = os.getenv("MODEL_TO_SERVE", "gpt2")
chat_completion = openai.ChatCompletion.create(
model=model_name,
model=args.model_name,
messages=[
{"role": "assistant", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a long story with many words."},
],
temperature=0.7,
stream=False,
stream=args.streaming_response,
max_tokens=args.max_new_tokens,
temperature=args.temperature,
top_p=args.top_p,
)
print(chat_completion)
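An analogous invocation for the SDK-based client, again with an illustrative model name:

```bash
python examples/inference/api_server_openai/query_openai_sdk.py --model_name gpt2
```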
16 changes: 8 additions & 8 deletions inference/api_openai_backend/router_app.py
@@ -108,7 +108,7 @@ async def _completions_wrapper(
logger.error(f"{subresult_dict['error']}")
all_results.pop()
had_error = True
yield "data: " + ModelResponse(**subresult_dict).json() + "\n\n"
yield "data: " + ModelResponse(**subresult_dict).json() + "\n"
# Return early in case of an error
break
choices = [
@@ -131,11 +131,11 @@
model=body.model,
choices=choices,
usage=usage,
).json() + "\n\n"
).json() + "\n"
if had_error:
# Return early in case of an error
break
yield "data: [DONE]\n\n"
yield "data: [DONE]\n"


async def _chat_completions_wrapper(
@@ -160,7 +160,7 @@ async def _chat_completions_wrapper(
model=body.model,
choices=choices,
usage=None,
).json() + "\n\n"
).json() + "\n"

all_results = []
async for results in generator:
@@ -175,7 +175,7 @@
subresult_dict["finish_reason"] = None
all_results.pop()
had_error = True
yield "data: " + ModelResponse(**subresult_dict).json() + "\n\n"
yield "data: " + ModelResponse(**subresult_dict).json() + "\n"
# Return early in case of an error
break
else:
@@ -193,7 +193,7 @@
model=body.model,
choices=choices,
usage=None,
).json() + "\n\n"
).json() + "\n"
if had_error:
# Return early in case of an error
break
@@ -216,8 +216,8 @@ async def _chat_completions_wrapper(
model=body.model,
choices=choices,
usage=usage,
).json() + "\n\n"
yield "data: [DONE]\n\n"
).json() + "\n"
yield "data: [DONE]\n"


class Router:
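To see the effect of the delimiter change on the wire, one option (assuming the router is running locally on the default port with a `gpt2` model deployed) is to stream the chat completions endpoint directly with curl; each streamed chunk is now terminated by a single newline rather than a blank line:

```bash
curl -s -N http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt2", "messages": [{"role": "user", "content": "Hello"}], "stream": true}'
```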
7 changes: 1 addition & 6 deletions inference/inference_config.py
@@ -136,12 +136,7 @@ def _check_workers_per_group(cls, v: int):
m: InferenceConfig = parse_yaml_raw_as(InferenceConfig, f)
_models[m.name] = m

env_model = "MODEL_TO_SERVE"
if env_model in os.environ:
all_models[os.environ[env_model]] = _models[os.environ[env_model]]
else:
# all_models["gpt-j-6B-finetuned-52K"] = gpt_j_finetuned_52K
all_models = _models.copy()
all_models = _models.copy()

_gpt2_key = "gpt2"
_gpt_j_6b = "gpt-j-6b"
46 changes: 35 additions & 11 deletions inference/serve.py
@@ -25,21 +25,36 @@


def get_deployed_models(args):
# serve all pre-defined models, or model from MODEL_TO_SERVE env, if no model argument specified
if args.model is None and args.config_file is None:
model_list = all_models
"""
The priority of how to choose models to deploy based on passed parameters:
1. Use inference configuration file if config_file is set,
2. Use relevant configuration parameters to generate `InferenceConfig` if model_id_or_path is set,
3. Serve all pre-defined models in inference/models/*.yaml, or part of them if models is set.
"""
if args.model_id_or_path is None and args.config_file is None:
if args.models:
models = args.models
all_models_name = list(all_models.keys())
assert set(models).issubset(
set(all_models_name)
), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}."
model_list = {model: all_models[model] for model in models}
else:
model_list = all_models
else:
# config_file has precedence over others
if args.config_file:
print("reading from config file, " + args.config_file)
with open(args.config_file, "r") as f:
infer_conf = parse_yaml_raw_as(InferenceConfig, f)
else: # args.model should be set
print("reading from command line, " + args.model)
else: # args.model_id_or_path should be set
print("reading from command line, " + args.model_id_or_path)
model_desc = ModelDescription()
model_desc.model_id_or_path = args.model
model_desc.model_id_or_path = args.model_id_or_path
model_desc.tokenizer_name_or_path = (
args.tokenizer if args.tokenizer is not None else args.model
args.tokenizer_id_or_path
if args.tokenizer_id_or_path is not None
else args.model_id_or_path
)
infer_conf = InferenceConfig(model_description=model_desc)
infer_conf.host = "127.0.0.1" if args.serve_local_only else "0.0.0.0"
@@ -71,8 +86,17 @@ def main(argv=None):
type=str,
help="Inference configuration file in YAML. If specified, all other arguments will be ignored.",
)
parser.add_argument("--model", default=None, type=str, help="Model name or path.")
parser.add_argument("--tokenizer", default=None, type=str, help="Tokenizer name or path.")
parser.add_argument("--model_id_or_path", default=None, type=str, help="Model name or path.")
parser.add_argument(
"--tokenizer_id_or_path", default=None, type=str, help="Tokenizer name or path."
)
parser.add_argument(
"--models",
nargs="*",
default=["gpt2"],
type=str,
help=f"Only used when config_file and model_id_or_path are both None, valid values can be any items in {list(all_models.keys())}.",
)
parser.add_argument("--port", default=8000, type=int, help="The port of deployment address.")
parser.add_argument(
"--route_prefix",
@@ -108,7 +132,7 @@ def main(argv=None):
help="Only support local access to url.",
)
parser.add_argument(
"--serve_simple",
"--simple",
action="store_true",
help="Whether to serve OpenAI-compatible API for all models or serve simple endpoint based on model conf files.",
)
@@ -122,7 +146,7 @@

ray.init(address="auto")
deployments, model_list = get_deployed_models(args)
if args.serve_simple:
if args.simple:
# provide simple model endpoint
# models can be served to custom URLs according to configuration files.
serve_run(deployments, model_list)
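Under the renamed arguments, deploying a model directly from its id or path, without a config file, might look like this (the model name is illustrative; the tokenizer falls back to the model id when `--tokenizer_id_or_path` is omitted):

```bash
python inference/serve.py --model_id_or_path gpt2 --port 8000 --simple
```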