From 7c4a2954df91294941e85d70109a089a62395c49 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 4 Jun 2024 14:30:10 +0000 Subject: [PATCH 01/10] add simpleclass --- .../api_server_simple/query_single.py | 14 +++- .../api_simple_backend/simple_protocol.py | 78 +++++++++++++++++++ 2 files changed, 88 insertions(+), 4 deletions(-) create mode 100644 llm_on_ray/inference/api_simple_backend/simple_protocol.py diff --git a/examples/inference/api_server_simple/query_single.py b/examples/inference/api_server_simple/query_single.py index 62bb4dc45..5aabdd8ea 100644 --- a/examples/inference/api_server_simple/query_single.py +++ b/examples/inference/api_server_simple/query_single.py @@ -17,6 +17,10 @@ import requests import argparse from typing import Dict, Union +from llm_on_ray.inference.api_simple_backend.simple_protocol import ( + SimpleRequest, + SimpleModelResponse, +) parser = argparse.ArgumentParser( description="Example script to query with single request", add_help=True @@ -66,20 +70,22 @@ if args.top_k: config["top_k"] = float(args.top_k) -sample_input = {"text": prompt, "config": config, "stream": args.streaming_response} +sample_input = SimpleRequest(text=prompt, config=config, stream=args.streaming_response) proxies = {"http": None, "https": None} outputs = requests.post( args.model_endpoint, proxies=proxies, # type: ignore - json=sample_input, + json=sample_input.dict(), stream=args.streaming_response, ) outputs.raise_for_status() + +simple_response = SimpleModelResponse.from_requests_response(outputs) if args.streaming_response: - for output in outputs.iter_content(chunk_size=None, decode_unicode=True): + for output in simple_response.iter_content(chunk_size=1, decode_unicode=True): print(output, end="", flush=True) print() else: - print(outputs.text, flush=True) + print(simple_response.text, flush=True) diff --git a/llm_on_ray/inference/api_simple_backend/simple_protocol.py b/llm_on_ray/inference/api_simple_backend/simple_protocol.py new file mode 100644 index 000000000..120f3969f --- /dev/null +++ b/llm_on_ray/inference/api_simple_backend/simple_protocol.py @@ -0,0 +1,78 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# =========================================================================== +# +# This file is adapted from +# https://github.com/ray-project/ray-llm/blob/b3560aa55dadf6978f0de0a6f8f91002a5d2bed1/aviary/common/models.py +# Copyright 2023 Anyscale +# +# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Dict, Optional, Union, Iterator +import requests +from pydantic import BaseModel + + +class SimpleRequest(BaseModel): + text: str + config: Dict[str, Union[int, float]] + stream: Optional[bool] = False + + +class SimpleModelResponse(BaseModel): + headers: Dict[str, str] + text: str + content: bytes + status_code: int + url: str + + class Config: + arbitrary_types_allowed = True + + response: Optional[requests.Response] = None + + @staticmethod + def from_requests_response(response: requests.Response): + return SimpleModelResponse( + headers=dict(response.headers), + text=response.text, + content=response.content, + status_code=response.status_code, + url=response.url, + response=response, + ) + + def iter_content( + self, chunk_size: Optional[int] = 1, decode_unicode: bool = False + ) -> Iterator[Union[bytes, str]]: + if self.response is not None: + return self.response.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode) + else: + return iter([]) From 7c8552118fca80b933c993fc1d088e5a0380a37d Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Fri, 14 Jun 2024 13:59:52 +0000 Subject: [PATCH 02/10] add dynamic_batch --- .../api_server_simple/query_dynamic_batch.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/inference/api_server_simple/query_dynamic_batch.py b/examples/inference/api_server_simple/query_dynamic_batch.py index a9e1b8837..2982a9636 100644 --- a/examples/inference/api_server_simple/query_dynamic_batch.py +++ b/examples/inference/api_server_simple/query_dynamic_batch.py @@ -18,6 +18,10 @@ import aiohttp import argparse from typing import Dict, Union +from llm_on_ray.inference.api_simple_backend.simple_protocol import ( + SimpleRequest, + SimpleModelResponse, +) parser = argparse.ArgumentParser( description="Example script to query with multiple requests", add_help=True @@ -63,9 +67,8 @@ config["top_k"] = float(args.top_k) -async def send_query(session, endpoint, prompt, config): - json_request = {"text": prompt, "config": config} - async with session.post(endpoint, json=json_request) as resp: +async def send_query(session, endpoint, req): + async with session.post(endpoint, json=req.dict()) as resp: return await resp.text() @@ -86,16 +89,15 @@ async def send_query(session, endpoint, prompt, config): configs = [config1] * 5 + [config2] * (len(prompts) - 5) +reqs = [SimpleRequest(text=prompt, config=config) for prompt, config in zip(prompts, configs)] + -async def send_all_query(endpoint, prompts, configs): +async def send_all_query(endpoint, reqs): async with aiohttp.ClientSession() as session: - tasks = [ - send_query(session, endpoint, prompt, config) - for prompt, config in zip(prompts, configs) - ] + tasks = [send_query(session, endpoint, req) for req in reqs] responses = await asyncio.gather(*tasks) print("\n--------------\n".join(responses)) print("\nTotal responses:", len(responses)) -asyncio.run(send_all_query(args.model_endpoint, prompts, configs)) +asyncio.run(send_all_query(args.model_endpoint, reqs)) From 34407cb352a67906cdb772a2ff6e8c53d7084727 Mon Sep 17 00:00:00 2001 
From: yutianchen666 Date: Mon, 17 Jun 2024 16:01:12 +0000 Subject: [PATCH 03/10] add server request --- .../api_simple_backend/simple_protocol.py | 58 +++++++++++-------- llm_on_ray/inference/predictor_deployment.py | 23 ++++---- tests/inference/test_example_simple.py | 4 +- 3 files changed, 48 insertions(+), 37 deletions(-) diff --git a/llm_on_ray/inference/api_simple_backend/simple_protocol.py b/llm_on_ray/inference/api_simple_backend/simple_protocol.py index 120f3969f..297d2edd6 100644 --- a/llm_on_ray/inference/api_simple_backend/simple_protocol.py +++ b/llm_on_ray/inference/api_simple_backend/simple_protocol.py @@ -13,31 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# -# =========================================================================== -# -# This file is adapted from -# https://github.com/ray-project/ray-llm/blob/b3560aa55dadf6978f0de0a6f8f91002a5d2bed1/aviary/common/models.py -# Copyright 2023 Anyscale -# -# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import Dict, Optional, Union, Iterator +from typing import Dict, Optional, Union, Iterator, List import requests -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator class SimpleRequest(BaseModel): @@ -45,6 +23,38 @@ class SimpleRequest(BaseModel): config: Dict[str, Union[int, float]] stream: Optional[bool] = False + @validator("text") + def text_must_not_be_empty(cls, v): + if not v.strip(): + raise ValueError("Empty prompt is not supported.") + return v + + @validator("config", pre=True) + def check_config_type(cls, value): + allowed_keys = ["max_new_tokens", "temperature", "top_p", "top_k"] + config_keys = value.keys() + + if not isinstance(value, dict): + raise ValueError("Config must be a dictionary") + + if not all(isinstance(key, str) for key in value.keys()): + raise ValueError("All keys in config must be strings") + + if not all(isinstance(val, (int, float)) for val in value.values()): + raise ValueError("All values in config must be integers or floats") + + if config_keys not in allowed_keys: + invalid_keys = config_keys - allowed_keys + raise ValueError(f'Invalid config keys: {", ".join(invalid_keys)}') + + return value + + @validator("stream", pre=True) + def check_stream_type(cls, value): + if not isinstance(value, bool) and value is not None: + raise ValueError("Stream must be a boolean or None") + return value + class SimpleModelResponse(BaseModel): headers: Dict[str, str] diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index a1055915d..9233d9d38 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -34,6 +34,10 @@ ErrorResponse, ModelResponse, ) +from llm_on_ray.inference.api_simple_backend.simple_protocol import ( + SimpleRequest, + SimpleModelResponse, 
+) from llm_on_ray.inference.predictor import GenerateInput from llm_on_ray.inference.utils import get_prompt_format, PromptFormat from llm_on_ray.inference.api_openai_backend.tools import OpenAIToolsPrompter, ChatPromptCapture @@ -377,26 +381,23 @@ def preprocess_prompts( else: raise HTTPException(400, "Invalid prompt format.") - async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: + async def __call__( + self, http_request: SimpleRequest + ) -> Union[StreamingResponse, JSONResponse, str]: self.use_openai = False - try: json_request: Dict[str, Any] = await http_request.json() + simple_request = SimpleRequest(**json_request) except ValueError: return JSONResponse( status_code=400, content="Invalid JSON format from http request.", ) - streaming_response = json_request["stream"] if "stream" in json_request else False - input = json_request["text"] if "text" in json_request else "" - if input == "": - return JSONResponse( - status_code=400, - content="Empty prompt is not supported.", - ) - config = json_request["config"] if "config" in json_request else {} - # return prompt or list of prompts preprocessed + input = simple_request.text + config = simple_request.config + streaming_response = simple_request.stream + prompts = self.preprocess_prompts(input) # Handle streaming response diff --git a/tests/inference/test_example_simple.py b/tests/inference/test_example_simple.py index a0842f481..486bfe4c3 100644 --- a/tests/inference/test_example_simple.py +++ b/tests/inference/test_example_simple.py @@ -86,8 +86,8 @@ def script_with_args( for script_name in ["query_single.py", "query_dynamic_batch.py"] for base_url in ["http://localhost:8000/"] for model_name in ["gpt2"] - for streaming_response in [None] - for max_new_tokens in [None] + for streaming_response in [None, True, "error"] + for max_new_tokens in [None, 128, "error"] for temperature in [None] for top_p in [None] for top_k in [None] From 7545708319d20c910adbd29bb211109df31c01f5 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Mon, 17 Jun 2024 16:29:23 +0000 Subject: [PATCH 04/10] afix numpy --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6a3c44685..001926257 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ dependencies = [ "accelerate", "datasets>=2.14.6", - "numpy", + "numpy==1.26.4", "ray>=2.10", "ray[serve,tune]>=2.10", "typing>=3.7.4.3", From aee34299ffd41579334dd1d01fc1324582eac0e3 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Mon, 17 Jun 2024 16:55:47 +0000 Subject: [PATCH 05/10] update to 2.3 --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 001926257..7867aff24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ dependencies = [ "accelerate", "datasets>=2.14.6", - "numpy==1.26.4", + "numpy", "ray>=2.10", "ray[serve,tune]>=2.10", "typing>=3.7.4.3", @@ -41,9 +41,9 @@ dependencies = [ [project.optional-dependencies] cpu = [ "transformers>=4.38.0, <=4.38.1", - "intel_extension_for_pytorch==2.2.0", - "torch==2.2.0", - "oneccl_bind_pt==2.2.0" + "intel_extension_for_pytorch==2.3.0", + "torch==2.3.0", + "oneccl_bind_pt==2.3.0" ] gpu = [ From 31fadc06f3b0f8b6a17aeee36fdebfed45a8028e Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 18 Jun 2024 11:04:24 +0000 Subject: [PATCH 06/10] fix np --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) 
diff --git a/pyproject.toml b/pyproject.toml index 7867aff24..5a8e89306 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ dependencies = [ "accelerate", "datasets>=2.14.6", - "numpy", + "numpy<2.0.0", "ray>=2.10", "ray[serve,tune]>=2.10", "typing>=3.7.4.3", @@ -41,9 +41,9 @@ dependencies = [ [project.optional-dependencies] cpu = [ "transformers>=4.38.0, <=4.38.1", - "intel_extension_for_pytorch==2.3.0", - "torch==2.3.0", - "oneccl_bind_pt==2.3.0" + "intel_extension_for_pytorch==2.2.0", + "torch==2.2.0", + "oneccl_bind_pt==2.2.0" ] gpu = [ From 1af240651eb5158a70e86064dee895ab70f31f95 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 18 Jun 2024 15:52:59 +0000 Subject: [PATCH 07/10] fix --- .../inference/api_simple_backend/simple_protocol.py | 11 +++++++---- llm_on_ray/inference/predictor_deployment.py | 13 +++++-------- tests/inference/test_example_simple.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llm_on_ray/inference/api_simple_backend/simple_protocol.py b/llm_on_ray/inference/api_simple_backend/simple_protocol.py index 297d2edd6..b873763ad 100644 --- a/llm_on_ray/inference/api_simple_backend/simple_protocol.py +++ b/llm_on_ray/inference/api_simple_backend/simple_protocol.py @@ -20,7 +20,7 @@ class SimpleRequest(BaseModel): text: str - config: Dict[str, Union[int, float]] + config: Dict[str, Union[int, float]] = {} stream: Optional[bool] = False @validator("text") @@ -32,7 +32,10 @@ def text_must_not_be_empty(cls, v): @validator("config", pre=True) def check_config_type(cls, value): allowed_keys = ["max_new_tokens", "temperature", "top_p", "top_k"] - config_keys = value.keys() + allowed_set = set(allowed_keys) + config_dict = value.keys() + config_keys = [key for key in config_dict] + config_set = set(config_keys) if not isinstance(value, dict): raise ValueError("Config must be a dictionary") @@ -43,8 +46,8 @@ def check_config_type(cls, value): if not all(isinstance(val, (int, float)) for val in value.values()): raise ValueError("All values in config must be integers or floats") - if config_keys not in allowed_keys: - invalid_keys = config_keys - allowed_keys + if not config_set.issubset(allowed_set): + invalid_keys = config_set - allowed_set raise ValueError(f'Invalid config keys: {", ".join(invalid_keys)}') return value diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 9233d9d38..ed67f5119 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -381,22 +381,19 @@ def preprocess_prompts( else: raise HTTPException(400, "Invalid prompt format.") - async def __call__( - self, http_request: SimpleRequest - ) -> Union[StreamingResponse, JSONResponse, str]: + async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: self.use_openai = False try: - json_request: Dict[str, Any] = await http_request.json() - simple_request = SimpleRequest(**json_request) + request: Dict[str, Any] = await http_request.json() except ValueError: return JSONResponse( status_code=400, content="Invalid JSON format from http request.", ) - input = simple_request.text - config = simple_request.config - streaming_response = simple_request.stream + streaming_response = request["stream"] + input = request["text"] + config = request["config"] prompts = self.preprocess_prompts(input) diff --git a/tests/inference/test_example_simple.py b/tests/inference/test_example_simple.py index 486bfe4c3..ec0949fe3 
100644 --- a/tests/inference/test_example_simple.py +++ b/tests/inference/test_example_simple.py @@ -87,7 +87,7 @@ def script_with_args( for base_url in ["http://localhost:8000/"] for model_name in ["gpt2"] for streaming_response in [None, True, "error"] - for max_new_tokens in [None, 128, "error"] + for max_new_tokens in [None] for temperature in [None] for top_p in [None] for top_k in [None] From d8c01195ce552d4bda7f8f716bc8ac0ea6c67e23 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 18 Jun 2024 16:24:45 +0000 Subject: [PATCH 08/10] fix --- examples/inference/api_server_simple/query_single.py | 2 +- tests/inference/test_example_simple.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inference/api_server_simple/query_single.py b/examples/inference/api_server_simple/query_single.py index 5aabdd8ea..f0d618331 100644 --- a/examples/inference/api_server_simple/query_single.py +++ b/examples/inference/api_server_simple/query_single.py @@ -71,7 +71,7 @@ config["top_k"] = float(args.top_k) sample_input = SimpleRequest(text=prompt, config=config, stream=args.streaming_response) - +print(args.streaming_response) proxies = {"http": None, "https": None} outputs = requests.post( args.model_endpoint, diff --git a/tests/inference/test_example_simple.py b/tests/inference/test_example_simple.py index ec0949fe3..a0842f481 100644 --- a/tests/inference/test_example_simple.py +++ b/tests/inference/test_example_simple.py @@ -86,7 +86,7 @@ def script_with_args( for script_name in ["query_single.py", "query_dynamic_batch.py"] for base_url in ["http://localhost:8000/"] for model_name in ["gpt2"] - for streaming_response in [None, True, "error"] + for streaming_response in [None] for max_new_tokens in [None] for temperature in [None] for top_p in [None] From 18b4a257b78bb93da7f8bfde348b529122de1e3d Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 18 Jun 2024 16:50:26 +0000 Subject: [PATCH 09/10] add pytest --- .../api_server_simple/query_single.py | 2 +- tests/inference/test_simple_protocal.py | 90 +++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 tests/inference/test_simple_protocal.py diff --git a/examples/inference/api_server_simple/query_single.py b/examples/inference/api_server_simple/query_single.py index f0d618331..5aabdd8ea 100644 --- a/examples/inference/api_server_simple/query_single.py +++ b/examples/inference/api_server_simple/query_single.py @@ -71,7 +71,7 @@ config["top_k"] = float(args.top_k) sample_input = SimpleRequest(text=prompt, config=config, stream=args.streaming_response) -print(args.streaming_response) + proxies = {"http": None, "https": None} outputs = requests.post( args.model_endpoint, diff --git a/tests/inference/test_simple_protocal.py b/tests/inference/test_simple_protocal.py new file mode 100644 index 000000000..070c8eb73 --- /dev/null +++ b/tests/inference/test_simple_protocal.py @@ -0,0 +1,90 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import subprocess +import pytest +import os +from basic_set import start_serve +import requests +from llm_on_ray.inference.api_simple_backend.simple_protocol import ( + SimpleRequest, + SimpleModelResponse, +) + + +executed_models = [] + + +# Parametrize the test function with different combinations of parameters +# TODO: more models and combinations will be added and tested. +@pytest.mark.parametrize( + "prompt,streaming_response,max_new_tokens,temperature,top_p, top_k", + [ + ( + prompt, + streaming_response, + max_new_tokens, + temperature, + top_p, + top_k, + ) + for prompt in ["Once upon a time", ""] + for streaming_response in [None, True, "error"] + for max_new_tokens in [None, 128, "error"] + for temperature in [None] + for top_p in [None] + for top_k in [None] + ], +) +def test_script(prompt, streaming_response, max_new_tokens, temperature, top_p, top_k): + global executed_models + + # Check if this modelname has already executed start_serve + if "gpt2" not in executed_models: + start_serve("gpt2", simple=True) + # Mark this modelname has already executed start_serve + executed_models.append("gpt2") + config = {} + if max_new_tokens: + config["max_new_tokens"] = max_new_tokens + if temperature: + config["temperature"] = temperature + if top_p: + config["top_p"] = top_p + if top_k: + config["top_k"] = top_k + + try: + sample_input = SimpleRequest(text=prompt, config=config, stream=streaming_response) + except ValueError as e: + print(e) + return + outputs = requests.post( + "http://localhost:8000/gpt2", + proxies={"http": None, "https": None}, # type: ignore + json=sample_input.dict(), + stream=streaming_response, + ) + + outputs.raise_for_status() + + simple_response = SimpleModelResponse.from_requests_response(outputs) + if streaming_response: + for output in simple_response.iter_content(chunk_size=1, decode_unicode=True): + print(output, end="", flush=True) + print() + else: + print(simple_response.text, flush=True) From cdc3b8c8d7d4f78b52c3187304a8e9694c96a5b9 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Fri, 21 Jun 2024 09:19:04 +0000 Subject: [PATCH 10/10] fix name --- .../{test_simple_protocal.py => test_simple_protocol.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/inference/{test_simple_protocal.py => test_simple_protocol.py} (100%) diff --git a/tests/inference/test_simple_protocal.py b/tests/inference/test_simple_protocol.py similarity index 100% rename from tests/inference/test_simple_protocal.py rename to tests/inference/test_simple_protocol.py
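
Usage sketch (illustrative only, not part of any patch above): a minimal client-side example of the SimpleRequest / SimpleModelResponse protocol added in PATCH 01 and tightened by the validators in PATCH 03/07. It mirrors examples/inference/api_server_simple/query_single.py; the endpoint URL assumes a locally served gpt2 deployment, as used in tests/inference/test_simple_protocol.py.

import requests
from pydantic import ValidationError
from llm_on_ray.inference.api_simple_backend.simple_protocol import (
    SimpleRequest,
    SimpleModelResponse,
)

# Build a validated request; an empty prompt or an unknown config key
# fails client-side with pydantic.ValidationError instead of reaching the server.
try:
    req = SimpleRequest(
        text="Once upon a time",
        config={"max_new_tokens": 128, "temperature": 0.7},
        stream=False,
    )
except ValidationError as e:
    raise SystemExit(f"Invalid request: {e}")

# Endpoint assumed: a local Ray Serve deployment of gpt2.
resp = requests.post(
    "http://localhost:8000/gpt2",
    proxies={"http": None, "https": None},
    json=req.dict(),
    stream=req.stream,
)
resp.raise_for_status()

# Wrap the raw requests.Response so callers depend only on the simple protocol.
simple_response = SimpleModelResponse.from_requests_response(resp)
if req.stream:
    for chunk in simple_response.iter_content(chunk_size=1, decode_unicode=True):
        print(chunk, end="", flush=True)
    print()
else:
    print(simple_response.text, flush=True)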