From 7c4a2954df91294941e85d70109a089a62395c49 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 4 Jun 2024 14:30:10 +0000 Subject: [PATCH 01/10] add simpleclass --- .../api_server_simple/query_single.py | 14 +++- .../api_simple_backend/simple_protocol.py | 78 +++++++++++++++++++ 2 files changed, 88 insertions(+), 4 deletions(-) create mode 100644 llm_on_ray/inference/api_simple_backend/simple_protocol.py diff --git a/examples/inference/api_server_simple/query_single.py b/examples/inference/api_server_simple/query_single.py index 62bb4dc45..5aabdd8ea 100644 --- a/examples/inference/api_server_simple/query_single.py +++ b/examples/inference/api_server_simple/query_single.py @@ -17,6 +17,10 @@ import requests import argparse from typing import Dict, Union +from llm_on_ray.inference.api_simple_backend.simple_protocol import ( + SimpleRequest, + SimpleModelResponse, +) parser = argparse.ArgumentParser( description="Example script to query with single request", add_help=True @@ -66,20 +70,22 @@ if args.top_k: config["top_k"] = float(args.top_k) -sample_input = {"text": prompt, "config": config, "stream": args.streaming_response} +sample_input = SimpleRequest(text=prompt, config=config, stream=args.streaming_response) proxies = {"http": None, "https": None} outputs = requests.post( args.model_endpoint, proxies=proxies, # type: ignore - json=sample_input, + json=sample_input.dict(), stream=args.streaming_response, ) outputs.raise_for_status() + +simple_response = SimpleModelResponse.from_requests_response(outputs) if args.streaming_response: - for output in outputs.iter_content(chunk_size=None, decode_unicode=True): + for output in simple_response.iter_content(chunk_size=1, decode_unicode=True): print(output, end="", flush=True) print() else: - print(outputs.text, flush=True) + print(simple_response.text, flush=True) diff --git a/llm_on_ray/inference/api_simple_backend/simple_protocol.py b/llm_on_ray/inference/api_simple_backend/simple_protocol.py new file mode 100644 index 000000000..120f3969f --- /dev/null +++ b/llm_on_ray/inference/api_simple_backend/simple_protocol.py @@ -0,0 +1,78 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# =========================================================================== +# +# This file is adapted from +# https://github.com/ray-project/ray-llm/blob/b3560aa55dadf6978f0de0a6f8f91002a5d2bed1/aviary/common/models.py +# Copyright 2023 Anyscale +# +# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Dict, Optional, Union, Iterator +import requests +from pydantic import BaseModel + + +class SimpleRequest(BaseModel): + text: str + config: Dict[str, Union[int, float]] + stream: Optional[bool] = False + + +class SimpleModelResponse(BaseModel): + headers: Dict[str, str] + text: str + content: bytes + status_code: int + url: str + + class Config: + arbitrary_types_allowed = True + + response: Optional[requests.Response] = None + + @staticmethod + def from_requests_response(response: requests.Response): + return SimpleModelResponse( + headers=dict(response.headers), + text=response.text, + content=response.content, + status_code=response.status_code, + url=response.url, + response=response, + ) + + def iter_content( + self, chunk_size: Optional[int] = 1, decode_unicode: bool = False + ) -> Iterator[Union[bytes, str]]: + if self.response is not None: + return self.response.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode) + else: + return iter([]) From 7c8552118fca80b933c993fc1d088e5a0380a37d Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Fri, 14 Jun 2024 13:59:52 +0000 Subject: [PATCH 02/10] add dynamic_batch --- .../api_server_simple/query_dynamic_batch.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/inference/api_server_simple/query_dynamic_batch.py b/examples/inference/api_server_simple/query_dynamic_batch.py index a9e1b8837..2982a9636 100644 --- a/examples/inference/api_server_simple/query_dynamic_batch.py +++ b/examples/inference/api_server_simple/query_dynamic_batch.py @@ -18,6 +18,10 @@ import aiohttp import argparse from typing import Dict, Union +from llm_on_ray.inference.api_simple_backend.simple_protocol import ( + SimpleRequest, + SimpleModelResponse, +) parser = argparse.ArgumentParser( description="Example script to query with multiple requests", add_help=True @@ -63,9 +67,8 @@ config["top_k"] = float(args.top_k) -async def send_query(session, endpoint, prompt, config): - json_request = {"text": prompt, "config": config} - async with session.post(endpoint, json=json_request) as resp: +async def send_query(session, endpoint, req): + async with session.post(endpoint, json=req.dict()) as resp: return await resp.text() @@ -86,16 +89,15 @@ async def send_query(session, endpoint, prompt, config): configs = [config1] * 5 + [config2] * (len(prompts) - 5) +reqs = [SimpleRequest(text=prompt, config=config) for prompt, config in zip(prompts, configs)] + -async def send_all_query(endpoint, prompts, configs): +async def send_all_query(endpoint, reqs): async with aiohttp.ClientSession() as session: - tasks = [ - send_query(session, endpoint, prompt, config) - for prompt, config in zip(prompts, configs) - ] + tasks = [send_query(session, endpoint, req) for req in reqs] responses = await asyncio.gather(*tasks) print("\n--------------\n".join(responses)) print("\nTotal responses:", len(responses)) -asyncio.run(send_all_query(args.model_endpoint, prompts, configs)) +asyncio.run(send_all_query(args.model_endpoint, reqs)) From 34407cb352a67906cdb772a2ff6e8c53d7084727 Mon Sep 17 00:00:00 2001 
From: yutianchen666 Date: Mon, 17 Jun 2024 16:01:12 +0000 Subject: [PATCH 03/10] add server request --- .../api_simple_backend/simple_protocol.py | 58 +++++++++++-------- llm_on_ray/inference/predictor_deployment.py | 23 ++++---- tests/inference/test_example_simple.py | 4 +- 3 files changed, 48 insertions(+), 37 deletions(-) diff --git a/llm_on_ray/inference/api_simple_backend/simple_protocol.py b/llm_on_ray/inference/api_simple_backend/simple_protocol.py index 120f3969f..297d2edd6 100644 --- a/llm_on_ray/inference/api_simple_backend/simple_protocol.py +++ b/llm_on_ray/inference/api_simple_backend/simple_protocol.py @@ -13,31 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# -# =========================================================================== -# -# This file is adapted from -# https://github.com/ray-project/ray-llm/blob/b3560aa55dadf6978f0de0a6f8f91002a5d2bed1/aviary/common/models.py -# Copyright 2023 Anyscale -# -# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import Dict, Optional, Union, Iterator +from typing import Dict, Optional, Union, Iterator, List import requests -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator class SimpleRequest(BaseModel): @@ -45,6 +23,38 @@ class SimpleRequest(BaseModel): config: Dict[str, Union[int, float]] stream: Optional[bool] = False + @validator("text") + def text_must_not_be_empty(cls, v): + if not v.strip(): + raise ValueError("Empty prompt is not supported.") + return v + + @validator("config", pre=True) + def check_config_type(cls, value): + allowed_keys = ["max_new_tokens", "temperature", "top_p", "top_k"] + config_keys = value.keys() + + if not isinstance(value, dict): + raise ValueError("Config must be a dictionary") + + if not all(isinstance(key, str) for key in value.keys()): + raise ValueError("All keys in config must be strings") + + if not all(isinstance(val, (int, float)) for val in value.values()): + raise ValueError("All values in config must be integers or floats") + + if config_keys not in allowed_keys: + invalid_keys = config_keys - allowed_keys + raise ValueError(f'Invalid config keys: {", ".join(invalid_keys)}') + + return value + + @validator("stream", pre=True) + def check_stream_type(cls, value): + if not isinstance(value, bool) and value is not None: + raise ValueError("Stream must be a boolean or None") + return value + class SimpleModelResponse(BaseModel): headers: Dict[str, str] diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index a1055915d..9233d9d38 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -34,6 +34,10 @@ ErrorResponse, ModelResponse, ) +from llm_on_ray.inference.api_simple_backend.simple_protocol import ( + SimpleRequest, + SimpleModelResponse, 
+) from llm_on_ray.inference.predictor import GenerateInput from llm_on_ray.inference.utils import get_prompt_format, PromptFormat from llm_on_ray.inference.api_openai_backend.tools import OpenAIToolsPrompter, ChatPromptCapture @@ -377,26 +381,23 @@ def preprocess_prompts( else: raise HTTPException(400, "Invalid prompt format.") - async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: + async def __call__( + self, http_request: SimpleRequest + ) -> Union[StreamingResponse, JSONResponse, str]: self.use_openai = False - try: json_request: Dict[str, Any] = await http_request.json() + simple_request = SimpleRequest(**json_request) except ValueError: return JSONResponse( status_code=400, content="Invalid JSON format from http request.", ) - streaming_response = json_request["stream"] if "stream" in json_request else False - input = json_request["text"] if "text" in json_request else "" - if input == "": - return JSONResponse( - status_code=400, - content="Empty prompt is not supported.", - ) - config = json_request["config"] if "config" in json_request else {} - # return prompt or list of prompts preprocessed + input = simple_request.text + config = simple_request.config + streaming_response = simple_request.stream + prompts = self.preprocess_prompts(input) # Handle streaming response diff --git a/tests/inference/test_example_simple.py b/tests/inference/test_example_simple.py index a0842f481..486bfe4c3 100644 --- a/tests/inference/test_example_simple.py +++ b/tests/inference/test_example_simple.py @@ -86,8 +86,8 @@ def script_with_args( for script_name in ["query_single.py", "query_dynamic_batch.py"] for base_url in ["http://localhost:8000/"] for model_name in ["gpt2"] - for streaming_response in [None] - for max_new_tokens in [None] + for streaming_response in [None, True, "error"] + for max_new_tokens in [None, 128, "error"] for temperature in [None] for top_p in [None] for top_k in [None] From 7545708319d20c910adbd29bb211109df31c01f5 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Mon, 17 Jun 2024 16:29:23 +0000 Subject: [PATCH 04/10] afix numpy --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6a3c44685..001926257 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ dependencies = [ "accelerate", "datasets>=2.14.6", - "numpy", + "numpy==1.26.4", "ray>=2.10", "ray[serve,tune]>=2.10", "typing>=3.7.4.3", From aee34299ffd41579334dd1d01fc1324582eac0e3 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Mon, 17 Jun 2024 16:55:47 +0000 Subject: [PATCH 05/10] update to 2.3 --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 001926257..7867aff24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ dependencies = [ "accelerate", "datasets>=2.14.6", - "numpy==1.26.4", + "numpy", "ray>=2.10", "ray[serve,tune]>=2.10", "typing>=3.7.4.3", @@ -41,9 +41,9 @@ dependencies = [ [project.optional-dependencies] cpu = [ "transformers>=4.38.0, <=4.38.1", - "intel_extension_for_pytorch==2.2.0", - "torch==2.2.0", - "oneccl_bind_pt==2.2.0" + "intel_extension_for_pytorch==2.3.0", + "torch==2.3.0", + "oneccl_bind_pt==2.3.0" ] gpu = [ From 31fadc06f3b0f8b6a17aeee36fdebfed45a8028e Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 18 Jun 2024 11:04:24 +0000 Subject: [PATCH 06/10] fix np --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) 
diff --git a/pyproject.toml b/pyproject.toml index 7867aff24..5a8e89306 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ dependencies = [ "accelerate", "datasets>=2.14.6", - "numpy", + "numpy<2.0.0", "ray>=2.10", "ray[serve,tune]>=2.10", "typing>=3.7.4.3", @@ -41,9 +41,9 @@ dependencies = [ [project.optional-dependencies] cpu = [ "transformers>=4.38.0, <=4.38.1", - "intel_extension_for_pytorch==2.3.0", - "torch==2.3.0", - "oneccl_bind_pt==2.3.0" + "intel_extension_for_pytorch==2.2.0", + "torch==2.2.0", + "oneccl_bind_pt==2.2.0" ] gpu = [ From 1af240651eb5158a70e86064dee895ab70f31f95 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 18 Jun 2024 15:52:59 +0000 Subject: [PATCH 07/10] fix --- .../inference/api_simple_backend/simple_protocol.py | 11 +++++++---- llm_on_ray/inference/predictor_deployment.py | 13 +++++-------- tests/inference/test_example_simple.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llm_on_ray/inference/api_simple_backend/simple_protocol.py b/llm_on_ray/inference/api_simple_backend/simple_protocol.py index 297d2edd6..b873763ad 100644 --- a/llm_on_ray/inference/api_simple_backend/simple_protocol.py +++ b/llm_on_ray/inference/api_simple_backend/simple_protocol.py @@ -20,7 +20,7 @@ class SimpleRequest(BaseModel): text: str - config: Dict[str, Union[int, float]] + config: Dict[str, Union[int, float]] = {} stream: Optional[bool] = False @validator("text") @@ -32,7 +32,10 @@ def text_must_not_be_empty(cls, v): @validator("config", pre=True) def check_config_type(cls, value): allowed_keys = ["max_new_tokens", "temperature", "top_p", "top_k"] - config_keys = value.keys() + allowed_set = set(allowed_keys) + config_dict = value.keys() + config_keys = [key for key in config_dict] + config_set = set(config_keys) if not isinstance(value, dict): raise ValueError("Config must be a dictionary") @@ -43,8 +46,8 @@ def check_config_type(cls, value): if not all(isinstance(val, (int, float)) for val in value.values()): raise ValueError("All values in config must be integers or floats") - if config_keys not in allowed_keys: - invalid_keys = config_keys - allowed_keys + if not config_set.issubset(allowed_set): + invalid_keys = config_set - allowed_set raise ValueError(f'Invalid config keys: {", ".join(invalid_keys)}') return value diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 9233d9d38..ed67f5119 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -381,22 +381,19 @@ def preprocess_prompts( else: raise HTTPException(400, "Invalid prompt format.") - async def __call__( - self, http_request: SimpleRequest - ) -> Union[StreamingResponse, JSONResponse, str]: + async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: self.use_openai = False try: - json_request: Dict[str, Any] = await http_request.json() - simple_request = SimpleRequest(**json_request) + request: Dict[str, Any] = await http_request.json() except ValueError: return JSONResponse( status_code=400, content="Invalid JSON format from http request.", ) - input = simple_request.text - config = simple_request.config - streaming_response = simple_request.stream + streaming_response = request["stream"] + input = request["text"] + config = request["config"] prompts = self.preprocess_prompts(input) diff --git a/tests/inference/test_example_simple.py b/tests/inference/test_example_simple.py index 486bfe4c3..ec0949fe3 
100644 --- a/tests/inference/test_example_simple.py +++ b/tests/inference/test_example_simple.py @@ -87,7 +87,7 @@ def script_with_args( for base_url in ["http://localhost:8000/"] for model_name in ["gpt2"] for streaming_response in [None, True, "error"] - for max_new_tokens in [None, 128, "error"] + for max_new_tokens in [None] for temperature in [None] for top_p in [None] for top_k in [None] From d8c01195ce552d4bda7f8f716bc8ac0ea6c67e23 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 18 Jun 2024 16:24:45 +0000 Subject: [PATCH 08/10] fix --- examples/inference/api_server_simple/query_single.py | 2 +- tests/inference/test_example_simple.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inference/api_server_simple/query_single.py b/examples/inference/api_server_simple/query_single.py index 5aabdd8ea..f0d618331 100644 --- a/examples/inference/api_server_simple/query_single.py +++ b/examples/inference/api_server_simple/query_single.py @@ -71,7 +71,7 @@ config["top_k"] = float(args.top_k) sample_input = SimpleRequest(text=prompt, config=config, stream=args.streaming_response) - +print(args.streaming_response) proxies = {"http": None, "https": None} outputs = requests.post( args.model_endpoint, diff --git a/tests/inference/test_example_simple.py b/tests/inference/test_example_simple.py index ec0949fe3..a0842f481 100644 --- a/tests/inference/test_example_simple.py +++ b/tests/inference/test_example_simple.py @@ -86,7 +86,7 @@ def script_with_args( for script_name in ["query_single.py", "query_dynamic_batch.py"] for base_url in ["http://localhost:8000/"] for model_name in ["gpt2"] - for streaming_response in [None, True, "error"] + for streaming_response in [None] for max_new_tokens in [None] for temperature in [None] for top_p in [None] From 18b4a257b78bb93da7f8bfde348b529122de1e3d Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Tue, 18 Jun 2024 16:50:26 +0000 Subject: [PATCH 09/10] add pytest --- .../api_server_simple/query_single.py | 2 +- tests/inference/test_simple_protocal.py | 90 +++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 tests/inference/test_simple_protocal.py diff --git a/examples/inference/api_server_simple/query_single.py b/examples/inference/api_server_simple/query_single.py index f0d618331..5aabdd8ea 100644 --- a/examples/inference/api_server_simple/query_single.py +++ b/examples/inference/api_server_simple/query_single.py @@ -71,7 +71,7 @@ config["top_k"] = float(args.top_k) sample_input = SimpleRequest(text=prompt, config=config, stream=args.streaming_response) -print(args.streaming_response) + proxies = {"http": None, "https": None} outputs = requests.post( args.model_endpoint, diff --git a/tests/inference/test_simple_protocal.py b/tests/inference/test_simple_protocal.py new file mode 100644 index 000000000..070c8eb73 --- /dev/null +++ b/tests/inference/test_simple_protocal.py @@ -0,0 +1,90 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import subprocess +import pytest +import os +from basic_set import start_serve +import requests +from llm_on_ray.inference.api_simple_backend.simple_protocol import ( + SimpleRequest, + SimpleModelResponse, +) + + +executed_models = [] + + +# Parametrize the test function with different combinations of parameters +# TODO: more models and combinations will be added and tested. +@pytest.mark.parametrize( + "prompt,streaming_response,max_new_tokens,temperature,top_p, top_k", + [ + ( + prompt, + streaming_response, + max_new_tokens, + temperature, + top_p, + top_k, + ) + for prompt in ["Once upon a time", ""] + for streaming_response in [None, True, "error"] + for max_new_tokens in [None, 128, "error"] + for temperature in [None] + for top_p in [None] + for top_k in [None] + ], +) +def test_script(prompt, streaming_response, max_new_tokens, temperature, top_p, top_k): + global executed_models + + # Check if this modelname has already executed start_serve + if "gpt2" not in executed_models: + start_serve("gpt2", simple=True) + # Mark this modelname has already executed start_serve + executed_models.append("gpt2") + config = {} + if max_new_tokens: + config["max_new_tokens"] = max_new_tokens + if temperature: + config["temperature"] = temperature + if top_p: + config["top_p"] = top_p + if top_k: + config["top_k"] = top_k + + try: + sample_input = SimpleRequest(text=prompt, config=config, stream=streaming_response) + except ValueError as e: + print(e) + return + outputs = requests.post( + "http://localhost:8000/gpt2", + proxies={"http": None, "https": None}, # type: ignore + json=sample_input.dict(), + stream=streaming_response, + ) + + outputs.raise_for_status() + + simple_response = SimpleModelResponse.from_requests_response(outputs) + if streaming_response: + for output in simple_response.iter_content(chunk_size=1, decode_unicode=True): + print(output, end="", flush=True) + print() + else: + print(simple_response.text, flush=True) From cdc3b8c8d7d4f78b52c3187304a8e9694c96a5b9 Mon Sep 17 00:00:00 2001 From: yutianchen666 Date: Fri, 21 Jun 2024 09:19:04 +0000 Subject: [PATCH 10/10] fix name --- .../{test_simple_protocal.py => test_simple_protocol.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/inference/{test_simple_protocal.py => test_simple_protocol.py} (100%) diff --git a/tests/inference/test_simple_protocal.py b/tests/inference/test_simple_protocol.py similarity index 100% rename from tests/inference/test_simple_protocal.py rename to tests/inference/test_simple_protocol.py
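
Usage sketch (illustrative only, not part of any patch above): a minimal client-side example of the SimpleRequest / SimpleModelResponse protocol added in PATCH 01 and tightened by the validators in PATCH 03/07. It mirrors examples/inference/api_server_simple/query_single.py; the endpoint URL assumes a locally served gpt2 deployment, as used in tests/inference/test_simple_protocol.py.

import requests
from pydantic import ValidationError
from llm_on_ray.inference.api_simple_backend.simple_protocol import (
    SimpleRequest,
    SimpleModelResponse,
)

# Build a validated request; an empty prompt or an unknown config key
# fails client-side with pydantic.ValidationError instead of reaching the server.
try:
    req = SimpleRequest(
        text="Once upon a time",
        config={"max_new_tokens": 128, "temperature": 0.7},
        stream=False,
    )
except ValidationError as e:
    raise SystemExit(f"Invalid request: {e}")

# Endpoint assumed: a local Ray Serve deployment of gpt2.
resp = requests.post(
    "http://localhost:8000/gpt2",
    proxies={"http": None, "https": None},
    json=req.dict(),
    stream=req.stream,
)
resp.raise_for_status()

# Wrap the raw requests.Response so callers depend only on the simple protocol.
simple_response = SimpleModelResponse.from_requests_response(resp)
if req.stream:
    for chunk in simple_response.iter_content(chunk_size=1, decode_unicode=True):
        print(chunk, end="", flush=True)
    print()
else:
    print(simple_response.text, flush=True)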