From ede84a513770adedaa366a50717cacd3e89f513b Mon Sep 17 00:00:00 2001 From: joostinyi <63941848+joostinyi@users.noreply.github.com> Date: Fri, 10 Nov 2023 13:24:49 -0800 Subject: [PATCH] rename truss and add warning in README (#64) --- .../README.md | 22 ++-- .../TRT-LLM-README.md | 28 ++--- .../config.yaml | 2 +- .../data/.gitattributes | 0 .../model/__init__.py | 0 .../model/model.py | 32 +++-- .../packages/client.py | 68 +++++++---- .../ensemble/config.pbtxt | 0 .../postprocessing/1/model.py | 84 +++++++------ .../postprocessing/config.pbtxt | 2 +- .../preprocessing/1/model.py | 115 ++++++++++-------- .../preprocessing/config.pbtxt | 2 +- .../tensorrt_llm/config.pbtxt | 2 +- .../packages/utils.py | 12 +- 14 files changed, 207 insertions(+), 162 deletions(-) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/README.md (73%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/TRT-LLM-README.md (90%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/config.yaml (96%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/data/.gitattributes (100%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/model/__init__.py (100%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/model/model.py (91%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/packages/client.py (77%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/packages/inflight_batcher_llm/ensemble/config.pbtxt (100%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/packages/inflight_batcher_llm/postprocessing/1/model.py (75%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/packages/inflight_batcher_llm/postprocessing/config.pbtxt (99%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/packages/inflight_batcher_llm/preprocessing/1/model.py (74%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/packages/inflight_batcher_llm/preprocessing/config.pbtxt (99%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt (99%) rename llama/{llama-7b-trt => llama-2-7b-trt-llm}/packages/utils.py (93%) diff --git a/llama/llama-7b-trt/README.md b/llama/llama-2-7b-trt-llm/README.md similarity index 73% rename from llama/llama-7b-trt/README.md rename to llama/llama-2-7b-trt-llm/README.md index 402b7062..779f0a79 100644 --- a/llama/llama-7b-trt/README.md +++ b/llama/llama-2-7b-trt-llm/README.md @@ -1,20 +1,22 @@ [![Deploy to Baseten](https://user-images.githubusercontent.com/2389286/236301770-16f46d4f-4e23-4db5-9462-f578ec31e751.svg)](https://app.baseten.co/explore/llama) -# LLaMA-7B-Chat Truss +# LLaMA2-7B-Chat Truss -This is a [Truss](https://truss.baseten.co/) for an int8 SmoothQuant version of LLaMA-7B-Chat. Llama is a family of language models released by Meta. This README will walk you through how to deploy this Truss on Baseten to get your own instance of LLaMA-7B-Chat. +This is a [Truss](https://truss.baseten.co/) for an int8 SmoothQuant version of LLaMA2-7B-Chat. Llama is a family of language models released by Meta. This README will walk you through how to deploy this Truss on Baseten to get your own instance of LLaMA2-7B-Chat. + +**Warning: This example is only intended for usage on a single A100, changing your resource type for this deployment will result in unsupported behavior** ## Truss Truss is an open-source model serving framework developed by Baseten. It allows you to develop and deploy machine learning models onto Baseten (and other platforms like [AWS](https://truss.baseten.co/deploy/aws) or [GCP](https://truss.baseten.co/deploy/gcp). 
Using Truss, you can develop a GPU model using [live-reload](https://baseten.co/blog/technical-deep-dive-truss-live-reload), package models and their associated code, create Docker containers and deploy on Baseten. -## Deploying LLaMA-7B +## Deploying LLaMA2-7B-Chat First, clone this repository: ```sh git clone https://github.com/basetenlabs/truss-examples/ -cd llama/llama-7b-trt +cd llama/llama-2-7b-trt-llm ``` Before deployment: @@ -22,7 +24,7 @@ Before deployment: 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). 2. Install the latest version of Truss: `pip install --upgrade truss` -With `llama-7b-trt` as your working directory, you can deploy the model with: +With `llama-2-7b-trt-llm` as your working directory, you can deploy the model with: ```sh truss push @@ -32,8 +34,8 @@ Paste your Baseten API key if prompted. For more information, see [Truss documentation](https://truss.baseten.co). -## LLaMA-7B API documentation -This section provides an overview of the LLaMA-7B API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided instruction. +## LLaMA2-7B API documentation +This section provides an overview of the LLaMA2-7B API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided instruction. ### API route: `predict` @@ -42,12 +44,12 @@ We expect requests will the following information: - ```text_input``` (str): The prompt you'd like to complete - ```output_len``` (int, default: 50): The max token count. This includes the number of tokens in your prompt so if this value is less than your prompt, you'll just recieve a truncated version of the prompt. -- ```beam_width``` (int, default:50): The number of beams to compute. This must be 1 for this version of TRT-LLM. Inflight-batching does not support beams > 1. +- ```beam_width``` (int, default:50): The number of beams to compute. This must be 1 for this version of TRT-LLM. Inflight-batching does not support beams > 1. - ```bad_words_list``` (list, default:[]): A list of words to not include in generated output. - ```stop_words_list``` (list, default:[]): A list of words to stop generation upon encountering. -- ```repetition_penalty``` (float, defualt: 1.0): A repetition penalty to incentivize not repeating tokens. +- ```repetition_penalty``` (float, defualt: 1.0): A repetition penalty to incentivize not repeating tokens. -This Truss will stream responses back. Responses will be buffered chunks of text. +This Truss will stream responses back. Responses will be buffered chunks of text. ## Example usage diff --git a/llama/llama-7b-trt/TRT-LLM-README.md b/llama/llama-2-7b-trt-llm/TRT-LLM-README.md similarity index 90% rename from llama/llama-7b-trt/TRT-LLM-README.md rename to llama/llama-2-7b-trt-llm/TRT-LLM-README.md index 3d7049d5..501a0aae 100644 --- a/llama/llama-7b-trt/TRT-LLM-README.md +++ b/llama/llama-2-7b-trt-llm/TRT-LLM-README.md @@ -1,19 +1,19 @@ # TRTLLM - + ### Overview -This Truss adds support for TRT-LLM engines via Triton Inference Server. TRT-LLM is a highly-performant language model runtime. We leverage the C++ runtime to take advantage of in-flight batching (aka continous batching). +This Truss adds support for TRT-LLM engines via Triton Inference Server. TRT-LLM is a highly-performant language model runtime. 
We leverage the C++ runtime to take advantage of in-flight batching (aka continous batching). -### Prerequisites +### Prerequisites -To use this Truss, your engine must be built with in-flight batching support. Refer to your architecture-specific `build.py` re: how to build with in-flight-batching support. +To use this Truss, your engine must be built with in-flight batching support. Refer to your architecture-specific `build.py` re: how to build with in-flight-batching support. ### Config -This Truss is primarily config driven. This means that most settings you'll need to edit are located in the `config.yaml`. These settings are all located underneath the `model_metadata` key. +This Truss is primarily config driven. This means that most settings you'll need to edit are located in the `config.yaml`. These settings are all located underneath the `model_metadata` key. -- `tensor_parallelism` (int): If you built your model with tensor parallelism support, you'll need to set this value with the same value used during the build engine step. This value should be the same as the number of GPUs in the `resources` section. +- `tensor_parallelism` (int): If you built your model with tensor parallelism support, you'll need to set this value with the same value used during the build engine step. This value should be the same as the number of GPUs in the `resources` section. *Pipeline parallelism is not supported in this version but will be added later. As noted from Nvidia, pipeline parallelism reduces the need for high-bandwidth communication but may incur load-balancing issues and may be less efficient in terms of GPU utilization.* @@ -28,9 +28,9 @@ secrets: hf_access_token: "my_hf_api_key" ``` -### Performance +### Performance -TRT-LLM engines are designed to be highly performant. Once your Truss has been deployed, you may find that you're not fully utilizing the GPU. The following are levers to improve performance but require trial-and-error to identify appropriates. All of these values live inside the `config.pbtxt` for a given ensemble model. +TRT-LLM engines are designed to be highly performant. Once your Truss has been deployed, you may find that you're not fully utilizing the GPU. The following are levers to improve performance but require trial-and-error to identify appropriates. All of these values live inside the `config.pbtxt` for a given ensemble model. #### Preprocessing / Postprocessing @@ -42,7 +42,7 @@ instance_group [ } ] ``` -By default, we load 1 instance of the pre/post models. If you find that the tokenizer is a bottleneck, increasing the `count` variable here will load more replicas of these models and Triton will automatically load balance across model instances. +By default, we load 1 instance of the pre/post models. If you find that the tokenizer is a bottleneck, increasing the `count` variable here will load more replicas of these models and Triton will automatically load balance across model instances. ### Tensorrt LLM ``` @@ -53,7 +53,7 @@ parameters: { } } ``` -By default, we set the `max_tokens_in_paged_kv_cache` to 10000. For a 7B model on 1 A100 with a batch size of 8, we have over 60GB of GPU memory left over. We can increase this value to 100k comfortably and allow for more tokens in the KV cache. Your mileage will vary based on the size of your model and the hardware you're running on. +By default, we set the `max_tokens_in_paged_kv_cache` to 10000. For a 7B model on 1 A100 with a batch size of 8, we have over 60GB of GPU memory left over. 
We can increase this value to 100k comfortably and allow for more tokens in the KV cache. Your mileage will vary based on the size of your model and the hardware you're running on. ``` parameters: { @@ -73,7 +73,7 @@ parameters: { } } ``` -The `max_num_sequences` param is the maximum numbers of requests that the inference server can maintain state for at a given time (state = KV cache + decoder state). If this value is greater than your max batch size, we'll try to ping pong processing between max_num_sequences // max_batch_size batches. This assumes that `enable_trt_overlap` is set to `True` (as it is by default in this Truss). Setting this value higher allows for more parallel processing but uses more GPU memory. +The `max_num_sequences` param is the maximum numbers of requests that the inference server can maintain state for at a given time (state = KV cache + decoder state). If this value is greater than your max batch size, we'll try to ping pong processing between max_num_sequences // max_batch_size batches. This assumes that `enable_trt_overlap` is set to `True` (as it is by default in this Truss). Setting this value higher allows for more parallel processing but uses more GPU memory. ### API @@ -82,9 +82,9 @@ We expect requests will the following information: - ```text_input``` (str): The prompt you'd like to complete - ```output_len``` (int, default: 50): The max token count. This includes the number of tokens in your prompt so if this value is less than your prompt, you'll just recieve a truncated version of the prompt. -- ```beam_width``` (int, default:50): The number of beams to compute. This must be 1 for this version of TRT-LLM. Inflight-batching does not support beams > 1. +- ```beam_width``` (int, default:50): The number of beams to compute. This must be 1 for this version of TRT-LLM. Inflight-batching does not support beams > 1. - ```bad_words_list``` (list, default:[]): A list of words to not include in generated output. - ```stop_words_list``` (list, default:[]): A list of words to stop generation upon encountering. -- ```repetition_penalty``` (float, defualt: 1.0): A repetition penalty to incentivize not repeating tokens. +- ```repetition_penalty``` (float, defualt: 1.0): A repetition penalty to incentivize not repeating tokens. -This Truss will stream responses back. Responses will be buffered chunks of text. \ No newline at end of file +This Truss will stream responses back. Responses will be buffered chunks of text. 
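For reference, a minimal streaming client for the `predict` route documented above might look like the following sketch. It only uses the request fields listed in this README; the model URL and the `BASETEN_API_KEY` environment variable are placeholders for your own deployment, not values defined in this Truss.

```python
# Minimal sketch of calling the deployed model (assumes the standard Baseten
# predict endpoint; substitute your own model URL and API key).
import os

import requests

resp = requests.post(
    "https://model-XXXXXXX.api.baseten.co/production/predict",  # placeholder URL
    headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
    json={
        "text_input": "What is the meaning of life?",
        "output_len": 128,
        "beam_width": 1,  # must be 1; in-flight batching does not support beams > 1
        "bad_words_list": [],
        "stop_words_list": [],
        "repetition_penalty": 1.0,
    },
    stream=True,
)

# Responses stream back as buffered chunks of text.
for chunk in resp.iter_content(chunk_size=None):
    print(chunk.decode("utf-8"), end="", flush=True)
```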
diff --git a/llama/llama-7b-trt/config.yaml b/llama/llama-2-7b-trt-llm/config.yaml similarity index 96% rename from llama/llama-7b-trt/config.yaml rename to llama/llama-2-7b-trt-llm/config.yaml index 7190f9ed..684966bf 100644 --- a/llama/llama-7b-trt/config.yaml +++ b/llama/llama-2-7b-trt-llm/config.yaml @@ -23,4 +23,4 @@ resources: secrets: {} system_packages: [] runtime: - predict_concurrency: 256 \ No newline at end of file + predict_concurrency: 256 diff --git a/llama/llama-7b-trt/data/.gitattributes b/llama/llama-2-7b-trt-llm/data/.gitattributes similarity index 100% rename from llama/llama-7b-trt/data/.gitattributes rename to llama/llama-2-7b-trt-llm/data/.gitattributes diff --git a/llama/llama-7b-trt/model/__init__.py b/llama/llama-2-7b-trt-llm/model/__init__.py similarity index 100% rename from llama/llama-7b-trt/model/__init__.py rename to llama/llama-2-7b-trt-llm/model/__init__.py diff --git a/llama/llama-7b-trt/model/model.py b/llama/llama-2-7b-trt-llm/model/model.py similarity index 91% rename from llama/llama-7b-trt/model/model.py rename to llama/llama-2-7b-trt-llm/model/model.py index c1ed34a8..1476805a 100644 --- a/llama/llama-7b-trt/model/model.py +++ b/llama/llama-2-7b-trt-llm/model/model.py @@ -1,12 +1,14 @@ -import numpy as np -from client import UserData, TritonClient -from threading import Thread -from utils import prepare_grpc_tensor, download_engine -from pathlib import Path from itertools import count +from pathlib import Path +from threading import Thread + +import numpy as np +from client import TritonClient, UserData +from utils import download_engine, prepare_grpc_tensor TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/") + class Model: def __init__(self, **kwargs): self._data_dir = kwargs["data_dir"] @@ -16,7 +18,9 @@ def __init__(self, **kwargs): self.triton_client = None def load(self): - tensor_parallel_count = self._config["model_metadata"].get("tensor_parallelism", 1) + tensor_parallel_count = self._config["model_metadata"].get( + "tensor_parallelism", 1 + ) is_hf_token = "hf_access_token" in self._secrets._base_secrets.keys() is_external_engine_repo = "engine_repository" in self._config["model_metadata"] @@ -26,18 +30,20 @@ def load(self): model_repository_dir=TRITON_MODEL_REPOSITORY_PATH, tensor_parallel_count=tensor_parallel_count, ) - + # Download model from Hugging Face Hub if specified if is_external_engine_repo: download_engine( engine_repository=self._config["model_metadata"]["engine_repository"], fp=self._data_dir, - auth_token=self._secrets["hf_access_token"] if is_hf_token else None + auth_token=self._secrets["hf_access_token"] if is_hf_token else None, ) - + # Load Triton Server and model env = { - "triton_tokenizer_repository": self._config["model_metadata"]["tokenizer_repository"], + "triton_tokenizer_repository": self._config["model_metadata"][ + "tokenizer_repository" + ], } if is_hf_token: env["HUGGING_FACE_HUB_TOKEN"] = self._secrets["hf_access_token"] @@ -74,13 +80,13 @@ def predict(self, model_input): prepare_grpc_tensor("stop_words", stop_words_list), prepare_grpc_tensor("stream", streaming_data), prepare_grpc_tensor("beam_width", beam_width_data), - prepare_grpc_tensor("repetition_penalty", repetition_penalty_data) + prepare_grpc_tensor("repetition_penalty", repetition_penalty_data), ] # Start GRPC stream in a separate thread stream_thread = Thread( target=self.triton_client.start_grpc_stream, - args=(user_data, model_name, inputs, stream_uuid) + args=(user_data, model_name, inputs, stream_uuid), ) 
stream_thread.start() @@ -89,4 +95,4 @@ def predict(self, model_input): yield i # Clean up GRPC stream and thread - self.triton_client.stop_grpc_stream(stream_uuid, stream_thread) \ No newline at end of file + self.triton_client.stop_grpc_stream(stream_uuid, stream_thread) diff --git a/llama/llama-7b-trt/packages/client.py b/llama/llama-2-7b-trt-llm/packages/client.py similarity index 77% rename from llama/llama-7b-trt/packages/client.py rename to llama/llama-2-7b-trt-llm/packages/client.py index 8fe00b4b..5a7e1597 100644 --- a/llama/llama-7b-trt/packages/client.py +++ b/llama/llama-2-7b-trt-llm/packages/client.py @@ -1,28 +1,34 @@ -import os import json +import os import subprocess import time from functools import partial -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient from pathlib import Path from queue import Queue -from utils import prepare_model_repository -from tritonclient.utils import InferenceServerException from threading import Thread +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException +from utils import prepare_model_repository + + class UserData: def __init__(self): self._completed_requests = Queue() + def callback(user_data, result, error): if error: user_data._completed_requests.put(error) else: user_data._completed_requests.put(result) + class TritonClient: - def __init__(self, data_dir: Path, model_repository_dir: Path, tensor_parallel_count=1): + def __init__( + self, data_dir: Path, model_repository_dir: Path, tensor_parallel_count=1 + ): self._data_dir = data_dir self._model_repository_dir = model_repository_dir self._tensor_parallel_count = tensor_parallel_count @@ -31,7 +37,9 @@ def __init__(self, data_dir: Path, model_repository_dir: Path, tensor_parallel_c def start_grpc_stream(self, user_data, model_name, inputs, stream_uuid): """Starts a GRPC stream and sends a request to the Triton server.""" - grpc_client_instance = grpcclient.InferenceServerClient(url="localhost:8001", verbose=False) + grpc_client_instance = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=False + ) self._grpc_client_map[stream_uuid] = grpc_client_instance grpc_client_instance.start_stream(callback=partial(callback, user_data)) grpc_client_instance.async_stream_infer( @@ -59,9 +67,12 @@ def start_server( if mpi == 1: command = [ "tritonserver", - "--model-repository", str(self._model_repository_dir), - "--grpc-port", "8001", - "--http-port", "8003" + "--model-repository", + str(self._model_repository_dir), + "--grpc-port", + "8001", + "--http-port", + "8003", ] command = [ "mpirun", @@ -72,12 +83,15 @@ def start_server( "-n", "1", "tritonserver", - "--model-repository", str(self._model_repository_dir), - "--grpc-port", "8001", - "--http-port", "8003", + "--model-repository", + str(self._model_repository_dir), + "--grpc-port", + "8001", + "--http-port", + "8003", "--disable-auto-complete-config", f"--backend-config=python,shm-region-prefix-name=prefix{str(i)}_", - ":" + ":", ] return subprocess.Popen( command, @@ -89,7 +103,9 @@ def load_server_and_model(self, env: dict): prepare_model_repository(self._data_dir) self.start_server(mpi=self._tensor_parallel_count, env=env) - self._http_client = httpclient.InferenceServerClient(url="localhost:8003", verbose=False) + self._http_client = httpclient.InferenceServerClient( + url="localhost:8003", verbose=False + ) is_server_up = False while not is_server_up: try: @@ -112,8 +128,12 @@ def _is_final_response(result): 
return True if result: - final_response_param = result.get_response().parameters.get("triton_final_response") - return final_response_param.bool_param if final_response_param else False + final_response_param = result.get_response().parameters.get( + "triton_final_response" + ) + return ( + final_response_param.bool_param if final_response_param else False + ) return False result = None @@ -122,16 +142,10 @@ def _is_final_response(result): try: result = user_data._completed_requests.get() if not isinstance(result, InferenceServerException): - res = result.as_numpy('text_output') + res = result.as_numpy("text_output") yield res[0].decode("utf-8") else: - yield json.dumps({ - "status": "error", - "message": result.message() - }) + yield json.dumps({"status": "error", "message": result.message()}) except Exception as e: - yield json.dumps({ - "status": "error", - "message": str(e) - }) - break \ No newline at end of file + yield json.dumps({"status": "error", "message": str(e)}) + break diff --git a/llama/llama-7b-trt/packages/inflight_batcher_llm/ensemble/config.pbtxt b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/ensemble/config.pbtxt similarity index 100% rename from llama/llama-7b-trt/packages/inflight_batcher_llm/ensemble/config.pbtxt rename to llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/ensemble/config.pbtxt diff --git a/llama/llama-7b-trt/packages/inflight_batcher_llm/postprocessing/1/model.py b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/postprocessing/1/model.py similarity index 75% rename from llama/llama-7b-trt/packages/inflight_batcher_llm/postprocessing/1/model.py rename to llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/postprocessing/1/model.py index a92fa861..49ebaa36 100644 --- a/llama/llama-7b-trt/packages/inflight_batcher_llm/postprocessing/1/model.py +++ b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/postprocessing/1/model.py @@ -24,13 +24,14 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os import json +import os +from collections import OrderedDict import numpy as np import triton_python_backend_utils as pb_utils from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -from collections import OrderedDict + class TritonPythonModel: """Your Python model must use the same class name. 
Every Python model @@ -53,35 +54,32 @@ def initialize(self, args): * model_name: Model name """ # Parse model configs - model_config = json.loads(args['model_config']) + model_config = json.loads(args["model_config"]) tokenizer_dir = os.environ["triton_tokenizer_repository"] - tokenizer_type = model_config['parameters']['tokenizer_type'][ - 'string_value'] - - if tokenizer_type == 't5': - self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'auto': - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'llama': + tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"] + + if tokenizer_type == "t5": + self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, padding_side="left") + elif tokenizer_type == "auto": + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, padding_side="left" + ) + elif tokenizer_type == "llama": self.tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side='left') + tokenizer_dir, legacy=False, padding_side="left" + ) else: - raise AttributeError( - f'Unexpected tokenizer type: {tokenizer_type}') + raise AttributeError(f"Unexpected tokenizer type: {tokenizer_type}") self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs - output_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT") + output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy( - output_config['data_type']) + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) self.state_dict = OrderedDict() self.cache_size = 100 - + def execute(self, requests): """`execute` must be implemented in every Python model. 
`execute` function receives a list of pb_utils.InferenceRequest as the only @@ -109,16 +107,20 @@ def execute(self, requests): for idx, request in enumerate(requests): # Get request ID request_id = request.request_id() - + # Get input tensors - tokens_batch = pb_utils.get_input_tensor_by_name(request, 'TOKENS_BATCH').as_numpy().flatten() - + tokens_batch = ( + pb_utils.get_input_tensor_by_name(request, "TOKENS_BATCH") + .as_numpy() + .flatten() + ) + # Get prior state for request ID if request_id in self.state_dict: - previous_tokens = self.state_dict[request_id]['tokens'] + previous_tokens = self.state_dict[request_id]["tokens"] accumulated_tokens = np.concatenate([previous_tokens, tokens_batch]) - self.state_dict[request_id]['tokens'] = accumulated_tokens - + self.state_dict[request_id]["tokens"] = accumulated_tokens + # Move request ID to end of queue to prevent it from being evicted self.state_dict.move_to_end(request_id) else: @@ -126,29 +128,39 @@ def execute(self, requests): if len(self.state_dict) > self.cache_size: self.state_dict.popitem(last=False) - self.state_dict[request_id] = {'tokens': tokens_batch, 'prev_str': ""} + self.state_dict[request_id] = {"tokens": tokens_batch, "prev_str": ""} # Postprocess output data - new_string = self._postprocessing(self.state_dict[request_id]['tokens']) - old_string = self.state_dict[request_id]['prev_str'] - + new_string = self._postprocessing(self.state_dict[request_id]["tokens"]) + old_string = self.state_dict[request_id]["prev_str"] + # Compute delta between previous and new string delta = self._compute_delta(old_string, new_string) - self.state_dict[request_id]['prev_str'] = new_string + self.state_dict[request_id]["prev_str"] = new_string # Create output tensor - output_tensor = pb_utils.Tensor('OUTPUT', np.array([delta]).astype(self.output_dtype)) - inference_response = pb_utils.InferenceResponse(output_tensors=[output_tensor]) + output_tensor = pb_utils.Tensor( + "OUTPUT", np.array([delta]).astype(self.output_dtype) + ) + inference_response = pb_utils.InferenceResponse( + output_tensors=[output_tensor] + ) responses.append(inference_response) return responses def _compute_delta(self, prev_str, new_str): - delta = "".join([char for index, char in enumerate(new_str) if index >= len(prev_str) or char != prev_str[index]]) + delta = "".join( + [ + char + for index, char in enumerate(new_str) + if index >= len(prev_str) or char != prev_str[index] + ] + ) return delta def finalize(self): - print('Cleaning up...') + print("Cleaning up...") def _postprocessing(self, tokens): decoded_tokens = self.tokenizer.decode(tokens) diff --git a/llama/llama-7b-trt/packages/inflight_batcher_llm/postprocessing/config.pbtxt b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/postprocessing/config.pbtxt similarity index 99% rename from llama/llama-7b-trt/packages/inflight_batcher_llm/postprocessing/config.pbtxt rename to llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/postprocessing/config.pbtxt index a5539eb8..0abeaac8 100644 --- a/llama/llama-7b-trt/packages/inflight_batcher_llm/postprocessing/config.pbtxt +++ b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/postprocessing/config.pbtxt @@ -61,4 +61,4 @@ instance_group [ count: 1 kind: KIND_CPU } -] \ No newline at end of file +] diff --git a/llama/llama-7b-trt/packages/inflight_batcher_llm/preprocessing/1/model.py b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/preprocessing/1/model.py similarity index 74% rename from 
llama/llama-7b-trt/packages/inflight_batcher_llm/preprocessing/1/model.py rename to llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/preprocessing/1/model.py index 9365c84c..a39fc458 100644 --- a/llama/llama-7b-trt/packages/inflight_batcher_llm/preprocessing/1/model.py +++ b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/preprocessing/1/model.py @@ -24,9 +24,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os import csv import json +import os from typing import List import numpy as np @@ -57,39 +57,45 @@ def initialize(self, args): * model_name: Model name """ # Parse model configs - model_config = json.loads(args['model_config']) + model_config = json.loads(args["model_config"]) tokenizer_dir = os.environ["triton_tokenizer_repository"] - tokenizer_type = model_config['parameters']['tokenizer_type'][ - 'string_value'] - - if tokenizer_type == 't5': - self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'auto': - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'llama': + tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"] + + if tokenizer_type == "t5": + self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, padding_side="left") + elif tokenizer_type == "auto": + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, padding_side="left" + ) + elif tokenizer_type == "llama": self.tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side='left') + tokenizer_dir, legacy=False, padding_side="left" + ) else: - raise AttributeError( - f'Unexpected tokenizer type: {tokenizer_type}') + raise AttributeError(f"Unexpected tokenizer type: {tokenizer_type}") self.tokenizer.pad_token = self.tokenizer.eos_token - self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token, - add_special_tokens=False)[0] + self.pad_id = self.tokenizer.encode( + self.tokenizer.pad_token, add_special_tokens=False + )[0] # Parse model output configs and convert Triton types to numpy types input_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS" + "INPUT_ID", + "REQUEST_INPUT_LEN", + "BAD_WORDS_IDS", + "STOP_WORDS_IDS", ] for input_name in input_names: setattr( self, input_name.lower() + "_dtype", pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name( - model_config, input_name)['data_type'])) + pb_utils.get_output_config_by_name(model_config, input_name)[ + "data_type" + ] + ), + ) def execute(self, requests): """`execute` must be implemented in every Python model. `execute` @@ -117,15 +123,17 @@ def execute(self, requests): # and create a pb_utils.InferenceResponse for each of them. for idx, request in enumerate(requests): # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, - 'QUERY').as_numpy() + query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() request_output_len = pb_utils.get_input_tensor_by_name( - request, 'REQUEST_OUTPUT_LEN').as_numpy() + request, "REQUEST_OUTPUT_LEN" + ).as_numpy() bad_words_dict = pb_utils.get_input_tensor_by_name( - request, 'BAD_WORDS_DICT').as_numpy() + request, "BAD_WORDS_DICT" + ).as_numpy() stop_words_dict = pb_utils.get_input_tensor_by_name( - request, 'STOP_WORDS_DICT').as_numpy() + request, "STOP_WORDS_DICT" + ).as_numpy() # Preprocessing input data. 
input_id, request_input_len = self._create_request(query) @@ -135,17 +143,17 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. input_id_tensor = pb_utils.Tensor( - 'INPUT_ID', - np.array(input_id).astype(self.input_id_dtype)) + "INPUT_ID", np.array(input_id).astype(self.input_id_dtype) + ) request_input_len_tensor = pb_utils.Tensor( - 'REQUEST_INPUT_LEN', - np.array(request_input_len).astype( - self.request_input_len_dtype)) + "REQUEST_INPUT_LEN", + np.array(request_input_len).astype(self.request_input_len_dtype), + ) request_output_len_tensor = pb_utils.Tensor( - 'REQUEST_OUTPUT_LEN', request_output_len) - bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) - stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS', - stop_words) + "REQUEST_OUTPUT_LEN", request_output_len + ) + bad_words_ids_tensor = pb_utils.Tensor("BAD_WORDS_IDS", bad_words) + stop_words_ids_tensor = pb_utils.Tensor("STOP_WORDS_IDS", stop_words) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -154,10 +162,15 @@ def execute(self, requests): # # pb_utils.InferenceResponse( # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor - ]) + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, + bad_words_ids_tensor, + stop_words_ids_tensor, + request_input_len_tensor, + request_output_len_tensor, + ] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -169,28 +182,25 @@ def finalize(self): Implementing `finalize` function is optional. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") def _create_request(self, query): """ - query : batch string (2D numpy array) + query : batch string (2D numpy array) """ start_ids = [ - torch.IntTensor(self.tokenizer.encode(s[0].decode())) - for s in query + torch.IntTensor(self.tokenizer.encode(s[0].decode())) for s in query ] start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) - start_ids = pad_sequence(start_ids, - batch_first=True, - padding_value=self.pad_id) + start_ids = pad_sequence(start_ids, batch_first=True, padding_value=self.pad_id) # input_len = min(start_lengths) - #attn_mask = torch.ones((batch_size, input_len, input_len)).tril() + # attn_mask = torch.ones((batch_size, input_len, input_len)).tril() return start_ids, start_lengths def _to_word_list_format(self, word_dict: List[List[str]]): - ''' + """ format of word_dict len(word_dict) should be same to batch_size word_dict[i] means the words for batch i @@ -198,7 +208,7 @@ def _to_word_list_format(self, word_dict: List[List[str]]): This string can contains several sentences and split by ",". For example, if word_dict[2] = " I am happy, I am sad", then this function will return the ids for two short sentences " I am happy" and " I am sad". 
- ''' + """ assert self.tokenizer != None, "need to set tokenizer" flat_ids = [] @@ -226,10 +236,7 @@ def _to_word_list_format(self, word_dict: List[List[str]]): pad_to = max(1, max(len(ids) for ids in flat_ids)) for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), - constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), - constant_values=-1) + flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) + offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) - return np.array([flat_ids, offsets], dtype="int32").transpose( - (1, 0, 2)) + return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) diff --git a/llama/llama-7b-trt/packages/inflight_batcher_llm/preprocessing/config.pbtxt b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/preprocessing/config.pbtxt similarity index 99% rename from llama/llama-7b-trt/packages/inflight_batcher_llm/preprocessing/config.pbtxt rename to llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/preprocessing/config.pbtxt index 8ffd9039..f9b150dd 100644 --- a/llama/llama-7b-trt/packages/inflight_batcher_llm/preprocessing/config.pbtxt +++ b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/preprocessing/config.pbtxt @@ -96,4 +96,4 @@ instance_group [ count: 1 kind: KIND_CPU } -] \ No newline at end of file +] diff --git a/llama/llama-7b-trt/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt similarity index 99% rename from llama/llama-7b-trt/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt rename to llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt index ce49438c..cf4a4352 100644 --- a/llama/llama-7b-trt/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt +++ b/llama/llama-2-7b-trt-llm/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt @@ -205,4 +205,4 @@ parameters: { value: { string_value: "True" } -} \ No newline at end of file +} diff --git a/llama/llama-7b-trt/packages/utils.py b/llama/llama-2-7b-trt-llm/packages/utils.py similarity index 93% rename from llama/llama-7b-trt/packages/utils.py rename to llama/llama-2-7b-trt-llm/packages/utils.py index 3673bf78..e00e605c 100644 --- a/llama/llama-7b-trt/packages/utils.py +++ b/llama/llama-2-7b-trt-llm/packages/utils.py @@ -1,8 +1,10 @@ -from huggingface_hub import snapshot_download from pathlib import Path + import tritonclient.grpc as grpcclient +from huggingface_hub import snapshot_download from tritonclient.utils import np_to_triton_dtype + def move_all_files(src: Path, dest: Path): """ Moves all files from `src` to `dest` recursively. @@ -15,6 +17,7 @@ def move_all_files(src: Path, dest: Path): else: item.rename(dest_item) + def prepare_model_repository(data_dir: Path): """ Moves all files from `data_dir` to the model repository directory. @@ -30,12 +33,13 @@ def prepare_model_repository(data_dir: Path): # Move all files and directories from data_dir to dest_dir move_all_files(data_dir, dest_dir) + def prepare_grpc_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) + t = grpcclient.InferInput(name, input.shape, np_to_triton_dtype(input.dtype)) t.set_data_from_numpy(input) return t + def download_engine(engine_repository: str, fp: Path, auth_token=None): """ Downloads the specified engine from Hugging Face Hub. 
@@ -46,4 +50,4 @@ def download_engine(engine_repository: str, fp: Path, auth_token=None): local_dir_use_symlinks=False, max_workers=4, **({"use_auth_token": auth_token} if auth_token is not None else {}), - ) \ No newline at end of file + )
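As a footnote to the `packages/utils.py` changes above, the sketch below illustrates how these helpers fit together outside of `model/model.py`. It is an illustration only: the data directory, engine repository name, and tensor dtype are placeholders, and the real gRPC input names are defined by the ensemble's `config.pbtxt`.

```python
# Rough sketch of the packages/utils.py helpers; paths and the engine repo
# name are placeholders, not values from this Truss.
from pathlib import Path

import numpy as np
from utils import download_engine, prepare_grpc_tensor, prepare_model_repository

data_dir = Path("/tmp/engine")  # placeholder for the Truss data directory

# Fetch a prebuilt TRT-LLM engine from the Hugging Face Hub, then lay it out
# in the Triton model repository that TritonClient serves from.
download_engine("your-org/your-trt-llm-engine", fp=data_dir, auth_token=None)
prepare_model_repository(data_dir)

# Wrap a numpy array as a gRPC InferInput, as model.py does before streaming.
beam_width = np.array([[1]], dtype=np.int32)  # dtype shown for illustration
inputs = [prepare_grpc_tensor("beam_width", beam_width)]
```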