From eed1098364e94357886fbcfb20774e6078483f28 Mon Sep 17 00:00:00 2001
From: michaelfeil <63565275+michaelfeil@users.noreply.github.com>
Date: Fri, 14 Feb 2025 19:56:14 +0000
Subject: [PATCH] update all models in trt-llm

---
 .../README.md                             |   2 +-
 .../config.yaml                           |   2 +-
 .../README.md                             |   2 +-
 .../config.yaml                           |   2 +-
 .../README.md                             |   2 +-
 .../README.md                             |  11 +-
 .../config.yaml                           |   7 +-
 .../README.md                             |   2 +-
 .../README.md                             |   2 +-
 .../Briton-microsoft-phi-4/README.md      |   5 +-
 .../Briton-microsoft-phi-4/config.yaml    |   5 +-
 .../README.md                             |   2 +-
 .../README.md                             |   2 +-
 .../README.md                             |   6 +-
 .../config.yaml                           |   4 +-
 .../README.md                             |   1 -
 .../README.md                             | 126 ------------------
 .../config.yaml                           |  30 -----
 .../README.md                             |   4 +-
 .../config.yaml                           |   4 +-
 .../templating/deploy_all.py              |   6 +-
 .../templating/generate_templates.py      |  94 ++++++++-----
 .../templating/test_deploy.py             |  54 ++++++++
 text-embeddings-inference/README.md       |   1 -
 24 files changed, 145 insertions(+), 231 deletions(-)
 delete mode 100644 11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/README.md
 delete mode 100644 11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/templating/test_deploy.py

diff --git a/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/README.md b/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/README.md
index 60a3676..bbbf214 100644
--- a/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/README.md
@@ -92,7 +92,7 @@ model_name: BEI-ncbi-medcpt-cross-encoder-reranker-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 10Gi
   use_gpu: true
diff --git a/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/config.yaml b/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/config.yaml
index 861f2d1..ddb5b45 100644
--- a/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/config.yaml
@@ -8,7 +8,7 @@ model_name: BEI-ncbi-medcpt-cross-encoder-reranker-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 10Gi
   use_gpu: true
diff --git a/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/README.md
index 50b7934..3bb7f51 100644
--- a/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/README.md
@@ -105,7 +105,7 @@ trt_llm:
       source: HF
     max_num_tokens: 131072
     max_seq_len: 1000001
-    num_builder_gpus: 2
+    num_builder_gpus: 4
     quantization_type: fp8
 ```
diff --git a/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/config.yaml
index 8efe759..ec22240 100644
--- a/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/config.yaml
@@ -24,5 +24,5 @@ trt_llm:
       source: HF
     max_num_tokens: 131072
     max_seq_len: 1000001
-    num_builder_gpus: 2
+    num_builder_gpus: 4
     quantization_type: fp8
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-405b/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-405b/README.md
index 8cf5417..b6b2bf6 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-405b/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-405b/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/README.md
index 9ad739a..a5fe008 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
@@ -129,7 +129,7 @@ print(completion.choices[0].message.tool_calls)
 
 ## Config.yaml
 
-By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16.
+By default, the following configuration is used for this deployment.
 
 ```yaml
 build_commands: []
@@ -162,11 +162,8 @@ trt_llm:
       repo: meta-llama/Llama-3.2-3B-Instruct
       revision: main
       source: HF
-    max_seq_len: 131072
-    num_builder_gpus: 4
-    plugin_configuration:
-      use_fp8_context_fmha: true
-    quantization_type: fp8_kv
+    max_seq_len: 4096
+    quantization_type: no_quant
     tensor_parallel_count: 1
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/config.yaml
index b74f8c3..7ff6941 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/config.yaml
@@ -28,11 +28,8 @@ trt_llm:
       repo: meta-llama/Llama-3.2-3B-Instruct
       revision: main
       source: HF
-    max_seq_len: 131072
-    num_builder_gpus: 4
-    plugin_configuration:
-      use_fp8_context_fmha: true
-    quantization_type: fp8_kv
+    max_seq_len: 4096
+    quantization_type: no_quant
     tensor_parallel_count: 1
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp2/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp2/README.md
index 5771d9f..da26c49 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp2/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp2/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct/README.md
index d4fc59e..0aa8192 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/README.md
index 86bf7f3..fe354b2 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/README.md
@@ -149,7 +149,7 @@ model_name: Briton-microsoft-phi-4-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -163,11 +163,10 @@ trt_llm:
       revision: main
       source: HF
     max_seq_len: 16384
-    num_builder_gpus: 4
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 1
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/config.yaml
index c38c390..4d744a5 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/config.yaml
@@ -15,7 +15,7 @@ model_name: Briton-microsoft-phi-4-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -29,10 +29,9 @@ trt_llm:
       revision: main
       source: HF
     max_seq_len: 16384
-    num_builder_gpus: 4
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 1
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3/README.md
index bc86618..51eee9f 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-small-24b-instruct-2501/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-small-24b-instruct-2501/README.md
index a8a198a..dfc1644 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-small-24b-instruct-2501/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-small-24b-instruct-2501/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/README.md
index 7501482..9a4b043 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+
 
 First, clone this repository:
 ```sh
@@ -149,7 +149,7 @@ model_name: Briton-tiiuae-falcon3-10b-instruct-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4:4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -166,7 +166,7 @@ trt_llm:
     plugin_configuration:
      use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 4
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/config.yaml
index 2131ea0..13619c2 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/config.yaml
@@ -15,7 +15,7 @@ model_name: Briton-tiiuae-falcon3-10b-instruct-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4:4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -32,6 +32,6 @@ trt_llm:
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 4
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/README.md b/11-embeddings-reranker-classification-tensorrt/README.md
index 3637cc3..260aea4 100644
--- a/11-embeddings-reranker-classification-tensorrt/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/README.md
@@ -35,7 +35,6 @@ You can find the following deployments in this repository:
 - [sentence-transformers/all-MiniLM-L6-v2-embedding-TEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding)
 
 ## Reranker Deployments:
-- [Alibaba-NLP/gte-multilingual-reranker-base-TEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base)
 - [BAAI/bge-reranker-large-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-baai-bge-reranker-large)
 - [BAAI/bge-reranker-v2-m3-multilingual-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-baai-bge-reranker-v2-m3-multilingual)
 - [ncbi/MedCPT-Cross-Encoder-reranker-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker)
diff --git a/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/README.md b/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/README.md
deleted file mode 100644
index a1bd64d..0000000
--- a/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/README.md
+++ /dev/null
@@ -1,126 +0,0 @@
-# Huggingface's text-embeddings-inference with Alibaba-NLP/gte-multilingual-reranker-base
-
-This is a Deployment for Huggingface's text-embeddings-inference with Alibaba-NLP/gte-multilingual-reranker-base. TEI is huggingface's solution for (text) embeddings, reranking models and prediction models.
-
-Supported models are tagged here: https://huggingface.co/models?other=text-embeddings-inference&sort=trending
-
-
-For TEI you have to perform a manual selection of the Docker Image. We have mirrored the following images:
-```
-CPU baseten/text-embeddings-inference-mirror:cpu-1.6
-Turing (T4, ...) baseten/text-embeddings-inference-mirror:turing-1.6
-Ampere 80 (A100, A30) baseten/text-embeddings-inference-mirror:1.6
-Ampere 86 (A10, A10G, A40, ...) baseten/text-embeddings-inference-mirror:86-1.6
-Ada Lovelace (L4, ...) baseten/text-embeddings-inference-mirror:89-1.6
-Hopper (H100/H100 40GB/H200) baseten/text-embeddings-inference-mirror:hopper-1.6
-```
-
-
-# Examples:
-This deployment is specifically designed for the Hugging Face model [Alibaba-NLP/gte-multilingual-reranker-base](https://huggingface.co/Alibaba-NLP/gte-multilingual-reranker-base).
-Suitable models can be identified by the `ForSequenceClassification` suffix in the model name. Reranker models may have at most one label, which contains the score of the reranking.
-
-Alibaba-NLP/gte-multilingual-reranker-base is a reranker model, used to re-rank a list of items, given a query. \nIt is frequently used in search engines, recommendation systems, and more.
-
-
-## Deployment with Truss
-
-Before deployment:
-
-1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
-2. Install the latest version of Truss: `pip install --upgrade truss`
-
-
-First, clone this repository:
-```sh
-git clone https://github.com/basetenlabs/truss-examples.git
-cd 11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base
-```
-
-With `11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted.
-
-```sh
-truss push --publish
-# prints:
-# ✨ Model TEI-alibaba-nlp-gte-multilingual-reranker-base-truss-example was successfully pushed ✨
-# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx
-```
-
-## Call your model
-
-### API-Schema:
-POST-Route: `https://model-xxxxxx.api.baseten.co/environments/production/sync/rerank`:
-```json
-{
-  "query": "What is Baseten?",
-  "raw_scores": false,
-  "return_text": false,
-  "texts": [
-    "Deep Learning is ...", "Baseten is a fast inference provider"
-  ],
-  "truncate": false,
-  "truncation_direction": "right"
-}
-```
-
-Returns:
-```json
-[
-  {
-    "index": 0,
-    "score": 1,
-    "text": "Deep Learning is ..."
-  }
-]
-```
-The OpenAPI.json is available under https://model-xxxxxx.api.baseten.co/environments/production/sync/openapi.json for more details.
-
-#### Advanced:
-You may also use Baseten's async jobs API, which returns a request_id, which you can use to query the status of the job and get the results.
-
-POST-Route: `https://model-xxxxxx.api.baseten.co/environments/production/async/rerank`
-Read more about [Baseten's Async API here](https://docs.baseten.co/invoke/async)
-
-### OpenAI compatible client library
-OpenAI.com does not have a rerank endpoint, therefore no client library is available.
-
-
-## Config.yaml
-By default, the following configuration is used for this deployment.
-
-```yaml
-base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
-build_commands:
-- 'git clone https://huggingface.co/Alibaba-NLP/gte-multilingual-reranker-base /data/local-model
-  # optional step to download the weights of the model into the image, otherwise specify
-  the --model-id Alibaba-NLP/gte-multilingual-reranker-base directly `start_command`'
-docker_server:
-  liveness_endpoint: /health
-  predict_endpoint: /rerank
-  readiness_endpoint: /health
-  server_port: 7997
-  start_command: text-embeddings-router --port 7997 --model-id /data/local-model --max-client-batch-size
-    128 --max-concurrent-requests 40 --max-batch-tokens 16384
-environment_variables: {}
-external_package_dirs: []
-model_metadata:
-  example_model_input:
-    input: This redirects to the embedding endpoint. Use the /sync API to reach /rerank
-model_name: TEI-alibaba-nlp-gte-multilingual-reranker-base-truss-example
-python_version: py39
-requirements: []
-resources:
-  accelerator: L4
-  cpu: '1'
-  memory: 2Gi
-  use_gpu: true
-runtime:
-  predict_concurrency: 40
-secrets: {}
-system_packages: []
-
-```
-
-## Support
-If you have any questions or need assistance, please open an issue in this repository or contact our support team.
diff --git a/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/config.yaml b/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/config.yaml
deleted file mode 100644
index 45d4db7..0000000
--- a/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/config.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
-build_commands:
-- 'git clone https://huggingface.co/Alibaba-NLP/gte-multilingual-reranker-base /data/local-model
-  # optional step to download the weights of the model into the image, otherwise specify
-  the --model-id Alibaba-NLP/gte-multilingual-reranker-base directly `start_command`'
-docker_server:
-  liveness_endpoint: /health
-  predict_endpoint: /rerank
-  readiness_endpoint: /health
-  server_port: 7997
-  start_command: text-embeddings-router --port 7997 --model-id /data/local-model --max-client-batch-size
-    128 --max-concurrent-requests 40 --max-batch-tokens 16384
-environment_variables: {}
-external_package_dirs: []
-model_metadata:
-  example_model_input:
-    input: This redirects to the embedding endpoint. Use the /sync API to reach /rerank
-model_name: TEI-alibaba-nlp-gte-multilingual-reranker-base-truss-example
-python_version: py39
-requirements: []
-resources:
-  accelerator: L4
-  cpu: '1'
-  memory: 2Gi
-  use_gpu: true
-runtime:
-  predict_concurrency: 40
-secrets: {}
-system_packages: []
diff --git a/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/README.md b/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/README.md
index c857c23..bdc7fe8 100644
--- a/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/README.md
@@ -131,7 +131,7 @@ By default, the following configuration is used for this deployment.
 
 ```yaml
 base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
+  image: baseten/text-embeddings-inference-mirror:86-1.6
 build_commands:
 - 'git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 /data/local-model
   # optional step to download the weights of the model into the image, otherwise specify
@@ -154,7 +154,7 @@ model_name: TEI-sentence-transformers-all-minilm-l6-v2-embedding-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 2Gi
   use_gpu: true
diff --git a/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/config.yaml b/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/config.yaml
index d0a2fb4..77d2380 100644
--- a/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/config.yaml
@@ -1,5 +1,5 @@
 base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
+  image: baseten/text-embeddings-inference-mirror:86-1.6
 build_commands:
 - 'git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 /data/local-model
   # optional step to download the weights of the model into the image, otherwise specify
@@ -22,7 +22,7 @@ model_name: TEI-sentence-transformers-all-minilm-l6-v2-embedding-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 2Gi
   use_gpu: true
diff --git a/11-embeddings-reranker-classification-tensorrt/templating/deploy_all.py b/11-embeddings-reranker-classification-tensorrt/templating/deploy_all.py
index 0e1fdc2..845f70b 100644
--- a/11-embeddings-reranker-classification-tensorrt/templating/deploy_all.py
+++ b/11-embeddings-reranker-classification-tensorrt/templating/deploy_all.py
@@ -44,7 +44,11 @@ def wrapper(*args, **kwargs):
 
 
 def matches_name(model: dict, key: str = "name") -> bool:
-    return model[key].endswith("truss-example") and FILTER in model[key]
+    return (
+        model[key].endswith("truss-example")
+        and FILTER in model[key]
+        and ("405b" not in model[key])
+    )
 
 
 @retry(max_retries=3, delay=2)
diff --git a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py
index d7f472e..991df4d 100644
--- a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py
+++ b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py
@@ -1,7 +1,9 @@
 from dataclasses import field
+from functools import cached_property
 from pathlib import Path
 from typing import Any, Optional
 
+import requests
 from pydantic import dataclasses
 from transformers import AutoConfig
 from truss.base.trt_llm_config import (
@@ -64,6 +66,13 @@ def make_truss_config(self, dp: "Deployment") -> TrussConfig:
         max_position_embeddings = hf_cfg.max_position_embeddings
         max_num_tokens = max(16384, max_position_embeddings)
 
+        num_builder_gpus = 1
+        if dp.accelerator in [Accelerator.H100]:
+            num_builder_gpus = 2
+        elif dp.accelerator in [Accelerator.L4]:
+            num_builder_gpus = 4
+
         return TrussConfig(
             model_metadata=dp.task.model_metadata,
             trt_llm=TRTLLMConfiguration(
@@ -80,11 +89,7 @@ def make_truss_config(self, dp: "Deployment") -> TrussConfig:
                     {
                         "quantization_type": TrussTRTLLMQuantizationType.FP8,
                         # give more resources / cpu ram + vram on build if the model uses a non-MIG GPU
-                        "num_builder_gpus": (
-                            2
-                            if dp.accelerator in [Accelerator.H100, Accelerator.L4]
-                            else 1
-                        ),
+                        "num_builder_gpus": num_builder_gpus,
                     }
                     if dp.is_fp8
                     else {}
@@ -192,7 +197,21 @@ def make_truss_config(self, dp):
         )  # make sure model is available
         max_position_embeddings = hf_cfg.max_position_embeddings
         assert self.trt_config is not None
-        self.trt_config.build.max_seq_len = max(max_position_embeddings, 512)
+        self.trt_config.build.max_seq_len = max_position_embeddings
+        assert max_position_embeddings >= 512, "Model needs to have at least 512 tokens"
+        if (
+            dp.accelerator in [Accelerator.L4, Accelerator.A10G]
+            and self.trt_config.build.tensor_parallel_count == 1
+        ):
+            # limit the context length on single small GPUs, as it's hard to tune
+            self.trt_config.build.max_seq_len = min(
+                self.trt_config.build.max_seq_len, 4096
+            )
+        secrets = {}
+        if dp.is_gated:
+            # fix: pass the access token through
+            # TODO: remove the need for the token at runtime
+            secrets["hf_access_token"] = None
 
         return TrussConfig(
             model_metadata=dp.task.model_metadata,
@@ -530,7 +549,6 @@ class Deployment:
     accelerator: Accelerator
     task: Task
     solution: Solution
-    is_gated: bool = False
     is_fp8: bool = False
 
     def __init__(self, *args, **kwargs):
@@ -540,16 +558,26 @@ def __init__(self, *args, **kwargs):
             "fp8" in self.solution.trt_config.build.quantization_type.value
         )
 
-        try:
-            AutoConfig.from_pretrained(self.hf_model_id, token="invalid")
-            self.is_gated = False
-        except Exception:
-            try:
-                # has only access with permissions
-                AutoConfig.from_pretrained(self.hf_model_id)
-                self.is_gated = True
-            except Exception:
-                raise
+    @cached_property
+    def hf_config(self):
+        return AutoConfig.from_pretrained(self.hf_model_id, trust_remote_code=True)
+
+    @cached_property
+    def is_gated(self):
+        # make sure the model is available via AutoConfig
+        assert self.hf_config is not None
+
+        # Fetch config.json without credentials; a 401 means the repo is gated.
+        url = f"https://huggingface.co/{self.hf_model_id}/resolve/main/config.json"
+
+        response = requests.get(url)
+        if response.status_code == 200:
+            return False
+        elif response.status_code == 401:
+            return True
+        else:
+            raise ValueError(f"Received HTTP status code: {response.status_code}")
 
     @property
     def folder_name(self):
@@ -571,9 +599,10 @@ def generate_bei_deployment(dp: Deployment):
     folder_relative_path = SUBFOLDER / dp.folder_name
     full_folder_path = root / folder_relative_path
 
-    is_gated = (
+    is_gated_notice = (
         "Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). "
-        "Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`."
+        "Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. "
+        "Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store."
         if dp.is_gated
         else ""
     )
@@ -620,7 +649,7 @@ def generate_bei_deployment(dp: Deployment):
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-{is_gated}
+{is_gated_notice}
 
 First, clone this repository:
 ```sh
@@ -759,7 +788,7 @@ def generate_bei_deployment(dp: Deployment):
     Deployment(
         "ncbi/MedCPT-Cross-Encoder-reranker",
         "ncbi/MedCPT-Cross-Encoder",
-        Accelerator.L4,
+        Accelerator.A10G,
         Reranker(),
         solution=BEI(),
     ),
@@ -815,7 +844,7 @@ def generate_bei_deployment(dp: Deployment):
     Deployment(
         # name="sentence-transformers/all-MiniLM-L6-v2-embedding",
         hf_model_id="sentence-transformers/all-MiniLM-L6-v2",
-        accelerator=Accelerator.L4,
+        accelerator=Accelerator.A10G,
         task=Embedder(),
         solution=HFTEI(),
     ),
@@ -840,13 +869,6 @@ def generate_bei_deployment(dp: Deployment):
         task=Embedder(),
         solution=HFTEI(),
     ),
-    Deployment(  #
-        name="Alibaba-NLP/gte-multilingual-reranker-base",
-        hf_model_id="Alibaba-NLP/gte-multilingual-reranker-base",
-        accelerator=Accelerator.L4,
-        task=Reranker(),
-        solution=HFTEI(),
-    ),
 ]
@@ -904,7 +926,6 @@ def llamalike_config(
                 repoid="meta-llama/Llama-3.3-70B-Instruct", tp=2
             )
         ),
-        is_gated=True,
     ),
     # meta-llama/Llama-3.1-405B tp8
     Deployment(
         "meta-llama/Llama-3.2-3B-Instruct",
         "meta-llama/Llama-3.2-3B-Instruct",
         Accelerator.L4,
         TextGen(),
         solution=Briton(
-            trt_config=llamalike_config(repoid="meta-llama/Llama-3.2-3B-Instruct")
+            trt_config=llamalike_config(
+                repoid="meta-llama/Llama-3.2-3B-Instruct",
+                tp=1,
+                quant=TrussTRTLLMQuantizationType.NO_QUANT,
+            )
         ),
     ),
     Deployment(
@@ -923,7 +948,6 @@
         solution=Briton(
             trt_config=llamalike_config(repoid="meta-llama/Llama-3.1-405B", tp=8)
         ),
-        is_gated=True,
     ),
     Deployment(
         "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
@@ -971,14 +995,13 @@
                 repoid="mistralai/Mistral-Small-24B-Instruct-2501"
             )
         ),
-        is_gated=True,
     ),
     # unsloth/phi-4
     Deployment(
         "microsoft/phi-4",
         "unsloth/phi-4",
         Accelerator.L4,
         TextGen(),
-        solution=Briton(trt_config=llamalike_config(repoid="unsloth/phi-4")),
+        solution=Briton(trt_config=llamalike_config(repoid="unsloth/phi-4", tp=2)),
     ),
     Deployment(
         "tiiuae/Falcon3-10B-Instruct",
         "tiiuae/Falcon3-10B-Instruct",
         Accelerator.L4,
         TextGen(),
         solution=Briton(
-            trt_config=llamalike_config(repoid="tiiuae/Falcon3-10B-Instruct", tp=4)
+            trt_config=llamalike_config(repoid="tiiuae/Falcon3-10B-Instruct", tp=2)
         ),
-        is_gated=True,
     ),
     Deployment(
         "mistralai/Mistral-7B-Instruct-v0.3",
diff --git a/11-embeddings-reranker-classification-tensorrt/templating/test_deploy.py b/11-embeddings-reranker-classification-tensorrt/templating/test_deploy.py
new file mode 100644
index 0000000..573f8bf
--- /dev/null
+++ b/11-embeddings-reranker-classification-tensorrt/templating/test_deploy.py
@@ -0,0 +1,54 @@
+def test_deploy(deploy_id: str = "03ykpnkw"):
+    import os
+
+    from openai import OpenAI
+
+    client = OpenAI(
+        api_key=os.environ["BASETEN_API_KEY"],
+        base_url=f"https://model-{deploy_id}.api.baseten.co/environments/production/sync/v1",
+    )
+
+    # Default completion
+    response_completion = client.completions.create(
+        model="not_required",
+        prompt="Q: Tell me everything about Baseten.co! A:",
+        temperature=0.3,
+        max_tokens=100,
+    )
+    assert "baseten" in response_completion.choices[0].text.lower()
+
+    # Chat completion
+    response_chat = client.chat.completions.create(
+        model="",
+        messages=[{"role": "user", "content": "Tell me everything about Baseten.co!"}],
+        temperature=0.3,
+        max_tokens=100,
+    )
+    assert "baseten" in response_chat.choices[0].message.content.lower()
+    # Structured output
+    from pydantic import BaseModel
+
+    class CalendarEvent(BaseModel):
+        name: str
+        date: str
+        participants: list[str]
+
+    completion = client.beta.chat.completions.parse(
+        model="not_required",
+        messages=[
+            {"role": "system", "content": "Extract the event information."},
+            {
+                "role": "user",
+                "content": "Alice and Bob are going to a science fair on Friday.",
+            },
+        ],
+        response_format=CalendarEvent,
+    )
+
+    event = completion.choices[0].message.parsed
+    assert "science" in event.name.lower()
+    print(f"✅ All tests passed for deployment {deploy_id}")
+
+
+if __name__ == "__main__":
+    test_deploy()
diff --git a/text-embeddings-inference/README.md b/text-embeddings-inference/README.md
index 37526a6..a52303f 100644
--- a/text-embeddings-inference/README.md
+++ b/text-embeddings-inference/README.md
@@ -1,7 +1,6 @@
 # Notice
 This section has moved to [jina-ai/jina-embeddings-v2-base-en-TEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt) with an overview of fast embeddings.
 
-
 # Text Embeddings Inference Truss
 This is a Trussless Custom Server example to deploy [text-embeddings-inference](https://github.com/huggingface/text-embeddings-inference), a high-performance server that serves text embedding, reranking, and classification models as an API.