From eed1098364e94357886fbcfb20774e6078483f28 Mon Sep 17 00:00:00 2001
From: michaelfeil <63565275+michaelfeil@users.noreply.github.com>
Date: Fri, 14 Feb 2025 19:56:14 +0000
Subject: [PATCH] update all models in trt-llm

---
 .../README.md                             |   2 +-
 .../config.yaml                           |   2 +-
 .../README.md                             |   2 +-
 .../config.yaml                           |   2 +-
 .../README.md                             |   2 +-
 .../README.md                             |  11 +-
 .../config.yaml                           |   7 +-
 .../README.md                             |   2 +-
 .../README.md                             |   2 +-
 .../Briton-microsoft-phi-4/README.md      |   5 +-
 .../Briton-microsoft-phi-4/config.yaml    |   5 +-
 .../README.md                             |   2 +-
 .../README.md                             |   2 +-
 .../README.md                             |   6 +-
 .../config.yaml                           |   4 +-
 .../README.md                             |   1 -
 .../README.md                             | 126 ------------------
 .../config.yaml                           |  30 -----
 .../README.md                             |   4 +-
 .../config.yaml                           |   4 +-
 .../templating/deploy_all.py              |   6 +-
 .../templating/generate_templates.py      |  94 ++++++++-----
 .../templating/test_deploy.py             |  54 ++++++++
 text-embeddings-inference/README.md       |   1 -
 24 files changed, 145 insertions(+), 231 deletions(-)
 delete mode 100644 11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/README.md
 delete mode 100644 11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/templating/test_deploy.py

diff --git a/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/README.md b/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/README.md
index 60a3676..bbbf214 100644
--- a/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/README.md
@@ -92,7 +92,7 @@ model_name: BEI-ncbi-medcpt-cross-encoder-reranker-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 10Gi
   use_gpu: true
diff --git a/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/config.yaml b/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/config.yaml
index 861f2d1..ddb5b45 100644
--- a/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker/config.yaml
@@ -8,7 +8,7 @@ model_name: BEI-ncbi-medcpt-cross-encoder-reranker-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 10Gi
   use_gpu: true
diff --git a/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/README.md
index 50b7934..3bb7f51 100644
--- a/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/README.md
@@ -105,7 +105,7 @@ trt_llm:
       source: HF
     max_num_tokens: 131072
     max_seq_len: 1000001
-    num_builder_gpus: 2
+    num_builder_gpus: 4
     quantization_type: fp8
 ```
diff --git a/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/config.yaml
index 8efe759..ec22240 100644
--- a/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/BEI-skywork-skywork-reward-llama-3.1-8b-v0.2-reward-model-fp8/config.yaml
@@ -24,5 +24,5 @@ trt_llm:
       source: HF
     max_num_tokens: 131072
     max_seq_len: 1000001
-    num_builder_gpus: 2
+    num_builder_gpus: 4
     quantization_type: fp8
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-405b/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-405b/README.md
index 8cf5417..b6b2bf6 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-405b/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-405b/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/README.md
index 9ad739a..a5fe008 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
@@ -129,7 +129,7 @@ print(completion.choices[0].message.tool_calls)
 
 ## Config.yaml
 
-By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16.
+By default, the following configuration is used for this deployment.
 
 ```yaml
 build_commands: []
@@ -162,11 +162,8 @@ trt_llm:
       repo: meta-llama/Llama-3.2-3B-Instruct
       revision: main
       source: HF
-    max_seq_len: 131072
-    num_builder_gpus: 4
-    plugin_configuration:
-      use_fp8_context_fmha: true
-    quantization_type: fp8_kv
+    max_seq_len: 4096
+    quantization_type: no_quant
     tensor_parallel_count: 1
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/config.yaml
index b74f8c3..7ff6941 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct/config.yaml
@@ -28,11 +28,8 @@ trt_llm:
       repo: meta-llama/Llama-3.2-3B-Instruct
       revision: main
       source: HF
-    max_seq_len: 131072
-    num_builder_gpus: 4
-    plugin_configuration:
-      use_fp8_context_fmha: true
-    quantization_type: fp8_kv
+    max_seq_len: 4096
+    quantization_type: no_quant
     tensor_parallel_count: 1
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp2/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp2/README.md
index 5771d9f..da26c49 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp2/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp2/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct/README.md
index d4fc59e..0aa8192 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/README.md
index 86bf7f3..fe354b2 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/README.md
@@ -149,7 +149,7 @@ model_name: Briton-microsoft-phi-4-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -163,11 +163,10 @@ trt_llm:
       revision: main
       source: HF
     max_seq_len: 16384
-    num_builder_gpus: 4
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 1
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/config.yaml
index c38c390..4d744a5 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4/config.yaml
@@ -15,7 +15,7 @@ model_name: Briton-microsoft-phi-4-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -29,10 +29,9 @@ trt_llm:
       revision: main
       source: HF
     max_seq_len: 16384
-    num_builder_gpus: 4
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 1
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3/README.md
index bc86618..51eee9f 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-small-24b-instruct-2501/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-small-24b-instruct-2501/README.md
index a8a198a..dfc1644 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-small-24b-instruct-2501/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-small-24b-instruct-2501/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/README.md
index 7501482..9a4b043 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/README.md
@@ -27,7 +27,7 @@ Before deployment:
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+
 
 First, clone this repository:
 ```sh
@@ -149,7 +149,7 @@ model_name: Briton-tiiuae-falcon3-10b-instruct-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4:4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -166,7 +166,7 @@ trt_llm:
     plugin_configuration:
      use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 4
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/config.yaml
index 2131ea0..13619c2 100644
--- a/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-tiiuae-falcon3-10b-instruct/config.yaml
@@ -15,7 +15,7 @@ model_name: Briton-tiiuae-falcon3-10b-instruct-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4:4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -32,6 +32,6 @@ trt_llm:
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 4
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
diff --git a/11-embeddings-reranker-classification-tensorrt/README.md b/11-embeddings-reranker-classification-tensorrt/README.md
index 3637cc3..260aea4 100644
--- a/11-embeddings-reranker-classification-tensorrt/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/README.md
@@ -35,7 +35,6 @@ You can find the following deployments in this repository:
 - [sentence-transformers/all-MiniLM-L6-v2-embedding-TEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding)
 
 ## Reranker Deployments:
-- [Alibaba-NLP/gte-multilingual-reranker-base-TEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base)
 - [BAAI/bge-reranker-large-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-baai-bge-reranker-large)
 - [BAAI/bge-reranker-v2-m3-multilingual-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-baai-bge-reranker-v2-m3-multilingual)
 - [ncbi/MedCPT-Cross-Encoder-reranker-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker)
diff --git a/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/README.md b/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/README.md
deleted file mode 100644
index a1bd64d..0000000
--- a/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/README.md
+++ /dev/null
@@ -1,126 +0,0 @@
-# Huggingface's text-embeddings-inference with Alibaba-NLP/gte-multilingual-reranker-base
-
-This is a Deployment for Huggingface's text-embeddings-inference with Alibaba-NLP/gte-multilingual-reranker-base. TEI is huggingface's solution for (text) embeddings, reranking models and prediction models.
-
-Supported models are tagged here: https://huggingface.co/models?other=text-embeddings-inference&sort=trending
-
-
-For TEI you have to perform a manual selection of the Docker Image. We have mirrored the following images:
-```
-CPU baseten/text-embeddings-inference-mirror:cpu-1.6
-Turing (T4, ...) baseten/text-embeddings-inference-mirror:turing-1.6
-Ampere 80 (A100, A30) baseten/text-embeddings-inference-mirror:1.6
-Ampere 86 (A10, A10G, A40, ...) baseten/text-embeddings-inference-mirror:86-1.6
-Ada Lovelace (L4, ...) baseten/text-embeddings-inference-mirror:89-1.6
-Hopper (H100/H100 40GB/H200) baseten/text-embeddings-inference-mirror:hopper-1.6
-```
-
-
-# Examples:
-This deployment is specifically designed for the Hugging Face model [Alibaba-NLP/gte-multilingual-reranker-base](https://huggingface.co/Alibaba-NLP/gte-multilingual-reranker-base).
-Suitable models can be identified by the `ForSequenceClassification` suffix in the model name. Reranker models may have at most one label, which contains the score of the reranking.
-
-Alibaba-NLP/gte-multilingual-reranker-base is a reranker model, used to re-rank a list of items, given a query. \nIt is frequently used in search engines, recommendation systems, and more.
-
-
-## Deployment with Truss
-
-Before deployment:
-
-1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
-2. Install the latest version of Truss: `pip install --upgrade truss`
-
-
-First, clone this repository:
-```sh
-git clone https://github.com/basetenlabs/truss-examples.git
-cd 11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base
-```
-
-With `11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted.
-
-```sh
-truss push --publish
-# prints:
-# ✨ Model TEI-alibaba-nlp-gte-multilingual-reranker-base-truss-example was successfully pushed ✨
-# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx
-```
-
-## Call your model
-
-### API-Schema:
-POST-Route: `https://model-xxxxxx.api.baseten.co/environments/production/sync/rerank`:
-```json
-{
-  "query": "What is Baseten?",
-  "raw_scores": false,
-  "return_text": false,
-  "texts": [
-    "Deep Learning is ...", "Baseten is a fast inference provider"
-  ],
-  "truncate": false,
-  "truncation_direction": "right"
-}
-```
-
-Returns:
-```json
-[
-  {
-    "index": 0,
-    "score": 1,
-    "text": "Deep Learning is ..."
-  }
-]
-```
-The OpenAPI.json is available under https://model-xxxxxx.api.baseten.co/environments/production/sync/openapi.json for more details.
-
-#### Advanced:
-You may also use Baseten's async jobs API, which returns a request_id, which you can use to query the status of the job and get the results.
-
-POST-Route: `https://model-xxxxxx.api.baseten.co/environments/production/async/rerank`
-Read more about [Baseten's Async API here](https://docs.baseten.co/invoke/async)
-
-### OpenAI compatible client library
-OpenAI.com does not have a rerank endpoint, therefore no client library is available.
-
-
-## Config.yaml
-By default, the following configuration is used for this deployment.
-
-```yaml
-base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
-build_commands:
-- 'git clone https://huggingface.co/Alibaba-NLP/gte-multilingual-reranker-base /data/local-model
-  # optional step to download the weights of the model into the image, otherwise specify
-  the --model-id Alibaba-NLP/gte-multilingual-reranker-base directly `start_command`'
-docker_server:
-  liveness_endpoint: /health
-  predict_endpoint: /rerank
-  readiness_endpoint: /health
-  server_port: 7997
-  start_command: text-embeddings-router --port 7997 --model-id /data/local-model --max-client-batch-size
-    128 --max-concurrent-requests 40 --max-batch-tokens 16384
-environment_variables: {}
-external_package_dirs: []
-model_metadata:
-  example_model_input:
-    input: This redirects to the embedding endpoint. Use the /sync API to reach /rerank
-model_name: TEI-alibaba-nlp-gte-multilingual-reranker-base-truss-example
-python_version: py39
-requirements: []
-resources:
-  accelerator: L4
-  cpu: '1'
-  memory: 2Gi
-  use_gpu: true
-runtime:
-  predict_concurrency: 40
-secrets: {}
-system_packages: []
-
-```
-
-## Support
-If you have any questions or need assistance, please open an issue in this repository or contact our support team.
diff --git a/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/config.yaml b/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/config.yaml
deleted file mode 100644
index 45d4db7..0000000
--- a/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base/config.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
-build_commands:
-- 'git clone https://huggingface.co/Alibaba-NLP/gte-multilingual-reranker-base /data/local-model
-  # optional step to download the weights of the model into the image, otherwise specify
-  the --model-id Alibaba-NLP/gte-multilingual-reranker-base directly `start_command`'
-docker_server:
-  liveness_endpoint: /health
-  predict_endpoint: /rerank
-  readiness_endpoint: /health
-  server_port: 7997
-  start_command: text-embeddings-router --port 7997 --model-id /data/local-model --max-client-batch-size
-    128 --max-concurrent-requests 40 --max-batch-tokens 16384
-environment_variables: {}
-external_package_dirs: []
-model_metadata:
-  example_model_input:
-    input: This redirects to the embedding endpoint. Use the /sync API to reach /rerank
-model_name: TEI-alibaba-nlp-gte-multilingual-reranker-base-truss-example
-python_version: py39
-requirements: []
-resources:
-  accelerator: L4
-  cpu: '1'
-  memory: 2Gi
-  use_gpu: true
-runtime:
-  predict_concurrency: 40
-secrets: {}
-system_packages: []
diff --git a/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/README.md b/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/README.md
index c857c23..bdc7fe8 100644
--- a/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/README.md
+++ b/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/README.md
@@ -131,7 +131,7 @@ By default, the following configuration is used for this deployment.
 
 ```yaml
 base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
+  image: baseten/text-embeddings-inference-mirror:86-1.6
 build_commands:
 - 'git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 /data/local-model
   # optional step to download the weights of the model into the image, otherwise specify
@@ -154,7 +154,7 @@ model_name: TEI-sentence-transformers-all-minilm-l6-v2-embedding-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 2Gi
   use_gpu: true
diff --git a/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/config.yaml b/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/config.yaml
index d0a2fb4..77d2380 100644
--- a/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/config.yaml
+++ b/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding/config.yaml
@@ -1,5 +1,5 @@
 base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
+  image: baseten/text-embeddings-inference-mirror:86-1.6
 build_commands:
 - 'git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 /data/local-model
   # optional step to download the weights of the model into the image, otherwise specify
@@ -22,7 +22,7 @@ model_name: TEI-sentence-transformers-all-minilm-l6-v2-embedding-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 2Gi
   use_gpu: true
diff --git a/11-embeddings-reranker-classification-tensorrt/templating/deploy_all.py b/11-embeddings-reranker-classification-tensorrt/templating/deploy_all.py
index 0e1fdc2..845f70b 100644
--- a/11-embeddings-reranker-classification-tensorrt/templating/deploy_all.py
+++ b/11-embeddings-reranker-classification-tensorrt/templating/deploy_all.py
@@ -44,7 +44,11 @@ def wrapper(*args, **kwargs):
 
 
 def matches_name(model: dict, key: str = "name") -> bool:
-    return model[key].endswith("truss-example") and FILTER in model[key]
+    return (
+        model[key].endswith("truss-example")
+        and FILTER in model[key]
+        and ("405b" not in model[key])
+    )
 
 
 @retry(max_retries=3, delay=2)
diff --git a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py
index d7f472e..991df4d 100644
--- a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py
+++ b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py
@@ -1,7 +1,9 @@
 from dataclasses import field
+from functools import cached_property
 from pathlib import Path
 from typing import Any, Optional
 
+import requests
 from pydantic import dataclasses
 from transformers import AutoConfig
 from truss.base.trt_llm_config import (
@@ -64,6 +66,13 @@ def make_truss_config(self, dp: "Deployment") -> TrussConfig:
         max_position_embeddings = hf_cfg.max_position_embeddings
         max_num_tokens = max(16384, max_position_embeddings)
 
+        num_builder_gpus = 1
+        if dp.accelerator in [Accelerator.H100]:
+            num_builder_gpus = 2
+        elif dp.accelerator in [Accelerator.L4]:
+            num_builder_gpus = 4
+
         return TrussConfig(
             model_metadata=dp.task.model_metadata,
             trt_llm=TRTLLMConfiguration(
@@ -80,11 +89,7 @@ def make_truss_config(self, dp: "Deployment") -> TrussConfig:
                     {
                         "quantization_type": TrussTRTLLMQuantizationType.FP8,
                         # give more resources / cpu ram + vram on build if the model uses a non-MIG GPU
-                        "num_builder_gpus": (
-                            2
-                            if dp.accelerator in [Accelerator.H100, Accelerator.L4]
-                            else 1
-                        ),
+                        "num_builder_gpus": num_builder_gpus,
                     }
                     if dp.is_fp8
                     else {}
@@ -192,7 +197,21 @@ def make_truss_config(self, dp):
         )  # make sure model is available
         max_position_embeddings = hf_cfg.max_position_embeddings
         assert self.trt_config is not None
-        self.trt_config.build.max_seq_len = max(max_position_embeddings, 512)
+        self.trt_config.build.max_seq_len = max_position_embeddings
+        assert max_position_embeddings >= 512, "Model needs to have at least 512 tokens"
+        if (
+            dp.accelerator in [Accelerator.L4, Accelerator.A10G]
+            and self.trt_config.build.tensor_parallel_count == 1
+        ):
+            # limit the context length on single small GPUs, as it's hard to tune
+            self.trt_config.build.max_seq_len = min(
+                self.trt_config.build.max_seq_len, 4096
+            )
+        secrets = {}
+        if dp.is_gated:
+            # fix: pass the access token through
+            # TODO: remove the need for the token at runtime
+            secrets["hf_access_token"] = None
 
         return TrussConfig(
             model_metadata=dp.task.model_metadata,
@@ -530,7 +549,6 @@ class Deployment:
     accelerator: Accelerator
     task: Task
     solution: Solution
-    is_gated: bool = False
     is_fp8: bool = False
 
     def __init__(self, *args, **kwargs):
@@ -540,16 +558,26 @@ def __init__(self, *args, **kwargs):
             "fp8" in self.solution.trt_config.build.quantization_type.value
         )
 
-        try:
-            AutoConfig.from_pretrained(self.hf_model_id, token="invalid")
-            self.is_gated = False
-        except Exception:
-            try:
-                # has only access with permissions
-                AutoConfig.from_pretrained(self.hf_model_id)
-                self.is_gated = True
-            except Exception:
-                raise
+    @cached_property
+    def hf_config(self):
+        return AutoConfig.from_pretrained(self.hf_model_id, trust_remote_code=True)
+
+    @cached_property
+    def is_gated(self):
+        # make sure the model is available via AutoConfig
+        assert self.hf_config is not None
+
+        # Fetch config.json without credentials; a 401 means the repo is gated.
+        url = f"https://huggingface.co/{self.hf_model_id}/resolve/main/config.json"
+
+        response = requests.get(url)
+        if response.status_code == 200:
+            return False
+        elif response.status_code == 401:
+            return True
+        else:
+            raise ValueError(f"Received HTTP status code: {response.status_code}")
 
     @property
     def folder_name(self):
@@ -571,9 +599,10 @@ def generate_bei_deployment(dp: Deployment):
     folder_relative_path = SUBFOLDER / dp.folder_name
     full_folder_path = root / folder_relative_path
 
-    is_gated = (
+    is_gated_notice = (
         "Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). "
-        "Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`."
+        "Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. "
+        "Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store."
         if dp.is_gated
         else ""
     )
@@ -620,7 +649,7 @@ def generate_bei_deployment(dp: Deployment):
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-{is_gated}
+{is_gated_notice}
 
 First, clone this repository:
 ```sh
@@ -759,7 +788,7 @@ def generate_bei_deployment(dp: Deployment):
     Deployment(
         "ncbi/MedCPT-Cross-Encoder-reranker",
         "ncbi/MedCPT-Cross-Encoder",
-        Accelerator.L4,
+        Accelerator.A10G,
         Reranker(),
         solution=BEI(),
     ),
@@ -815,7 +844,7 @@ def generate_bei_deployment(dp: Deployment):
     Deployment(
         # name="sentence-transformers/all-MiniLM-L6-v2-embedding",
         hf_model_id="sentence-transformers/all-MiniLM-L6-v2",
-        accelerator=Accelerator.L4,
+        accelerator=Accelerator.A10G,
         task=Embedder(),
         solution=HFTEI(),
     ),
@@ -840,13 +869,6 @@ def generate_bei_deployment(dp: Deployment):
         task=Embedder(),
         solution=HFTEI(),
     ),
-    Deployment(  #
-        name="Alibaba-NLP/gte-multilingual-reranker-base",
-        hf_model_id="Alibaba-NLP/gte-multilingual-reranker-base",
-        accelerator=Accelerator.L4,
-        task=Reranker(),
-        solution=HFTEI(),
-    ),
 ]
@@ -904,7 +926,6 @@ def llamalike_config(
                 repoid="meta-llama/Llama-3.3-70B-Instruct", tp=2
             )
         ),
-        is_gated=True,
     ),
     # meta-llama/Llama-3.1-405B tp8
     Deployment(
         "meta-llama/Llama-3.2-3B-Instruct",
         "meta-llama/Llama-3.2-3B-Instruct",
         Accelerator.L4,
         TextGen(),
         solution=Briton(
-            trt_config=llamalike_config(repoid="meta-llama/Llama-3.2-3B-Instruct")
+            trt_config=llamalike_config(
+                repoid="meta-llama/Llama-3.2-3B-Instruct",
+                tp=1,
+                quant=TrussTRTLLMQuantizationType.NO_QUANT,
+            )
         ),
     ),
     Deployment(
@@ -923,7 +948,6 @@
         solution=Briton(
             trt_config=llamalike_config(repoid="meta-llama/Llama-3.1-405B", tp=8)
         ),
-        is_gated=True,
     ),
     Deployment(
         "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
@@ -971,14 +995,13 @@
                 repoid="mistralai/Mistral-Small-24B-Instruct-2501"
             )
         ),
-        is_gated=True,
     ),
     # unsloth/phi-4
     Deployment(
         "microsoft/phi-4",
         "unsloth/phi-4",
         Accelerator.L4,
         TextGen(),
-        solution=Briton(trt_config=llamalike_config(repoid="unsloth/phi-4")),
+        solution=Briton(trt_config=llamalike_config(repoid="unsloth/phi-4", tp=2)),
     ),
     Deployment(
         "tiiuae/Falcon3-10B-Instruct",
         "tiiuae/Falcon3-10B-Instruct",
         Accelerator.L4,
         TextGen(),
         solution=Briton(
-            trt_config=llamalike_config(repoid="tiiuae/Falcon3-10B-Instruct", tp=4)
+            trt_config=llamalike_config(repoid="tiiuae/Falcon3-10B-Instruct", tp=2)
         ),
-        is_gated=True,
     ),
     Deployment(
         "mistralai/Mistral-7B-Instruct-v0.3",
diff --git a/11-embeddings-reranker-classification-tensorrt/templating/test_deploy.py b/11-embeddings-reranker-classification-tensorrt/templating/test_deploy.py
new file mode 100644
index 0000000..573f8bf
--- /dev/null
+++ b/11-embeddings-reranker-classification-tensorrt/templating/test_deploy.py
@@ -0,0 +1,54 @@
+def test_deploy(deploy_id: str = "03ykpnkw"):
+    import os
+
+    from openai import OpenAI
+
+    client = OpenAI(
+        api_key=os.environ["BASETEN_API_KEY"],
+        base_url=f"https://model-{deploy_id}.api.baseten.co/environments/production/sync/v1",
+    )
+
+    # Default completion
+    response_completion = client.completions.create(
+        model="not_required",
+        prompt="Q: Tell me everything about Baseten.co! A:",
+        temperature=0.3,
+        max_tokens=100,
+    )
+    assert "baseten" in response_completion.choices[0].text.lower()
+
+    # Chat completion
+    response_chat = client.chat.completions.create(
+        model="",
+        messages=[{"role": "user", "content": "Tell me everything about Baseten.co!"}],
+        temperature=0.3,
+        max_tokens=100,
+    )
+    assert "baseten" in response_chat.choices[0].message.content.lower()
+    # Structured output
+    from pydantic import BaseModel
+
+    class CalendarEvent(BaseModel):
+        name: str
+        date: str
+        participants: list[str]
+
+    completion = client.beta.chat.completions.parse(
+        model="not_required",
+        messages=[
+            {"role": "system", "content": "Extract the event information."},
+            {
+                "role": "user",
+                "content": "Alice and Bob are going to a science fair on Friday.",
+            },
+        ],
+        response_format=CalendarEvent,
+    )
+
+    event = completion.choices[0].message.parsed
+    assert "science" in event.name.lower()
+    print(f"✅ All tests passed for deployment {deploy_id}")
+
+
+if __name__ == "__main__":
+    test_deploy()
diff --git a/text-embeddings-inference/README.md b/text-embeddings-inference/README.md
index 37526a6..a52303f 100644
--- a/text-embeddings-inference/README.md
+++ b/text-embeddings-inference/README.md
@@ -1,7 +1,6 @@
 # Notice
 This section has moved to [jina-ai/jina-embeddings-v2-base-en-TEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt) with an overview of fast embeddings.
 
-
 # Text Embeddings Inference Truss
 This is a Trussless Custom Server example to deploy [text-embeddings-inference](https://github.com/huggingface/text-embeddings-inference), a high-performance server that serves text embedding, reranking, and classification models as an API.