update all models in trt-llm
michaelfeil committed Feb 14, 2025
1 parent 0f861e8 commit eed1098
Showing 24 changed files with 145 additions and 231 deletions.
@@ -92,7 +92,7 @@ model_name: BEI-ncbi-medcpt-cross-encoder-reranker-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -8,7 +8,7 @@ model_name: BEI-ncbi-medcpt-cross-encoder-reranker-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -105,7 +105,7 @@ trt_llm:
       source: HF
     max_num_tokens: 131072
     max_seq_len: 1000001
-    num_builder_gpus: 2
+    num_builder_gpus: 4
     quantization_type: fp8
 
 ```
@@ -24,5 +24,5 @@ trt_llm:
       source: HF
     max_num_tokens: 131072
     max_seq_len: 1000001
-    num_builder_gpus: 2
+    num_builder_gpus: 4
     quantization_type: fp8
@@ -27,7 +27,7 @@ Before deployment:
 
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
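
The secret-store note above can be sketched as the relevant `config.yaml` fragment (a minimal sketch assuming the standard Truss `secrets` mapping; the key name comes from the note itself):

```yaml
# Reference the secret by name only; null is a placeholder.
# The real token is pulled from Baseten's secret store at deploy time.
secrets:
  hf_access_token: null
```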
@@ -27,7 +27,7 @@ Before deployment:
 
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
@@ -129,7 +129,7 @@ print(completion.choices[0].message.tool_calls)
 
 
 ## Config.yaml
-By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16.
+By default, the following configuration is used for this deployment.
 
 ```yaml
 build_commands: []
@@ -162,11 +162,8 @@ trt_llm:
       repo: meta-llama/Llama-3.2-3B-Instruct
       revision: main
       source: HF
-    max_seq_len: 131072
-    num_builder_gpus: 4
-    plugin_configuration:
-      use_fp8_context_fmha: true
-    quantization_type: fp8_kv
+    max_seq_len: 4096
+    quantization_type: no_quant
     tensor_parallel_count: 1
   runtime:
     enable_chunked_context: true
@@ -28,11 +28,8 @@ trt_llm:
       repo: meta-llama/Llama-3.2-3B-Instruct
       revision: main
       source: HF
-    max_seq_len: 131072
-    num_builder_gpus: 4
-    plugin_configuration:
-      use_fp8_context_fmha: true
-    quantization_type: fp8_kv
+    max_seq_len: 4096
+    quantization_type: no_quant
     tensor_parallel_count: 1
   runtime:
     enable_chunked_context: true
@@ -27,7 +27,7 @@ Before deployment:
 
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
@@ -27,7 +27,7 @@ Before deployment:
 
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
@@ -149,7 +149,7 @@ model_name: Briton-microsoft-phi-4-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -163,11 +163,10 @@ trt_llm:
       revision: main
       source: HF
     max_seq_len: 16384
-    num_builder_gpus: 4
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 1
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
 
@@ -15,7 +15,7 @@ model_name: Briton-microsoft-phi-4-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -29,10 +29,9 @@ trt_llm:
       revision: main
       source: HF
     max_seq_len: 16384
-    num_builder_gpus: 4
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 1
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
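
A side note on why `accelerator` and `tensor_parallel_count` move together in the hunks above: the engine is sharded across the GPUs declared in `resources`, so the two fields must agree. A minimal sketch of the paired fields (nesting assumed from the diff context, not a complete config):

```yaml
resources:
  accelerator: L4:2            # two L4 GPUs on one replica
trt_llm:
  build:
    tensor_parallel_count: 2   # shard the engine across both GPUs
```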
@@ -27,7 +27,7 @@ Before deployment:
 
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
-
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
@@ -27,7 +27,7 @@ Before deployment:
 
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of the key in config.yaml; `hf_access_token: null` is fine - the true value will be fetched from the secret store.
 
 First, clone this repository:
 ```sh
@@ -27,7 +27,7 @@ Before deployment:
 
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
-Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_key`.
+
 
 First, clone this repository:
 ```sh
@@ -149,7 +149,7 @@ model_name: Briton-tiiuae-falcon3-10b-instruct-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4:4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -166,7 +166,7 @@ trt_llm:
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 4
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
 
@@ -15,7 +15,7 @@ model_name: Briton-tiiuae-falcon3-10b-instruct-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4:4
+  accelerator: L4:2
   cpu: '1'
   memory: 10Gi
   use_gpu: true
@@ -32,6 +32,6 @@ trt_llm:
     plugin_configuration:
       use_fp8_context_fmha: true
     quantization_type: fp8_kv
-    tensor_parallel_count: 4
+    tensor_parallel_count: 2
   runtime:
     enable_chunked_context: true
11-embeddings-reranker-classification-tensorrt/README.md (1 change: 0 additions & 1 deletion)
@@ -35,7 +35,6 @@ You can find the following deployments in this repository:
 - [sentence-transformers/all-MiniLM-L6-v2-embedding-TEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/TEI-sentence-transformers-all-minilm-l6-v2-embedding)
 
 ## Reranker Deployments:
-- [Alibaba-NLP/gte-multilingual-reranker-base-TEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/TEI-alibaba-nlp-gte-multilingual-reranker-base)
 - [BAAI/bge-reranker-large-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-baai-bge-reranker-large)
 - [BAAI/bge-reranker-v2-m3-multilingual-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-baai-bge-reranker-v2-m3-multilingual)
 - [ncbi/MedCPT-Cross-Encoder-reranker-BEI](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/BEI-ncbi-medcpt-cross-encoder-reranker)

This file was deleted.

This file was deleted.

@@ -131,7 +131,7 @@ By default, the following configuration is used for this deployment.
 
 ```yaml
 base_image:
-  image: baseten/text-embeddings-inference-mirror:89-1.6
+  image: baseten/text-embeddings-inference-mirror:86-1.6
 build_commands:
 - 'git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 /data/local-model
   # optional step to download the weights of the model into the image, otherwise specify
@@ -154,7 +154,7 @@ model_name: TEI-sentence-transformers-all-minilm-l6-v2-embedding-truss-example
 python_version: py39
 requirements: []
 resources:
-  accelerator: L4
+  accelerator: A10G
   cpu: '1'
   memory: 2Gi
   use_gpu: true
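
The image tag and accelerator in the hunk above appear to be coupled: the numeric prefix of the mirror tag looks like a CUDA compute capability, so the base image must move in lockstep with the GPU. This reading is an inference from the tag naming, not stated in the diff:

```yaml
base_image:
  image: baseten/text-embeddings-inference-mirror:86-1.6  # 86 ~ sm_86 (Ampere, A10G); 89 ~ sm_89 (Ada, L4)
resources:
  accelerator: A10G
```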
