update llama models for long context (#395)
Update most popular llama models with long context engines.

I tested these myself and was able to get ~9k tokens out of each model.
philipkiely-baseten authored Jan 2, 2025
1 parent 29a8b6a commit d7d181d
Showing 3 changed files with 25 additions and 3 deletions.
11 changes: 9 additions & 2 deletions llama/engine-llama-3-1-70b-instruct/config.yaml

@@ -38,8 +38,15 @@ trt_llm:
   checkpoint_repository:
     repo: meta-llama/Llama-3.1-70B-Instruct
     source: HF
-  max_seq_len: 8192
   num_builder_gpus: 4
   quantization_type: fp8_kv
+  max_seq_len: 131072
+  batch_scheduler_policy: max_utilization
+  default_max_tokens: 131072
   tensor_parallel_count: 2
-  enable_chunked_context: false
+  kv_cache_free_gpu_mem_fraction: 0.85
+  enable_chunked_context: true
+  plugin_configuration:
+    use_paged_context_fmha: true
+    use_fp8_context_fmha: true
+    paged_kv_cache: true
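Reassembled from the hunk above, the engine section of the 70B config after this commit would read roughly as follows (a sketch built only from the diff lines shown; keys outside the hunk, and the exact nesting under `trt_llm:`, are not shown here):

```yaml
checkpoint_repository:
  repo: meta-llama/Llama-3.1-70B-Instruct
  source: HF
num_builder_gpus: 4
quantization_type: fp8_kv
max_seq_len: 131072            # raised from 8192 for long context
batch_scheduler_policy: max_utilization
default_max_tokens: 131072
tensor_parallel_count: 2
kv_cache_free_gpu_mem_fraction: 0.85
enable_chunked_context: true   # was false; chunking lets long prompts fit
plugin_configuration:
  use_paged_context_fmha: true
  use_fp8_context_fmha: true   # pairs with the fp8_kv quantization above
  paged_kv_cache: true
```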
7 changes: 7 additions & 0 deletions llama/engine-llama-3-1-8b-instruct/config.yaml

@@ -42,3 +42,10 @@ trt_llm:
   num_builder_gpus: 1
   quantization_type: no_quant
   tensor_parallel_count: 1
+  default_max_tokens: 131072
+  kv_cache_free_gpu_mem_fraction: 0.85
+  enable_chunked_context: true
+  plugin_configuration:
+    use_paged_context_fmha: true
+    use_fp8_context_fmha: false
+    paged_kv_cache: true
10 changes: 9 additions & 1 deletion llama/engine-llama-3-3-70b-instruct/config.yaml

@@ -38,7 +38,15 @@ trt_llm:
   checkpoint_repository:
     repo: meta-llama/Llama-3.3-70B-Instruct
     source: HF
-  max_seq_len: 8192
   num_builder_gpus: 4
   quantization_type: fp8_kv
+  max_seq_len: 131072
+  batch_scheduler_policy: max_utilization
+  default_max_tokens: 131072
   tensor_parallel_count: 2
+  kv_cache_free_gpu_mem_fraction: 0.85
+  enable_chunked_context: true
+  plugin_configuration:
+    use_paged_context_fmha: true
+    use_fp8_context_fmha: true
+    paged_kv_cache: true
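The commit message mentions getting ~9k tokens out of each model. A minimal sketch of how one might build such a request body for an OpenAI-compatible chat completions endpoint follows; the model id and token count here are illustrative assumptions, not values taken from the commit:

```python
import json


def build_long_context_request(prompt: str, max_tokens: int = 9000) -> str:
    """Build a JSON chat-completions body asking for a long completion.

    max_tokens must stay within the engine's default_max_tokens (131072
    in the configs above). The model id is a hypothetical placeholder.
    """
    body = {
        "model": "meta-llama/Llama-3.1-70B-Instruct",  # assumed model id
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
    }
    return json.dumps(body)


# The resulting string would be POSTed to the deployed engine's
# chat completions endpoint (URL omitted; deployment-specific).
payload = build_long_context_request("Summarize this long document ...")
```

Because `default_max_tokens` is now 131072, a single request can in principle ask for far more than the ~9k tokens verified here; 9k is just what was observed in testing.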
