↔️ GRPO: Set max_model_len when initializing vLLM instance (#2728)
* Set max_model_len when initializing vLLM instance

* Introduce vllm_max_model_len arg

* Replace vllm args with vllm_init_kwargs

* Update docstring

* Add missing import

* Remove default values from newly deprecated args

* Docs update

* Reverted to adding single arg for max_model_len

* Remove spurious import

* Remove spurious line

* style

---------

Co-authored-by: Quentin Gallouédec <[email protected]>
mirceapricop and qgallouedec authored Feb 5, 2025
1 parent af4ad47 commit 78c5ce2
Showing 2 changed files with 13 additions and 0 deletions.
12 changes: 12 additions & 0 deletions trl/trainer/grpo_config.py
@@ -73,6 +73,10 @@ class GRPOConfig(TrainingArguments):
vllm_dtype (`str`, *optional*, defaults to `"auto"`):
Data type to use for vLLM generation. If set to `"auto"`, the data type will be automatically determined
based on the model configuration. Find the supported values in the vLLM documentation.
vllm_max_model_len (`int` or `None`, *optional*, defaults to `None`):
If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced
`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model
context size, which might be much larger than the KV cache, leading to inefficiencies.
> Parameters that control the training
@@ -181,6 +185,14 @@ class GRPOConfig(TrainingArguments):
"determined based on the model configuration. Find the supported values in the vLLM documentation."
},
)
vllm_max_model_len: Optional[int] = field(
default=None,
metadata={
"help": "If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced "
"`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model "
"context size, which might be much larger than the KV cache, leading to inefficiencies."
},
)

# Parameters that control the training
learning_rate: float = field(
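For context, a minimal usage sketch of the new option (the model name, reward function, dataset, and values below are placeholders for illustration, not recommendations):

from trl import GRPOConfig, GRPOTrainer

training_args = GRPOConfig(
    output_dir="grpo-output",
    use_vllm=True,
    vllm_gpu_memory_utilization=0.3,  # leave most of the GPU memory for training
    vllm_max_model_len=2048,  # cap the vLLM context so it fits the smaller KV cache
)
trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",  # placeholder model
    reward_funcs=my_reward_func,  # placeholder reward function
    args=training_args,
    train_dataset=my_dataset,  # placeholder dataset
)
trainer.train()

Leaving vllm_max_model_len unset keeps the previous behaviour: vLLM sizes its context to the model's full context length.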
1 change: 1 addition & 0 deletions trl/trainer/grpo_trainer.py
@@ -322,6 +322,7 @@ def data_collator(features): # No data collation is needed in GRPO
# directly reuse the KV cache if it shares the same prefix with one of the existing queries.
# This is particularly useful here because we generate completions from the same prompts.
enable_prefix_caching=True,
max_model_len=self.args.vllm_max_model_len,
)
self.sampling_params = SamplingParams(
n=self.num_generations,
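For reference, a standalone sketch of the vLLM initialization this change feeds into (the model name and values are placeholders); passing max_model_len=None preserves the old behaviour, while an integer caps the context length:

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2-0.5B-Instruct",  # placeholder model
    gpu_memory_utilization=0.3,
    enable_prefix_caching=True,  # reuse the KV cache across shared prompt prefixes
    max_model_len=2048,  # None (the default) falls back to the model's context size
)
sampling_params = SamplingParams(n=8, temperature=0.7, max_tokens=128)
outputs = llm.generate(["Write a haiku about GPUs."], sampling_params)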
