📏 Log completion length in GRPO (#2659)
qgallouedec authored Jan 25, 2025
1 parent 807046b commit 4720656
Showing 2 changed files with 4 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/source/grpo_trainer.md
@@ -110,6 +110,7 @@ In TRL though, as in the original paper, we only do one update per generation, s
 
 The GRPO Trainer logs the following metrics:
 
+- `completion_length`: The average completion length.
 - `reward/{reward_func_name}`: The reward computed by each reward function.
 - `reward`: The average reward.
 - `reward_std`: The average standard deviation within reward groups.
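
To make the new metric concrete, here is a minimal sketch (not part of the commit) of what `completion_length` measures, assuming a 0/1 `completion_mask` of shape `(num_completions, max_completion_length)` that marks non-padding completion tokens; the toy values are invented:

```python
import torch

# Toy mask for two completions: lengths 3 and 5 (1 = real token, 0 = padding).
completion_mask = torch.tensor(
    [
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1],
    ]
)

# `completion_length` is the mean number of unmasked tokens per completion.
completion_length = completion_mask.sum(dim=1).float().mean().item()
print(completion_length)  # 4.0
```
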
3 changes: 3 additions & 0 deletions trl/trainer/grpo_trainer.py
@@ -410,6 +410,9 @@ def get_per_token_logps(model, input_ids):
         loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
 
         # Log the metrics
+        completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item()
+        self._metrics["completion_length"].append(completion_length)
+
         reward_per_func = self.accelerator.gather_for_metrics(rewards_per_func).mean(0)
         for i, reward_func in enumerate(self.reward_funcs):
             if isinstance(reward_func, PreTrainedModel):
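
For context, here is a runnable single-process sketch of the logging pattern the hunk above adds. The `Accelerator.gather_for_metrics` call is real `accelerate` API; `_metrics` is a simplified stand-in for the trainer's internal buffer, which gets averaged and cleared when the trainer logs:

```python
from collections import defaultdict

import torch
from accelerate import Accelerator

accelerator = Accelerator()
_metrics = defaultdict(list)  # stand-in for the trainer's self._metrics buffer

# Toy mask: two completions of lengths 2 and 3 on this process.
completion_mask = torch.tensor([[1, 1, 0], [1, 1, 1]])

# Gather per-completion lengths from every process, then average over the
# full (possibly distributed) batch; on one process this is the local mean.
lengths = accelerator.gather_for_metrics(completion_mask.sum(1))
_metrics["completion_length"].append(lengths.float().mean().item())

print(_metrics["completion_length"])  # [2.5] when run on a single process
```

Under `accelerate launch` with several processes, the gathered tensor concatenates every process's lengths, so the logged value reflects the whole batch rather than one shard.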
