Fixing GRPO reward_func being a model with DeepSpeed ZeRO-3 #2984

Open
wants to merge 1 commit into main
31 changes: 30 additions & 1 deletion tests/test_grpo_trainer.py
@@ -20,7 +20,7 @@
from datasets import load_dataset
from parameterized import parameterized
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
from transformers.testing_utils import require_peft, require_torch_accelerator
from transformers.testing_utils import require_deepspeed, require_peft, require_torch_accelerator
from transformers.utils import is_peft_available

from trl import GRPOConfig, GRPOTrainer
@@ -318,6 +318,35 @@ def test_training_peft_with_gradient_checkpointing(self):
else: # Base model parameters should not change
self.assertTrue(torch.equal(param, new_param), f"Base parameter {n} has changed.")

@require_deepspeed
@require_torch_accelerator
def test_training_with_deepspeed_zero3(self):
dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")

with tempfile.TemporaryDirectory() as tmp_dir:
training_args = GRPOConfig(
output_dir=tmp_dir,
learning_rate=0.1,
per_device_train_batch_size=3,
num_generations=3,
max_completion_length=32,
report_to="none",
deepspeed={"train_batch_size": "auto", "zero_optimization": {"stage": 3}},
max_steps=2, # Just need at least one step
)
trainer = GRPOTrainer(
model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
# The reward function is itself a model, which DeepSpeed ZeRO-3
# will also wrap and shard
reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
args=training_args,
train_dataset=dataset,
)

trainer.train()

self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])

def test_training_different_reward_model(self):
# Use a reward model different from the model: different chat template, tokenization, etc.
dataset = load_dataset("trl-internal-testing/zen", "conversational_prompt_only", split="train")
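For reference, a minimal standalone sketch of the same setup outside the test suite (not part of this PR; the output directory name is illustrative, and the model names are the tiny test checkpoints used in the diff above). The deepspeed argument also accepts a path to a JSON config file instead of an inline dict.

# Sketch only: mirrors the new test above as a standalone training script.
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")

training_args = GRPOConfig(
    output_dir="grpo-zero3",  # illustrative output directory
    per_device_train_batch_size=3,
    num_generations=3,
    max_completion_length=32,
    # An inline dict as in the test, or a path to a ZeRO-3 JSON config, both work here.
    deepspeed={"train_batch_size": "auto", "zero_optimization": {"stage": 3}},
    report_to="none",
)

trainer = GRPOTrainer(
    model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
    # Passing a model name makes the reward function a sequence-classification
    # model that ZeRO-3 will also shard -- the case this PR exercises.
    reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
    args=training_args,
    train_dataset=dataset,
)
trainer.train()

Run under a distributed launcher, for example the torchrun command quoted in the review thread below.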
6 changes: 4 additions & 2 deletions trl/trainer/grpo_trainer.py
@@ -829,8 +829,10 @@ def _generate_and_score_completions(
texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False
)
reward_inputs = super()._prepare_inputs(reward_inputs)
with torch.inference_mode():
rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,)
with torch.inference_mode(), unwrap_model_for_generation(
reward_func, self.accelerator
) as unwrapped_reward_func:
Comment on lines +832 to +834
Member
Why do you need to unwrap here? It seems like you're losing the benefit of using DeepSpeed.

Contributor Author
Thanks for the question. Yes, without this unwrap the unit test crashes with the stack trace shown in the PR description.

Try running torchrun --nproc_per_node=1 -m pytest -sv tests/test_grpo_trainer.py::GRPOTrainerTester::test_training_with_deepspeed_zero3 with this PR's unit test, first on main and then with the unwrap applied.

rewards_per_func[:, i] = unwrapped_reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,)
else:
# Repeat all input columns (but "prompt" and "completion") to match the number of generations
keys = [key for key in inputs[0] if key not in ["prompt", "completion"]]
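For context on the fix itself: under ZeRO-3 each rank holds only a shard of every parameter, so a plain forward call on the bare reward module can fail, which is the crash this PR addresses. Below is a rough sketch of the underlying idea using DeepSpeed's own GatheredParameters context manager; it is illustrative only and not the exact code path TRL takes, and the helper name is made up for this sketch.

# Sketch only: temporarily gather the full ZeRO-3 partitioned weights so the
# reward model's scoring forward pass can run as usual; the parameters are
# re-partitioned automatically when the context exits.
import deepspeed
import torch

def score_completions(reward_func, reward_inputs):
    with torch.inference_mode(), deepspeed.zero.GatheredParameters(
        list(reward_func.parameters()), modifier_rank=None
    ):
        return reward_func(**reward_inputs).logits[:, 0]  # Shape (B*G,)

The unwrap_model_for_generation context manager used in the diff plays a similar role here, giving the reward model usable full weights for the duration of the scoring call.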