Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds AOT #1701

Merged
merged 4 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions trl/trainer/dpo_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ class DPOConfig(TrainingArguments):
The robust DPO label smoothing parameter from the [cDPO](https://ericmitchell.ai/cdpo.pdf) report and [Robust DPO](https://arxiv.org/abs/2403.00409) paper that should be between 0 and 0.5.
loss_type (`str`, defaults to `"sigmoid"`):
The type of DPO loss to use. Either `"sigmoid"` the default DPO loss,`"hinge"` loss from [SLiC](https://arxiv.org/abs/2305.10425) paper, `"ipo"` from [IPO](https://arxiv.org/abs/2310.12036) paper,
`"kto_pair"` from the HALOs [report](https://github.com/ContextualAI/HALOs/blob/main/assets/report.pdf), `"bco_pair"` from [BCO](https://arxiv.org/abs/2404.04656) paper or `"robust"` from [Robust DPO](https://arxiv.org/abs/2403.00409) paper.
`"kto_pair"` from the HALOs [report](https://github.com/ContextualAI/HALOs/blob/main/assets/report.pdf), `"bco_pair"` from [BCO](https://arxiv.org/abs/2404.04656) paper, `"robust"` from [Robust DPO](https://arxiv.org/abs/2403.00409) paper,
or `"aot"` and `"aot_pair"` from the alignment via optimal transport (AOT) paper.
label_pad_token_id (`int`, defaults to `-100`):
The label pad token id. This argument is required if you want to use the default data collator.
padding_value (`int`, defaults to `0`):
Expand Down Expand Up @@ -78,7 +79,7 @@ class DPOConfig(TrainingArguments):
beta: float = 0.1
label_smoothing: float = 0
loss_type: Literal[
"sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "sppo_hard", "nca_pair", "robust"
"sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "sppo_hard", "nca_pair", "robust", "aot", "aot_pair"
] = "sigmoid"
label_pad_token_id: int = -100
padding_value: int = 0
Expand Down
30 changes: 29 additions & 1 deletion trl/trainer/dpo_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def __init__(
ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None,
beta: float = 0.1,
label_smoothing: float = 0,
loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "robust"] = "sigmoid",
loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "robust", "aot", "aot_pair"] = "sigmoid",
args: Optional[DPOConfig] = None,
data_collator: Optional[DataCollator] = None,
label_pad_token_id: int = -100,
Expand Down Expand Up @@ -1066,6 +1066,34 @@ def dpo_loss(
- 0.5 * F.logsigmoid(-chosen_rewards)
- 0.5 * F.logsigmoid(-rejected_rewards)
)
elif self.loss_type == "aot_pair":
chosen_logratios = policy_chosen_logps - reference_chosen_logps
rejected_logratios = policy_rejected_logps - reference_rejected_logps

chosen_logratios_sorted, _ = torch.sort(chosen_logratios, dim=0)
rejected_logratios_sorted, _ = torch.sort(rejected_logratios, dim=0)

delta = chosen_logratios_sorted - rejected_logratios_sorted

losses = (
-F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
- F.logsigmoid(-self.beta * delta) * self.label_smoothing
)

elif self.loss_type == "aot":
pi_logratios = policy_chosen_logps - policy_rejected_logps
ref_logratios = reference_chosen_logps - reference_rejected_logps

pi_logratios_sorted, _ = torch.sort(pi_logratios, dim=0)
ref_logratios_sorted, _ = torch.sort(ref_logratios, dim=0)

delta = pi_logratios_sorted - ref_logratios_sorted

losses = (
-F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
- F.logsigmoid(-self.beta * delta) * self.label_smoothing
)

else:
raise ValueError(
f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair', 'bco_pair', 'sppo_hard', 'nca_pair', 'robust']"
Expand Down
Loading