online_dpo_config.py
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field
from typing import List, Literal, Optional

from transformers import TrainingArguments


@dataclass
class OnlineDPOConfig(TrainingArguments):
    r"""
    Configuration class for the [`OnlineDPOTrainer`].

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line.
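
    Example (a minimal sketch rather than an official snippet; it assumes the top-level `trl` import path and shows
    how `HfArgumentParser` can populate this config from command-line flags):

    ```python
    from transformers import HfArgumentParser

    from trl import OnlineDPOConfig

    # Turn the dataclass fields into command-line flags, e.g.
    # `python train.py --output_dir online-dpo-demo --learning_rate 5e-7 --max_new_tokens 64`.
    parser = HfArgumentParser(OnlineDPOConfig)
    (training_args,) = parser.parse_args_into_dataclasses()
    ```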

    Parameters:
        learning_rate (`float`, *optional*, defaults to `5e-7`):
            Initial learning rate for [`AdamW`] optimizer. The default value replaces that of
            [`~transformers.TrainingArguments`].
        reward_model_path (`Optional[str]`, *optional*, defaults to `None`):
            Path to the reward model.
        max_new_tokens (`int`, *optional*, defaults to `64`):
            Maximum number of tokens to generate per completion.
        temperature (`float`, *optional*, defaults to `0.9`):
            Temperature for sampling. The higher the temperature, the more random the completions.
        missing_eos_penalty (`Optional[float]`, *optional*, defaults to `None`):
            Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage
            the model to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be
            a positive value.
        beta (`float` or `list[float]`, *optional*, defaults to `0.1`):
            Parameter controlling the deviation from the reference model. A higher β means less deviation from the
            reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
            the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided, a β is selected
            for each new epoch and the last β is used for the remaining epochs.
        loss_type (`str`, *optional*, defaults to `"sigmoid"`):
            Type of loss to use. Possible values are:

            - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
            - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.

        dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
        disable_dropout (`bool`, *optional*, defaults to `True`):
            Whether to disable dropout in the model.
    """

    learning_rate: float = 5e-7
    reward_model_path: Optional[str] = None
    max_new_tokens: int = 64
    temperature: float = 0.9
    missing_eos_penalty: Optional[float] = None
    beta: List[float] = field(default_factory=lambda: [0.1])
    loss_type: Literal["sigmoid", "ipo"] = "sigmoid"
    dataset_num_proc: Optional[int] = None
    disable_dropout: bool = True

    def __post_init__(self):
        super().__post_init__()
        # A one-element beta list behaves like a single float, so collapse it;
        # longer lists are kept as a per-epoch schedule for the trainer.
        if hasattr(self.beta, "__len__") and len(self.beta) == 1:
            self.beta = self.beta[0]
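

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the upstream file): it shows the
# `beta` handling in `__post_init__` above -- a one-element list collapses to a
# float, while a longer list is kept as a per-epoch schedule. The output
# directory name is an arbitrary placeholder.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    args = OnlineDPOConfig(output_dir="online-dpo-demo", beta=[0.1])
    print(args.beta)  # 0.1 -- the single-element list was collapsed to a float

    args = OnlineDPOConfig(output_dir="online-dpo-demo", beta=[0.2, 0.1, 0.05])
    print(args.beta)  # [0.2, 0.1, 0.05] -- kept as a per-epoch schedule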