-
Notifications
You must be signed in to change notification settings - Fork 87
/
Copy patharguments.py
276 lines (262 loc) · 10.3 KB
/
arguments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
from dataclasses import dataclass, field
from typing import List, Optional
@dataclass
class ModelArgs:
    """Model configuration for fine-tuning.

    All fields are plain defaults; annotations make each one a dataclass
    field exactly as ``field(default=...)`` would.
    """

    # Hugging Face hub name or local path of the base checkpoint.
    model_path: str = "syzymon/long_llama_3b_v1_1"
    # dtype name for the memory layers — presumably the long-context cache; confirm against model code.
    mem_dtype: str = "bfloat16"
    last_context_length: int = 1024
    # When True, use the plain torch attention path instead of the default one.
    torch_attention: bool = False
    # dtype name used when loading model weights.
    torch_dtype: str = "float32"
    # Apply gradient checkpointing to every i-th layer (1 = every layer).
    gradient_checkpoint_every_ith: int = 1
@dataclass
class DataArgs:
    """Dataset configuration for instruction and chat tuning.

    Most fields hold one value per dataset, joined by a separator
    (``','`` for field names, ``'<,>'`` for free text). When a single value
    is given without the separator, it is replicated for all datasets.

    Fixes relative to the previous revision:
    * ``data_path``, ``question_field`` and ``response_field`` default to
      ``None``, so they are annotated ``Optional[str]`` (consistent with
      ``data_filter``, ``data_revision`` and ``prompt_field``);
    * typos corrected in the ``post_response_text`` and
      ``chat_replace_rules`` help strings.
    """

    data_type: str = field(
        default="instructions",
        metadata={
            "help": """
            `,` separated list indicating the type of each dataset.
            Available types: 'instructions' and 'chat'.
            Examples:
            * 'instructions'
            * 'instructions,chat' - the first dataset is for instruction tuning
              whereas the second one is for chat tuning.
            """
        },
    )  # instructions or chat
    data_filter: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                """
                '<,>' separated list of '<;>' separated rules of format field_name<M>regex for filtering out the data.
                For example 'lang<M>en<;>conversations.<int>*^.source<M>gpt<,>lang<M>pl'
                will take from the first dataset the records such that their field lang matches the regex en
                and the following property holds: when we take the conversation field and look at all its elements
                then each of them has a field source that matches the regex gpt.
                From the second dataset, it will take the records with field lang matching regex pl.
                Consider another example: '<,>lang<M>^en$<;>conversations.<int>*^.value<M>(?i)^((?!openai).)*$<;>conversations.<int>*^.value<M>^((?!DAN).)*$<;>conversations.<int>0.value<LENLT>8000'.
                Here we do not filter the data coming from the first dataset.
                From the second dataset we take the records such that:
                * field lang is equal to 'en'
                * the conversations mention neither openai nor DAN.
                * the first part of the conversation has at most 8000 chars
                """
            )
        },
    )
    # Annotated Optional[str]: the default is None, and a value is required at runtime.
    data_path: Optional[str] = field(
        default=None,
        metadata={
            "help": """
            Hugging Face dataset(s) name/path; separator ','
            Examples:
            * Open-Orca/OpenOrca
            * 'Open-Orca/OpenOrca,zetavg/ShareGPT-Processed'
            """
        },
    )
    data_revision: Optional[str] = field(
        default=None,
        metadata={
            "help": """
            Revision for each Hugging Face dataset; separator ','
            Examples:
            * 'f0823c7ffc48c9d33a42f16cf0b885fed4a7d0a1'
            * 'f0823c7ffc48c9d33a42f16cf0b885fed4a7d0a1,15968d6dfa02529988da12e382af3ab7c857ebcd'
            """
        },
    )
    dataset_split: str = field(
        default="train",
        metadata={
            "help": """
            Split for each Hugging Face dataset; separator ','
            Examples:
            * 'train'
            * 'train,train'
            """
        },
    )
    # --- instructions ---
    pre_prompt_text: str = field(
        default="",
        metadata={
            "help": """
            Field with pre-prompt text. One for each instruction dataset.
            Separator '<,>'. In case no '<,>' is present value will be replicated
            for all instructions datasets.
            Examples:
            * PROMPT:
            * PROMPT<,>PROMPT:
            """
        },
    )
    prompt_field: Optional[str] = field(
        default=None,
        metadata={
            "help": """
            Field with the prompt. One for each instruction dataset.
            Separator ','. 'None' is interpreted as None. In case no ',' is present value will be replicated
            for all instructions datasets.
            Examples:
            * system_prompt
            * system_prompt,prompt
            * system_prompt,None
            """
        },
    )
    post_prompt_text: str = field(
        default="\n",
        metadata={
            "help": """
            Field with post-prompt text. One for each instruction dataset.
            Separator '<,>'. In case no '<,>' is present value will be replicated
            for all instructions datasets.
            """
        },
    )
    pre_question_text: str = field(
        default="",
        metadata={
            "help": """
            Field with pre-question text. One for each instruction dataset.
            Separator '<,>'. In case no '<,>' is present value will be replicated
            for all instructions datasets.
            """
        },
    )
    # Annotated Optional[str]: the default is None ('None' in the CLI is interpreted as None).
    question_field: Optional[str] = field(
        default=None,
        metadata={
            "help": """
            Field with question. One for each instruction dataset.
            Separator ','. 'None' is interpreted as None. In case no ',' is present value will be replicated
            for all instructions datasets.
            Examples:
            * question
            * question,instruction
            * question,None
            """
        },
    )
    post_question_text: str = field(
        default="\n",
        metadata={
            "help": """
            Field with post-question text. One for each instruction dataset.
            Separator '<,>'. In case no '<,>' is present value will be replicated
            for all instructions datasets.
            """
        },
    )
    pre_response_text: str = field(
        default="",
        metadata={
            "help": """
            Field with pre-response text. One for each instruction dataset.
            Separator '<,>'. In case no '<,>' is present value will be replicated
            for all instructions datasets.
            """
        },
    )
    # Annotated Optional[str]: the default is None ('None' in the CLI is interpreted as None).
    response_field: Optional[str] = field(
        default=None,
        metadata={
            "help": """
            Field with the expected response. One for each instruction dataset.
            Separator ','. 'None' is interpreted as None. In case no ',' is present value will be replicated
            for all instructions datasets.
            Examples:
            * response
            * response,output
            * response,None
            """
        },
    )
    post_response_text: str = field(
        default="",
        metadata={
            "help": """
            Field with post response text. One for each instruction dataset.
            Separator '<,>'. In case no '<,>' is present value will be replicated
            for all instructions datasets.
            """
        },
    )
    # --- chat ---
    chat_conversations_field: str = field(
        default="conversations",
        metadata={
            "help": """
            Name of the field with conversations list. One for each chat dataset.
            Separator ','. 'None' is interpreted as None.
            In case no ',' is present value will be replicated
            for all chat datasets.
            """
        },
    )
    chat_data_field: str = field(
        default="value",
        metadata={
            "help": """
            Name of field with text.
            One for each chat dataset.
            Separator ','. 'None' is interpreted as None.
            In case no ',' is present value will be replicated
            for all chat datasets.
            """
        },
    )
    chat_source_name_field: str = field(
        default="from",
        metadata={
            "help": """Name of field describing the source (human/ai) of the text.
            One for each chat dataset.
            Separator ','. 'None' is interpreted as None.
            In case no ',' is present value will be replicated
            for all chat datasets.
            """
        },
    )
    chat_model_source_name: str = field(
        default="gpt",
        metadata={
            "help": """Name of the text source that should be used to tune the model.
            One for each chat dataset.
            Separator ','. 'None' is interpreted as None.
            In case no ',' is present value will be replicated
            for all chat datasets.
            """
        },
    )
    chat_initial_prompt: str = field(default="You are a helpful ASSISTANT.\n\n")
    chat_replace_rules: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "'<;>' separated list describing pairs of replace regular expressions; "
                "for example, 'a<R>b<;>c<R>d' means first replace text that matches regex 'a' with string 'b' "
                "then do the same for 'c' and 'd'."
            )
        },
    )
    chat_model_response_prefix: str = field(default="\nASSISTANT: ")
    chat_human_response_prefix: str = field(default="\nUSER: ")
    # --- proportions (for mixed dataset) ---
    # default_factory is required for the mutable list default.
    data_proportions: List[float] = field(
        default_factory=lambda: [1.0], metadata={"help": "Space separated probability of sampling (for each dataset)"}
    )
@dataclass
class TokenizationArgs:
    """Tokenization and sequence-length configuration.

    Note: ``max_input_length`` and ``max_output_length`` are only used for
    instructions data (not for chat); ``max_total_length`` is used for both.
    """

    # Max tokens for the prompt/question part (instructions data only).
    max_input_length: int = field(default=2048)
    # Max tokens for the response part (instructions data only).
    max_output_length: int = field(default=2048)
    # Hard cap on the whole tokenized example (instructions and chat).
    max_total_length: int = field(default=2048)
    always_pad: bool = field(default=True, metadata={"help": "Whether to always pad data to max_total_length tokens"})
    random_pad: bool = field(
        default=True,
        metadata={
            # Fixed grammar of the user-facing help text ("Whether add" -> "Whether to add").
            "help": "Whether to add padding tokens to the right only or to sample the amount of left and right padding"
        },
    )