// Default config file.
// Set values here, then override them in custom configs by including this at
// the top:
// my_experiment.conf:
// include "defaults.conf"
//
// exp_name = my_expt
// run_name = run1
// pretrain_tasks = "mnli,qnli"
//
// ... or in a command line flag:
// $ python main.py --config_file config/defaults.conf \
// --overrides "exp_name = my_expt, run_name = run1, pretrain_tasks = \"mnli,qnli\""
// This file uses HOCON, which is a JSON/YAML-like format that supports
// includes, references, and object merging semantics; see
// https://github.com/lightbend/config/blob/master/HOCON.md for reference.
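//
// HOCON references and object merging also appear within this file itself (e.g.
// dropout_embs = ${dropout}, and edges-spr1 = ${edges-tmpl} { val_interval = 100 } near the bottom).
// A custom config can layer further overrides on top of the include, for instance
// (an illustrative sketch, not a recommended setup):
//
//   include "defaults.conf"
//   exp_name = my_expt
//   run_name = run1
//   dropout = 0.3              // dropout_embs follows automatically via ${dropout}
//   s2s { n_layers_dec = 2 }   // merges into (rather than replaces) the s2s block below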
// Misc. Logistics //
cuda = 0 // GPU ID. Set to -1 for CPU. On machines without GPUs, this is ignored.
random_seed = 1234 // Global random seed.
track_batch_utilization = 0 // Track % of each batch that is padding tokens (for tasks with field 'input1').
// Paths and Logging //
// You'll generally have to override these:
project_dir = ${NFS_PROJECT_PREFIX} // The base directory for model output. This default is meant for runs
// on the JSALT Google Cloud Platform (GCP) setup, and you'll need to
// override it elsewhere.
exp_name = common-indexed-tasks // Experiment name, will be a subdirectory of project_dir.
// This directory will contain all run directories, a results summary file,
// and preprocessed data files.
run_name = tuning-0 // Run name, will be a subdirectory of exp_name. This directory will contain logs,
// checkpoints, and model predictions.
data_dir = ${JIANT_DATA_DIR} // Base directory in which to look for raw data subdirectories. This will often be
// the glue_data directory created when using download_glue_data.py.
global_ro_exp_dir = "/nfs/jsalt/share/exp/default" // If you're using very large datasets for which preprocessing
// is slow, you can set this to point to a directory for a past experiment
// (different from the current one), and the 'preproc' index files will
// be read from that directory to save time. If this directory does not
// exist, all data will be preprocessed as usual without failing.
remote_log_name = ${exp_name}"__"${run_name} // Log name for GCP remote logging, if used. This should be globally unique to
// your run. Usually safe to ignore.
// You can safely ignore these, and they'll be set automatically:
exp_dir = ${project_dir}"/"${exp_name}"/"
run_dir = ${project_dir}"/"${exp_name}"/"${run_name}
local_log_path = ${run_dir}"/log.log"
// Tasks //
pretrain_tasks = sst // Comma-separated list of pretraining tasks or 'glue' or 'none'.
// If there are multiple entries, the list should contain no spaces, and quotation marks are needed.
// ex: --overrides "pretrain_tasks = \"glue,ccg\""
target_tasks = glue // Target tasks, in the same format as pretrain_tasks, above.
// Execution, Saving, and Loading //
// Three main stages of operation
do_pretrain = 1 // Train the shared sentence encoder model (and task-specific model parameters) on the pretraining tasks in
// pretrain_tasks.
do_target_task_training = 1 // After do_pretrain, train the task-specific model parameters on the target tasks in target_tasks.
do_full_eval = 1 // Evaluate the model on the tasks in target_tasks.
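// For example (an illustrative sketch): to skip pretraining and run only target-task training
// and the final evaluation from an already-trained checkpoint, a custom config might set:
//   do_pretrain = 0
//   do_target_task_training = 1
//   do_full_eval = 1
//   load_eval_checkpoint = "/path/to/model_state.th"  // hypothetical path; see load_eval_checkpoint below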
// Related configuration
load_model = 1 // If true, restore from checkpoint when starting do_pretrain. No impact on do_target_task_training.
load_eval_checkpoint = none // If not "none", load the specified model_state checkpoint file when starting do_target_task_training.
allow_untrained_encoder_parameters = 0 // Set for experiments involving random untrained encoders only. Allows do_target_task_training
// and do_full_eval to proceed without pretraining.
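// For example (an illustrative sketch), a random untrained-encoder baseline might combine:
//   do_pretrain = 0
//   allow_untrained_encoder_parameters = 1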
allow_reuse_of_pretraining_parameters = 0 // Set to 1 to allow task models that were trained during pretraining to be
// reused in do_target_task_training. This may behave incorrectly if a run is stopped and
// restarted in do_target_task_training (issues #285, #290).
allow_missing_task_map = 0 // Dangerous: If true, ignore a missing classifier_task_map.json.
// This is needed for bare-ELMo probing, since the main training phase is skipped for these models.
reload_tasks = 0 // If true, force the rebuilding of the task files in the experiment directory, even if they exist.
reload_indexing = 0 // If true, force the rebuilding of the index files in preproc/ for tasks in reindex_tasks, even if they
// exist.
eval_data_fraction = 1 // Use only the specified fraction of the training data in the do_target_task_training phase.
// Should not impact main training, even for the same task.
// Note: training_data_fraction and eval_data_fraction should not be used together;
// at most one of them should be set below 1 at a time.
reindex_tasks = "" // Comma-separated list of tasks whose preproc/ index files should be rebuilt when reload_indexing is set.
reload_vocab = 0 // If true, force the rebuilding of the vocabulary files in the experiment directory. For classification and
// regression tasks with the default ELMo-style character handling, there is no vocabulary.
is_probing_task = 0 // Set if the tasks in target_tasks are NLI-style probing tasks. (Ellie: Is this right?)
// Learning curves
training_data_fraction = 1 // Use only the specified fraction of the training data in the do_pretrain phase.
// Should not impact eval-phase training, even for the same task.
// Note: This uses rejection sampling at training time, so it can slow down training
// for small fractions (<5%).
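// For example (an illustrative sketch): one point on a pretraining learning curve might set
//   training_data_fraction = 0.1
// while a target-task learning curve would instead set
//   eval_data_fraction = 0.1
// As noted above, only one of the two should be below 1 in a given run.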
// Training options //
// Optimization
trainer_type = sampling // Type of trainer object. Currently only one option: 'sampling'
shared_optimizer = 1 // If true, use same optimizer for all tasks. (Setting this to false may not be bug-free.)
batch_size = 32 // Training batch size.
optimizer = adam // Optimizer. All valid AllenNLP options are available, including 'sgd'.
// 'adam' uses the newer AMSGrad variant.
lr = 0.0001 // Initial learning rate.
min_lr = 0.000001 // Minimum learning rate. Training will stop when our explicit LR decay lowers the LR below this point.
max_grad_norm = 5.0 // Maximum gradient norm, for use in clipping.
lr_patience = 1 // [Uninformative name alert!] Patience to use (in validation checks) before decaying the learning rate.
// Learning rate will decay after lr_patience + 1 validation checks have completed with no improvement
// in validation score.
lr_decay_factor = 0.5 // Factor by which to decay LR (multiplicative) when lr_patience is reached.
scheduler_threshold = 0.0001 // Threshold used in deciding when to lower learning rate.
warmup = 4000 // Number of warmup steps for custom transformer LR schedule.
// Validation, Checkpointing, and Early Stopping
val_data_limit = 5000 // Maximum number of examples to be used during mid-training validations.
// We use the _first_ N (5000) examples from each dev set. Does not apply to the final validation run at the
// end of main.py that is invoked by do_full_eval.
val_interval = 1000 // Number of steps between validation checks.
// Must be divisible by bpp_base (which is usually 1).
max_vals = 1000 // Maximum number of validation checks. Will stop once this limit has been reached.
bpp_base = 1 // In multitask learning, number of steps to train each task before sampling a fresh task.
patience = 5 // Patience in early stopping. Training will stop if performance does not improve at all in patience + 1 validations.
keep_all_checkpoints = 0 // If set, keep checkpoints from every validation. Otherwise, keep only best and (if different) most recent.
// Multi-task Training
weighting_method = proportional // Weighting method for task sampling, relative to the number of training examples in each task:
// Options: uniform, power_<power>, softmax_<temp>
// proportional, proportional_log_batch, proportional_log_example,
// inverse, inverse_log_example, inverse_log_batch
scaling_method = uniform // Method for scaling loss:
// Options: uniform, max_power_<power>
// max_proportional, max_proportional_log
// max_inverse, max_inverse_log
// max_epoch_<E1_E2_..._En>
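// For example (illustrative instantiations of the templated names above), a custom config
// might set weighting_method = power_0.75 or weighting_method = softmax_10, and
// scaling_method = max_power_0.75.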
dec_val_scale = 250 // When training with increasing and decreasing val metrics
// (or multiple decreasing metrics), we use the macro average
// for early stopping, where decreasing metrics are aggregated
// as (1 - metric / dec_val_scale).
// Currently, perplexity is our only decreasing metric.
// Evaluation
write_preds = 0 // 0 for none, _or_ a comma-separated list of splits in {'train', 'val', 'test'} for which to write predictions
// to disk during do_full_eval. Supported for GLUE tasks and a few others; expect errors for unsupported tasks.
write_strict_glue_format = 0 // If true, write_preds will only write the 'index' and 'prediction' columns for GLUE tasks, and will
// use the filenames expected by the GLUE evaluation site.
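// For example (an illustrative sketch): to write dev and test predictions in the format
// expected by the GLUE evaluation site, a custom config might set:
//   write_preds = "val,test"
//   write_strict_glue_format = 1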
// Preprocessing //
max_seq_len = 40 // Maximum sequence length, in tokens (usually words, even for models with char handling).
max_word_v_size = 30000 // Maximum input word vocab size, when creating a new embedding matrix. Not used for ELMo.
max_char_v_size = 250 // Maximum input char vocab size, when creating a new embedding matrix. Not used for ELMo.
max_targ_word_v_size = 20000 // Maximum target word vocab size for seq2seq tasks.
// Input Handling //
word_embs = none // The type of word embedding layer. Usually set to none when using ELMo.
// Options: none, scratch (i.e., trained from scratch), glove, fastText.
word_embs_file = ${WORD_EMBS_FILE} // Path to embeddings file.
fastText = 0 // Use dynamically computed fastText embeddings.
fastText_model_file = ${FASTTEXT_MODEL_FILE} // Path to fastText model file for dynamically computed embeddings.
d_word = 300 // Dimension of word embeddings. Not used by ELMo.
d_char = 100 // Dimension of char embeddings. Not used by ELMo.
n_char_filters = 100 // Number of filters in char CNN. Not used by ELMo.
char_filter_sizes = "2,3,4,5" // Size of char CNN filters.
elmo = 1 // If true, load and use ELMo.
elmo_chars_only = 1 // If true, use *only* the char CNN layer of ELMo. If false but elmo is true, use the full ELMo.
elmo_weight_file_path = none // Path to ELMo RNN weights file. Default ELMo weights will be used if "none".
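// For example (an illustrative sketch): to use full ELMo representations rather than only
// its character CNN, a custom config might set:
//   elmo = 1
//   elmo_chars_only = 0
//   word_embs = none  // per the note above, word embeddings are usually disabled with ELMo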
cove = 0 // If true, use CoVe.
cove_fine_tune = 0 // If true, CoVe params are fine-tuned.
char_embs = 0 // If true, train char embeddings. This is separate from the ELMo char component, and the two
// usually aren't used together.
openai_transformer = 0 // If true, use OpenAI Transformer model for
// representations. Not compatible with other
// embeddings due to different tokenization.
openai_transformer_ckpt = "" // If non-empty, will load OpenAI Transformer
// from the given TensorFlow checkpoint.
// Checkpoint should be as created by the
// original release
// (openai/finetune-transformer-lm).
openai_embeddings_mode = "none" // How to handle the embedding layer of the
// OpenAI Transformer model.
// "none" returns only top-layer activation
// "cat" returns top-layer concatenated with
// lexical layer
// "only" returns only lexical layer
openai_transformer_fine_tune = 0 // If false, OpenAI Transformer weights will
// be frozen during training. If enabled,
// train according to global settings for
// training a token embedder module.
// Note that this might require a
// high amount of GPU memory, so consider
// lowering the batch size if set to true.
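// For example (an illustrative sketch): to switch the input representation to the OpenAI
// Transformer, which per the notes above should not be combined with other embeddings:
//   openai_transformer = 1
//   openai_transformer_fine_tune = 1  // optional; may require lowering batch_size
//   elmo = 0
//   word_embs = none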
// Sentence Encoder //
sent_enc = rnn // Type of sentence encoder: 'bow', 'rnn' (for LSTM), or 'transformer'.
// Set to 'null' to do nothing but forward the skip connection.
// Note: The Transformer encoder has no known issues, but it hasn't performed well, so there may be bugs.
// Note: 'bow' just skips the encoder step and passes the word representations to the task model,
// so it is possible to combine attention or max pooling with the 'bow' encoder.
bidirectional = 1 // If true, the RNN encoder is bidirectional.
d_hid = 1024 // Hidden dimension size (usually num_heads * d_proj for transformer)
n_layers_enc = 2 // Number of encoder layers. Usually 8–12 for Transformer.
skip_embs = 1 // If true, concatenate the encoder's input (ELMo or embeddings) with the encoder's output.
sep_embs_for_skip = 0 // Whether the skip embedding uses the same embedder object as the original embedding (before skip).
// Only makes a difference if we are using ELMo weights, where it allows the four tuned ELMo scalars
// to vary separately for each target task.
n_layers_highway = 0 // Number of highway layers between the embedding layer and the encoder. [Old. May not be bug-free.]
n_heads = 8 // Number of transformer heads.
d_tproj = 64 // Transformer projection dimension.
d_ff = 2048 // Transformer feed-forward dimension.
dropout = 0.2 // Dropout rate.
dropout_embs = ${dropout} // Dropout rate for embeddings, same as above by default.
// NB: This only applies to trained char embs, not including ELMo.
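// For example (an illustrative sketch, not a tuned setup): a Transformer encoder run might
// override:
//   sent_enc = transformer
//   n_layers_enc = 8  // per the note above, usually 8-12 for Transformer
//   n_heads = 8
//   d_hid = 1024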
// Task-specific Options //
// These are _usually_ overridden for specific tasks, and are explicitly overridden in this file for many tasks, but defaults
// are set here.
// Model
classifier = mlp // The type of the final layer(s) in classification and regression tasks.
// Options:
// log_reg: Softmax layer with no additional hidden layer.
// mlp: One tanh+layernorm+dropout layer, followed by a softmax layer.
// fancy_mlp: Same as mlp, but with an additional hidden layer. Fancy!
classifier_hid_dim = 512 // The hidden dimension size for mlp and fancy_mlp.
classifier_dropout = 0.2 // The dropout rate for mlp and fancy_mlp.
pair_attn = 1 // If true, use attn in sentence-pair classification/regression tasks.
d_hid_attn = 512 // Post-attention LSTM state size.
shared_pair_attn = 0 // If true, share pair_attn parameters across all tasks that use it.
d_proj = 512 // Size of task-specific linear projection applied before pooling.
classifier_loss_fn = "" // Classifier loss function. Used only in some specialized tasks, not mlp/fancy_mlp.
classifier_span_pooling = "x,y" // Span pooling type (for edge probing only).
// Options: 'attn' or one of the 'combination' arguments accepted by AllenNLP's
// EndpointSpanExtractor.
s2s {
d_hid_dec = 1024 // The hidden size of the decoder in seq2seq tasks.
n_layers_dec = 1 // The number of decoder layers in seq2seq tasks.
target_embedding_dim = 300 // The size of target word embeddings in seq2seq tasks.
attention = "bilinear" // Attention used in s2s. Currently implemented options are "bilinear" and "none".
output_proj_input_dim = 1024 // Dimension of bottleneck layer in s2s decoder output projection. If
// output_proj_input_dim == d_hid_dec, will not add projection.
}
edgeprobe_cnn_context = 0 // Expanded context for edge probing via CNN.
// 0 looks at only the current word; 1 adds +/-1 word
// (kernel width 3), etc.
// Training
eval_val_interval = 500 // Comparable to val_interval, used during do_target_task_training. Can be set separately per task.
// MUST be divisible by bpp_base.
eval_max_vals = 1000 // Comparable to max_vals, used during do_target_task_training. Can be set separately per task.
// Evaluation
use_classifier = "" // Used to make a task (usually a probing task with no training set) use a model that was trained for
// a different task at do_full_eval time. This should be overridden for each probing task, and set to the
// name of the trained task.
// Task-Specific Overrides //
// Note: Model params apply during all phases, but trainer params like LR apply only during the eval phase.
mnli-diagnostic { use_classifier = "mnli" }
rte = {}
rte_classifier_hid_dim = 128
rte_d_proj = 128
rte_classifier_dropout = 0.4
rte_pair_attn = 0
rte_val_interval = 100
rte_lr = 0.0003
wnli = {}
wnli_classifier_hid_dim = 128
wnli_d_proj = 128
wnli_classifier_dropout = 0.4
wnli_pair_attn = 0
wnli_val_interval = 100
wnli_lr = 0.0003
mrpc = {}
mrpc_classifier_hid_dim = 256
mrpc_d_proj = 256
mrpc_classifier_dropout = 0.2
mrpc_pair_attn = 0
mrpc_val_interval = 100
mrpc_lr = 0.0003
sst = {}
sst_classifier_hid_dim = 256
sst_d_proj = 256
sst_classifier_dropout = 0.2
sst_val_interval = 100
sst_lr = 0.0003
cola = {}
cola_classifier_hid_dim = 256
cola_d_proj = 256
cola_classifier_dropout = 0.2
cola_val_interval = 100
cola_lr = 0.0003
sts-b = {}
sts-b_classifier_hid_dim = 512
sts-b_classifier_dropout = 0.2
sts-b_pair_attn = 1
sts-b_val_interval = 1000
sts-b_lr = 0.0003
sts-b-alt = {}
sts-b-alt_classifier_hid_dim = 512
sts-b-alt_classifier_dropout = 0.2
sts-b-alt_pair_attn = 1
sts-b-alt_val_interval = 1000
sts-b-alt_lr = 0.0003
qnli = {}
qnli_classifier_hid_dim = 512
qnli_classifier_dropout = 0.2
qnli_pair_attn = 1
qnli_val_interval = 1000
qnli_lr = 0.0003
qnli-alt = {}
qnli-alt_classifier_hid_dim = 512
qnli-alt_classifier_dropout = 0.2
qnli-alt_pair_attn = 1
qnli-alt_val_interval = 1000
qnli-alt_lr = 0.0003
mnli = {}
mnli_classifier_hid_dim = 512
mnli_classifier_dropout = 0.2
mnli_pair_attn = 1
mnli_val_interval = 1000
mnli_lr = 0.0003
mnli-alt = {}
mnli-alt_classifier_hid_dim = 512
mnli-alt_classifier_dropout = 0.2
mnli-alt_pair_attn = 1
mnli-alt_val_interval = 1000
mnli-alt_lr = 0.0003
qqp = {}
qqp_classifier_hid_dim = 512
qqp_classifier_dropout = 0.2
qqp_pair_attn = 1
qqp_val_interval = 1000
qqp_lr = 0.0003
grounded = {}
grounded_d_proj = 2048
groundedsw = {}
groundedsw_d_proj = 2048
qqp-alt = {}
qqp-alt_classifier_hid_dim = 512
qqp-alt_classifier_dropout = 0.2
qqp-alt_pair_attn = 1
qqp-alt_val_interval = 1000
qqp-alt_lr = 0.0003
nli-prob {
probe_path = ""
}
// Edge-Probing Experiments //
// Template: Not used for any single task, but extended per-task below.
edges-tmpl {
classifier_loss_fn = "sigmoid" // 'sigmoid' or 'softmax'
classifier_span_pooling = "attn" // 'attn' or 'x,y'
classifier_hid_dim = 256
classifier_dropout = 0.3
pair_attn = 0
// Default iters; run 50k steps.
max_vals = 100
val_interval = 500
}
edges-srl-conll2005 = ${edges-tmpl}
edges-srl-conll2012 = ${edges-tmpl} {
val_interval = 1000
}
edges-spr1 = ${edges-tmpl} {
val_interval = 100
}
edges-spr2 = ${edges-tmpl} {
val_interval = 100
}
edges-dpr = ${edges-tmpl} {
val_interval = 100
}
edges-coref-ontonotes = ${edges-tmpl}
edges-coref-ontonotes-conll = ${edges-tmpl} {
max_vals = 250
val_interval = 1000
}
edges-ner-conll2003 = ${edges-tmpl} {
val_interval = 250
}
edges-ner-ontonotes = ${edges-tmpl} {
max_vals = 250
val_interval = 1000
}
edges-dep-labeling = ${edges-tmpl}
edges-dep-labeling-ewt = ${edges-tmpl} {
max_vals = 250
val_interval = 1000
}
edges-constituent-ptb = ${edges-tmpl}
edges-constituent-ontonotes = ${edges-tmpl} {
max_vals = 250
val_interval = 1000
}
edges-pos-ontonotes = ${edges-constituent-ontonotes}
edges-nonterminal-ontonotes = ${edges-constituent-ontonotes}
// These tasks are still very slow. TODO: Debug.
edges-ccg-tag = ${edges-tmpl}
edges-ccg-parse = ${edges-tmpl}
// OpenAI transformer versions.
edges-srl-conll2012-openai = ${edges-srl-conll2012}
edges-spr1-openai = ${edges-spr1}
edges-spr2-openai = ${edges-spr2}
edges-dpr-openai = ${edges-dpr}
edges-coref-ontonotes-conll-openai = ${edges-coref-ontonotes-conll}
edges-ner-ontonotes-openai = ${edges-ner-ontonotes}
edges-dep-labeling-ewt-openai = ${edges-dep-labeling-ewt}
edges-constituent-ontonotes-openai = ${edges-constituent-ontonotes}