// Default config file.
// Set values here, then override them in custom configs by including this at
// the top:
// my_experiment.conf:
// include "defaults.conf"
//
// exp_name = my_expt
// run_name = run1
// pretrain_tasks = "mnli,qnli"
//
// ... or in a command line flag:
// $ python main.py --config_file config/defaults.conf \
// --overrides "exp_name = my_expt, run_name = run1, pretrain_tasks = \"mnli,qnli\""
// This file uses HOCON, which is a JSON/YAML-like format that supports
// includes, references, and object merging semantics; see
// https://github.com/lightbend/config/blob/master/HOCON.md for reference.
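//
// HOCON references and object merging also appear within this file itself (e.g.
// dropout_embs = ${dropout}, and edges-spr1 = ${edges-tmpl} { val_interval = 100 } near the bottom).
// A custom config can layer further overrides on top of the include, for instance
// (an illustrative sketch, not a recommended setup):
//
//   include "defaults.conf"
//   exp_name = my_expt
//   run_name = run1
//   dropout = 0.3              // dropout_embs follows automatically via ${dropout}
//   s2s { n_layers_dec = 2 }   // merges into (rather than replaces) the s2s block below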
// Misc. Logistics //
cuda = 0 // GPU ID. Set to -1 for CPU. On machines without GPUs, this is ignored.
random_seed = 1234 // Global random seed.
track_batch_utilization = 0 // Track % of each batch that is padding tokens (for tasks with field 'input1').
// Paths and Logging //
// You'll generally have to override these:
project_dir = ${NFS_PROJECT_PREFIX} // The base directory for model output. This default is meant for runs
// on the JSALT Google Cloud Platform (GCP) setup, and you'll need to
// override it elsewhere.
exp_name = common-indexed-tasks // Experiment name, will be a subdirectory of project_dir.
// This directory will contain all run directories, a results summary file,
// and preprocessed data files.
run_name = tuning-0 // Run name, will be a subdirectory of exp_name. This directory will contain logs,
// checkpoints, and model predictions.
data_dir = ${JIANT_DATA_DIR} // Base directory in which to look for raw data subdirectories. This will often be
// the glue_data directory created when using download_glue_data.py.
global_ro_exp_dir = "/nfs/jsalt/share/exp/default" // If you're using very large datasets for which preprocessing
// is slow, you can set this to point to a directory for a past experiment
// (different from the current one), and the 'preproc' index files will
// be read from that directory to save time. If this directory does not
// exist, all data will be preprocessed as usual without failing.
remote_log_name = ${exp_name}"__"${run_name} // Log name for GCP remote logging, if used. This should be globally unique to
// your run. Usually safe to ignore.
// You can safely ignore these, and they'll be set automatically:
exp_dir = ${project_dir}"/"${exp_name}"/"
run_dir = ${project_dir}"/"${exp_name}"/"${run_name}
local_log_path = ${run_dir}"/log.log"
// Tasks //
pretrain_tasks = sst // Comma-separated list of pretraining tasks or 'glue' or 'none'.
// If there are multiple entries, the list should contain no spaces, and quotation marks are needed.
// ex: --overrides "pretrain_tasks = \"glue,ccg\""
target_tasks = glue // Target tasks, in the same format as pretrain_tasks, above.
// Execution, Saving, and Loading //
// Three main stages of operation
do_pretrain = 1 // Train the shared sentence encoder model (and task-specific model parameters) on the pretraining tasks in
// pretrain_tasks.
do_target_task_training = 1 // After do_pretrain, train the task-specific model parameters on the target tasks in target_tasks.
do_full_eval = 1 // Evaluate the model on the tasks in target_tasks.
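// For example (an illustrative sketch): to skip pretraining and run only target-task training
// and the final evaluation from an already-trained checkpoint, a custom config might set:
//   do_pretrain = 0
//   do_target_task_training = 1
//   do_full_eval = 1
//   load_eval_checkpoint = "/path/to/model_state.th"  // hypothetical path; see load_eval_checkpoint below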
// Related configuration
load_model = 1 // If true, restore from checkpoint when starting do_pretrain. No impact on do_target_task_training.
load_eval_checkpoint = none // If not "none", load the specified model_state checkpoint file when starting do_target_task_training.
allow_untrained_encoder_parameters = 0 // Set for experiments involving random untrained encoders only. Allows do_target_task_training
// and do_full_eval to proceed without pretraining.
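// For example (an illustrative sketch), a random untrained-encoder baseline might combine:
//   do_pretrain = 0
//   allow_untrained_encoder_parameters = 1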
allow_reuse_of_pretraining_parameters = 0 // Set to 1 to allow task models that were trained during pretraining to be
// reused in do_target_task_training. This may behave incorrectly if a run is stopped and
// restarted in do_target_task_training (issues #285, #290).
allow_missing_task_map = 0 // Dangerous: If true, ignore a missing classifier_task_map.json.
// This is needed for bare-ELMo probing, since the main training phase is skipped for these models.
reload_tasks = 0 // If true, force the rebuilding of the task files in the experiment directory, even if they exist.
reload_indexing = 0 // If true, force the rebuilding of the index files in preproc/ for tasks in reindex_tasks, even if they
// exist.
eval_data_fraction = 1 // Use only the specified fraction of the training data in the do_target_task_training phase.
// Should not impact main training, even for the same task.
// Note: training_data_fraction and eval_data_fraction should not be used together;
// at most one of them should be set below 1 at a time.
reindex_tasks = "" // Comma-separated list of tasks whose preproc/ index files should be rebuilt when reload_indexing is set.
reload_vocab = 0 // If true, force the rebuilding of the vocabulary files in the experiment directory. For classification and
// regression tasks with the default ELMo-style character handling, there is no vocabulary.
is_probing_task = 0 // Set if the tasks in target_tasks are NLI-style probing tasks. (Ellie: Is this right?)
// Learning curves
training_data_fraction = 1 // Use only the specified fraction of the training data in the do_pretrain phase.
// Should not impact eval-phase training, even for the same task.
// Note: This uses rejection sampling at training time, so it can slow down training
// for small fractions (<5%).
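// For example (an illustrative sketch): one point on a pretraining learning curve might set
//   training_data_fraction = 0.1
// while a target-task learning curve would instead set
//   eval_data_fraction = 0.1
// As noted above, only one of the two should be below 1 in a given run.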
// Training options //
// Optimization
trainer_type = sampling // Type of trainer object. Currently only one option: 'sampling'
shared_optimizer = 1 // If true, use same optimizer for all tasks. (Setting this to false may not be bug-free.)
batch_size = 32 // Training batch size.
optimizer = adam // Optimizer. All valid AllenNLP options are available, including 'sgd'.
// 'adam' uses the newer AMSGrad variant.
lr = 0.0001 // Initial learning rate.
min_lr = 0.000001 // Minimum learning rate. Training will stop when our explicit LR decay lowers the LR below this point.
max_grad_norm = 5.0 // Maximum gradient norm, for use in clipping.
lr_patience = 1 // [Uninformative name alert!] Patience to use (in validation checks) before decaying the learning rate.
// Learning rate will decay after lr_patience + 1 validation checks have completed with no improvement
// in validation score.
lr_decay_factor = 0.5 // Factor by which to decay LR (multiplicative) when lr_patience is reached.
scheduler_threshold = 0.0001 // Threshold used in deciding when to lower learning rate.
warmup = 4000 // Number of warmup steps for custom transformer LR schedule.
// Validation, Checkpointing, and Early Stopping
val_data_limit = 5000 // Maximum number of examples to be used during mid-training validations.
// We use the _first_ N (5000) examples from each dev set. Does not apply to the final validation run at the
// end of main.py that is invoked by do_full_eval.
val_interval = 1000 // Number of steps between validation checks.
// Must be divisible by bpp_base (which is usually 1).
max_vals = 1000 // Maximum number of validation checks. Will stop once this limit has been reached.
bpp_base = 1 // In multitask learning, number of steps to train each task before sampling a fresh task.
patience = 5 // Patience in early stopping. Training will stop if performance does not improve at all in patience + 1 validations.
keep_all_checkpoints = 0 // If set, keep checkpoints from every validation. Otherwise, keep only best and (if different) most recent.
// Multi-task Training
weighting_method = proportional // Weighting method for task sampling, relative to the number of training examples in each task:
// Options: uniform, power_<power>, softmax_<temp>
// proportional, proportional_log_batch, proportional_log_example,
// inverse, inverse_log_example, inverse_log_batch
scaling_method = uniform // Method for scaling loss:
// Options: uniform, max_power_<power>
// max_proportional, max_proportional_log
// max_inverse, max_inverse_log
// max_epoch_<E1_E2_..._En>
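// For example (illustrative instantiations of the templated names above), a custom config
// might set weighting_method = power_0.75 or weighting_method = softmax_10, and
// scaling_method = max_power_0.75.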
dec_val_scale = 250 // When training with increasing and decreasing val metrics
// (or multiple decreasing metrics), we use the macro average
// for early stopping, where decreasing metrics are aggregated
// as (1 - metric / dec_val_scale).
// Currently, perplexity is our only decreasing metric.
// Evaluation
write_preds = 0 // 0 for none, _or_ a comma-separated list of splits in {'train', 'val', 'test'} for which to write predictions
// to disk during do_full_eval. Supported for GLUE tasks and a few others; expect errors for unsupported tasks.
write_strict_glue_format = 0 // If true, write_preds will only write the 'index' and 'prediction' columns for GLUE tasks, and will
// use the filenames expected by the GLUE evaluation site.
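// For example (an illustrative sketch): to write dev and test predictions in the format
// expected by the GLUE evaluation site, a custom config might set:
//   write_preds = "val,test"
//   write_strict_glue_format = 1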
// Preprocessing //
max_seq_len = 40 // Maximum sequence length, in tokens (usually words, even for models with char handling).
max_word_v_size = 30000 // Maximum input word vocab size, when creating a new embedding matrix. Not used for ELMo.
max_char_v_size = 250 // Maximum input char vocab size, when creating a new embedding matrix. Not used for ELMo.
max_targ_word_v_size = 20000 // Maximum target word vocab size for seq2seq tasks.
// Input Handling //
word_embs = none // The type of word embedding layer. Usually set to none when using ELMo.
// Options: none, scratch (i.e., trained from scratch), glove, fastText.
word_embs_file = ${WORD_EMBS_FILE} // Path to embeddings file.
fastText = 0 // Use dynamically computed fastText embeddings.
fastText_model_file = ${FASTTEXT_MODEL_FILE} // Path to fastText model file for dynamically computed embeddings.
d_word = 300 // Dimension of word embeddings. Not used by ELMo.
d_char = 100 // Dimension of char embeddings. Not used by ELMo.
n_char_filters = 100 // Number of filters in char CNN. Not used by ELMo.
char_filter_sizes = "2,3,4,5" // Size of char CNN filters.
elmo = 1 // If true, load and use ELMo.
elmo_chars_only = 1 // If true, use *only* the char CNN layer of ELMo. If false but elmo is true, use the full ELMo.
elmo_weight_file_path = none // Path to ELMo RNN weights file. Default ELMo weights will be used if "none".
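// For example (an illustrative sketch): to use full ELMo representations rather than only
// its character CNN, a custom config might set:
//   elmo = 1
//   elmo_chars_only = 0
//   word_embs = none  // per the note above, word embeddings are usually disabled with ELMo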
cove = 0 // If true, use CoVe.
cove_fine_tune = 0 // If true, CoVe params are fine-tuned.
char_embs = 0 // If true, train char embeddings. This is separate from the ELMo char component, and the two
// usually aren't used together.
openai_transformer = 0 // If true, use OpenAI Transformer model for
// representations. Not compatible with other
// embeddings due to different tokenization.
openai_transformer_ckpt = "" // If non-empty, will load OpenAI Transformer
// from the given TensorFlow checkpoint.
// Checkpoint should be as created by the
// original release
// (openai/finetune-transformer-lm).
openai_embeddings_mode = "none" // How to handle the embedding layer of the
// OpenAI Transformer model.
// "none" returns only top-layer activation
// "cat" returns top-layer concatenated with
// lexical layer
// "only" returns only lexical layer
openai_transformer_fine_tune = 0 // If false, OpenAI Transformer weights will
// be frozen during training. If enabled,
// train according to global settings for
// training a token embedder module.
// Note that this might require a
// high amount of GPU memory, so consider
// lowering the batch size if set to true.
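// For example (an illustrative sketch): to switch the input representation to the OpenAI
// Transformer, which per the notes above should not be combined with other embeddings:
//   openai_transformer = 1
//   openai_transformer_fine_tune = 1  // optional; may require lowering batch_size
//   elmo = 0
//   word_embs = none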
// Sentence Encoder //
sent_enc = rnn // Type of sentence encoder: 'bow', 'rnn' (for LSTM), or 'transformer'.
// Set to 'null' to do nothing but forward the skip connection.
// Note: The Transformer encoder has no known issues, but it hasn't performed well, so there may be bugs.
// Note: 'bow' just skips the encoder step and passes the word representations to the task model,
// so it is possible to combine attention or max pooling with the 'bow' encoder.
bidirectional = 1 // If true, the RNN encoder is bidirectional.
d_hid = 1024 // Hidden dimension size (usually num_heads * d_proj for transformer)
n_layers_enc = 2 // Number of encoder layers. Usually 8–12 for Transformer.
skip_embs = 1 // If true, concatenate the encoder's input (ELMo or embeddings) with the encoder's output.
sep_embs_for_skip = 0 // Whether the skip embedding uses the same embedder object as the original embedding (before skip).
// Only makes a difference if we are using ELMo weights, where it allows the four tuned ELMo scalars
// to vary separately for each target task.
n_layers_highway = 0 // Number of highway layers between the embedding layer and the encoder. [Old. May not be bug-free.]
n_heads = 8 // Number of transformer heads.
d_tproj = 64 // Transformer projection dimension.
d_ff = 2048 // Transformer feed-forward dimension.
dropout = 0.2 // Dropout rate.
dropout_embs = ${dropout} // Dropout rate for embeddings, same as above by default.
// NB: This only applies to trained char embs, not including ELMo.
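// For example (an illustrative sketch, not a tuned setup): a Transformer encoder run might
// override:
//   sent_enc = transformer
//   n_layers_enc = 8  // per the note above, usually 8-12 for Transformer
//   n_heads = 8
//   d_hid = 1024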
// Task-specific Options //
// These are _usually_ overridden for specific tasks, and are explicitly overridden in this file for many tasks, but defaults
// are set here.
// Model
classifier = mlp // The type of the final layer(s) in classification and regression tasks.
// Options:
// log_reg: Softmax layer with no additional hidden layer.
// mlp: One tanh+layernorm+dropout layer, followed by a softmax layer.
// fancy_mlp: Same as mlp, but with an additional hidden layer. Fancy!
classifier_hid_dim = 512 // The hidden dimension size for mlp and fancy_mlp.
classifier_dropout = 0.2 // The dropout rate for mlp and fancy_mlp.
pair_attn = 1 // If true, use attn in sentence-pair classification/regression tasks.
d_hid_attn = 512 // Post-attention LSTM state size.
shared_pair_attn = 0 // If true, share pair_attn parameters across all tasks that use it.
d_proj = 512 // Size of task-specific linear projection applied before pooling.
classifier_loss_fn = "" // Classifier loss function. Used only in some specialized tasks, not mlp/fancy_mlp.
classifier_span_pooling = "x,y" // Span pooling type (for edge probing only).
// Options: 'attn' or one of the 'combination' arguments accepted by AllenNLP's
// EndpointSpanExtractor.
s2s {
d_hid_dec = 1024 // The hidden size of the decoder in seq2seq tasks.
n_layers_dec = 1 // The number of decoder layers in seq2seq tasks.
target_embedding_dim = 300 // The size of target word embeddings in seq2seq tasks.
attention = "bilinear" // Attention used in s2s. Currently implemented options are "bilinear" and "none".
output_proj_input_dim = 1024 // Dimension of bottleneck layer in s2s decoder output projection. If
// output_proj_input_dim == d_hid_dec, will not add projection.
}
edgeprobe_cnn_context = 0 // Expanded context for edge probing via CNN.
// 0 looks at only the current word; 1 adds +/-1 word
// (kernel width 3), etc.
// Training
eval_val_interval = 500 // Comparable to val_interval, used during do_target_task_training. Can be set separately per task.
// MUST be divisible by bpp_base.
eval_max_vals = 1000 // Comparable to max_vals, used during do_target_task_training. Can be set separately per task.
// Evaluation
use_classifier = "" // Used to make a task (usually a probing task with no training set) use a model that was trained for
// a different task at do_full_eval time. This should be overridden for each probing task, and set to the
// name of the trained task.
// Task-Specific Overrides //
// Note: Model params apply during all phases, but trainer params like LR apply only during the eval phase.
mnli-diagnostic { use_classifier = "mnli" }
rte = {}
rte_classifier_hid_dim = 128
rte_d_proj = 128
rte_classifier_dropout = 0.4
rte_pair_attn = 0
rte_val_interval = 100
rte_lr = 0.0003
wnli = {}
wnli_classifier_hid_dim = 128
wnli_d_proj = 128
wnli_classifier_dropout = 0.4
wnli_pair_attn = 0
wnli_val_interval = 100
wnli_lr = 0.0003
mrpc = {}
mrpc_classifier_hid_dim = 256
mrpc_d_proj = 256
mrpc_classifier_dropout = 0.2
mrpc_pair_attn = 0
mrpc_val_interval = 100
mrpc_lr = 0.0003
sst = {}
sst_classifier_hid_dim = 256
sst_d_proj = 256
sst_classifier_dropout = 0.2
sst_val_interval = 100
sst_lr = 0.0003
cola = {}
cola_classifier_hid_dim = 256
cola_d_proj = 256
cola_classifier_dropout = 0.2
cola_val_interval = 100
cola_lr = 0.0003
sts-b = {}
sts-b_classifier_hid_dim = 512
sts-b_classifier_dropout = 0.2
sts-b_pair_attn = 1
sts-b_val_interval = 1000
sts-b_lr = 0.0003
sts-b-alt = {}
sts-b-alt_classifier_hid_dim = 512
sts-b-alt_classifier_dropout = 0.2
sts-b-alt_pair_attn = 1
sts-b-alt_val_interval = 1000
sts-b-alt_lr = 0.0003
qnli = {}
qnli_classifier_hid_dim = 512
qnli_classifier_dropout = 0.2
qnli_pair_attn = 1
qnli_val_interval = 1000
qnli_lr = 0.0003
qnli-alt = {}
qnli-alt_classifier_hid_dim = 512
qnli-alt_classifier_dropout = 0.2
qnli-alt_pair_attn = 1
qnli-alt_val_interval = 1000
qnli-alt_lr = 0.0003
mnli = {}
mnli_classifier_hid_dim = 512
mnli_classifier_dropout = 0.2
mnli_pair_attn = 1
mnli_val_interval = 1000
mnli_lr = 0.0003
mnli-alt = {}
mnli-alt_classifier_hid_dim = 512
mnli-alt_classifier_dropout = 0.2
mnli-alt_pair_attn = 1
mnli-alt_val_interval = 1000
mnli-alt_lr = 0.0003
qqp = {}
qqp_classifier_hid_dim = 512
qqp_classifier_dropout = 0.2
qqp_pair_attn = 1
qqp_val_interval = 1000
qqp_lr = 0.0003
grounded = {}
grounded_d_proj = 2048
groundedsw = {}
groundedsw_d_proj = 2048
qqp-alt = {}
qqp-alt_classifier_hid_dim = 512
qqp-alt_classifier_dropout = 0.2
qqp-alt_pair_attn = 1
qqp-alt_val_interval = 1000
qqp-alt_lr = 0.0003
nli-prob {
probe_path = ""
}
// Edge-Probing Experiments //
// Template: Not used for any single task, but extended per-task below.
edges-tmpl {
classifier_loss_fn = "sigmoid" // 'sigmoid' or 'softmax'
classifier_span_pooling = "attn" // 'attn' or 'x,y'
classifier_hid_dim = 256
classifier_dropout = 0.3
pair_attn = 0
// Default iters; run 50k steps.
max_vals = 100
val_interval = 500
}
edges-srl-conll2005 = ${edges-tmpl}
edges-srl-conll2012 = ${edges-tmpl} {
val_interval = 1000
}
edges-spr1 = ${edges-tmpl} {
val_interval = 100
}
edges-spr2 = ${edges-tmpl} {
val_interval = 100
}
edges-dpr = ${edges-tmpl} {
val_interval = 100
}
edges-coref-ontonotes = ${edges-tmpl}
edges-coref-ontonotes-conll = ${edges-tmpl} {
max_vals = 250
val_interval = 1000
}
edges-ner-conll2003 = ${edges-tmpl} {
val_interval = 250
}
edges-ner-ontonotes = ${edges-tmpl} {
max_vals = 250
val_interval = 1000
}
edges-dep-labeling = ${edges-tmpl}
edges-dep-labeling-ewt = ${edges-tmpl} {
max_vals = 250
val_interval = 1000
}
edges-constituent-ptb = ${edges-tmpl}
edges-constituent-ontonotes = ${edges-tmpl} {
max_vals = 250
val_interval = 1000
}
edges-pos-ontonotes = ${edges-constituent-ontonotes}
edges-nonterminal-ontonotes = ${edges-constituent-ontonotes}
// These tasks are still very slow. TODO: Debug.
edges-ccg-tag = ${edges-tmpl}
edges-ccg-parse = ${edges-tmpl}
// OpenAI transformer versions.
edges-srl-conll2012-openai = ${edges-srl-conll2012}
edges-spr1-openai = ${edges-spr1}
edges-spr2-openai = ${edges-spr2}
edges-dpr-openai = ${edges-dpr}
edges-coref-ontonotes-conll-openai = ${edges-coref-ontonotes-conll}
edges-ner-ontonotes-openai = ${edges-ner-ontonotes}
edges-dep-labeling-ewt-openai = ${edges-dep-labeling-ewt}
edges-constituent-ontonotes-openai = ${edges-constituent-ontonotes}