diff --git a/.gitignore b/.gitignore index fb220d169..9964b7635 100644 --- a/.gitignore +++ b/.gitignore @@ -87,4 +87,4 @@ ENV/ .spyderproject # Rope project settings -.ropeproject +.ropeproject \ No newline at end of file diff --git a/example_configs/lm/lstm-test-small-mixed.py b/example_configs/lm/lstm-test-small-mixed.py new file mode 100644 index 000000000..52b54d138 --- /dev/null +++ b/example_configs/lm/lstm-test-small-mixed.py @@ -0,0 +1,131 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import WKTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WikiText-2-raw DATA]" +processed_data_folder = 'wkt2-processed-folder' +base_model = LSTMLM +bptt = 12 +steps = 40 + +base_params = { + "restore_best_checkpoint": True, + "processed_data_folder": processed_data_folder, + "use_horovod": False, + "num_gpus": 2, + + "batch_size_per_gpu": 160, + "num_epochs": 1500, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "logdir": "LSTM-FP32-2GPU-SMALL-MIXED", + "eval_steps": steps * 2, + + "optimizer": "Adam", + "optimizer_params": {}, + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 9e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + # "dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, + "encoder_layers": 2, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.6, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.6, + "recurrent_keep_prob": 0.7, + 'encoder_emb_keep_prob': 0.37, + "encoder_use_skip_connections": False, + "emb_size": 64, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": False, + "average_across_timestep": True, + "do_mask": False, + } +} + +train_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "data_root": data_root, + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "rand_start": True, + "shuffle": False, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "small": True, + }, +} +eval_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "bptt": bptt, + "small": True, + }, +} + +infer_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + 
"shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "seed_tokens": "something The only game", + }, +} diff --git a/example_configs/lm/lstm-test-small.py b/example_configs/lm/lstm-test-small.py new file mode 100644 index 000000000..860f228aa --- /dev/null +++ b/example_configs/lm/lstm-test-small.py @@ -0,0 +1,132 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import WKTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WikiText-2-raw DATA]" +processed_data_folder = 'wkt2-processed-data' + +base_model = LSTMLM +bptt = 12 +steps = 10 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": False, + "num_gpus": 2, + + "batch_size_per_gpu": 160, + "num_epochs": 1500, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "processed_data_folder": processed_data_folder, + "logdir": "LSTM-FP32-2GPU-SMALL", + "eval_steps": steps * 2, + + "optimizer": "Adam", + "optimizer_params": {}, + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 9e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "dtype": tf.float32, + # "dtype": "mixed", + # "loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, + "encoder_layers": 2, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.6, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.6, + "recurrent_keep_prob": 0.7, + 'encoder_emb_keep_prob': 0.37, + "encoder_use_skip_connections": False, + "emb_size": 64, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": False, + "average_across_timestep": True, + "do_mask": False, + } +} + +train_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "data_root": data_root, + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "rand_start": True, + "shuffle": False, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "small": True, + }, +} +eval_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "bptt": bptt, + "small": True, + }, +} + +infer_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + 
"map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "seed_tokens": "something The only game", + }, +} diff --git a/example_configs/lm/lstm-wkt103-mixed.py b/example_configs/lm/lstm-wkt103-mixed.py new file mode 100644 index 000000000..f077dd2f7 --- /dev/null +++ b/example_configs/lm/lstm-wkt103-mixed.py @@ -0,0 +1,128 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import WKTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSampledSequenceLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WikiText-103-raw DATA]" +processed_data_folder = 'wkt103-processed-data' + +base_model = LSTMLM +bptt = 96 +steps = 40 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": True, + "num_gpus": 8, + + "batch_size_per_gpu": 224, + "eval_batch_size_per_gpu": 56, + "num_epochs": 1500, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "logdir": "LSTM-WKT103-MIXED", + "processed_data_folder": processed_data_folder, + "eval_steps": steps * 4, + + "optimizer": "Adam", + "optimizer_params": {}, + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-3 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + # "dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.85, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.85, + "recurrent_keep_prob": 0.7, + 'encoder_emb_keep_prob': 0.8, + "encoder_use_skip_connections": False, + "emb_size": 320, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + "num_sampled": 8192, + }, + + "decoder": FakeDecoder, + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": BasicSampledSequenceLoss, + "loss_params": { + "offset_target_by_one": False, + "average_across_timestep": True, + "do_mask": False, + } +} + +train_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "rand_start": True, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + }, +} +eval_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "bptt": bptt, + }, +} + +infer_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "seed_tokens": "something The only game", + }, +} \ No newline at end of 
file diff --git a/example_configs/lstmlm/lstmlm-test.py b/example_configs/lm/lstm-wkt2-fp32.py similarity index 61% rename from example_configs/lstmlm/lstmlm-test.py rename to example_configs/lm/lstm-wkt2-fp32.py index 187373a44..e80d6d53f 100644 --- a/example_configs/lstmlm/lstmlm-test.py +++ b/example_configs/lm/lstm-wkt2-fp32.py @@ -2,78 +2,68 @@ from open_seq2seq.models import LSTMLM from open_seq2seq.encoders import LMEncoder -# from open_seq2seq.encoders import BidirectionalRNNEncoderWithEmbedding from open_seq2seq.decoders import FakeDecoder -from open_seq2seq.data import LMTextDataLayer, LMTextDataLayerGenerate +from open_seq2seq.data import WKTDataLayer from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell -# from open_seq2seq.losses import CrossEntropyLoss from open_seq2seq.losses import BasicSequenceLoss from open_seq2seq.optimizers.lr_policies import fixed_lr -# from open_seq2seq.data.text2text.text2text import SpecialTextTokens -# from open_seq2seq.optimizers.lr_policies import exp_decay -data_root = "[REPLACE THIS TO THE PATH WITH YOUR WIKITEXT DATA]" +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WikiText-2-raw DATA]" +processed_data_folder = 'wkt2-processed-data' base_model = LSTMLM -bptt = 72 +bptt = 96 steps = 40 base_params = { - "restore_best_checkpoint": True, # best checkpoint is only saved when using train_eval mode - "use_horovod": False, + "restore_best_checkpoint": True, + "use_horovod": True, "num_gpus": 2, - "batch_size_per_gpu": 160, + "batch_size_per_gpu": 160, "num_epochs": 1500, "save_summaries_steps": steps, "print_loss_steps": steps, "print_samples_steps": steps, "save_checkpoint_steps": steps, - "logdir": "LSTM-FP32-2GPU", + "logdir": "LSTM-WKT2-FP32", + "processed_data_folder": processed_data_folder, "eval_steps": steps * 2, - "optimizer": "Adam", # need to change to NT-ASGD + "optimizer": "Adam", "optimizer_params": {}, - # luong10 decay scheme "lr_policy": fixed_lr, "lr_policy_params": { - "learning_rate": 9e-4 + "learning_rate": 4e-4 }, - "summaries": ['learning_rate', 'variables', 'gradients', 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - + # "max_grad_norm": 0.25, "dtype": tf.float32, #"dtype": "mixed", #"automatic_loss_scaling": "Backoff", "encoder": LMEncoder, - # "encoder": BidirectionalRNNEncoderWithEmbedding, - "encoder_params": { # will need to update + "encoder_params": { "initializer": tf.random_uniform_initializer, - "initializer_params": { # need different initializers for embeddings and for weights + "initializer_params": { "minval": -0.1, "maxval": 0.1, }, "core_cell": WeightDropLayerNormBasicLSTMCell, "core_cell_params": { - "num_units": 800, # paper 1150 - "forget_bias": 1.0, - }, - "last_cell_params": { - "num_units": 320, + "num_units": 896, "forget_bias": 1.0, }, "encoder_layers": 3, "encoder_dp_input_keep_prob": 1.0, - "encoder_dp_output_keep_prob": 0.6, # output dropout for middle layer 0.3 + "encoder_dp_output_keep_prob": 0.6, "encoder_last_input_keep_prob": 1.0, - "encoder_last_output_keep_prob": 0.6, # output droput at last layer is 0.4 + "encoder_last_output_keep_prob": 0.6, "recurrent_keep_prob": 0.7, 'encoder_emb_keep_prob': 0.37, "encoder_use_skip_connections": False, - "emb_size": 320, - "vocab_size": 33278, + "emb_size": 256, "num_tokens_gen": 10, "sampling_prob": 0.0, # 0 is always use the ground truth "fc_use_bias": True, @@ -81,14 +71,13 @@ "awd_initializer": False, }, - "decoder": FakeDecoder, # need a new decoder with AR and TAR + "decoder": FakeDecoder, "regularizer": 
tf.contrib.layers.l2_regularizer, "regularizer_params": { - 'scale': 2e-6, # alpha + 'scale': 2e-6, }, - # "loss": CrossEntropyLoss, # will need to write new loss + regularizer "loss": BasicSequenceLoss, "loss_params": { "offset_target_by_one": False, @@ -98,12 +87,12 @@ } train_params = { - "data_layer": LMTextDataLayer, + "data_layer": WKTDataLayer, "data_layer_params": { "data_root": data_root, "pad_vocab_to_eight": False, "rand_start": True, - "shuffle": False, + "shuffle": True, "shuffle_buffer_size": 25000, "repeat": True, "map_parallel_calls": 16, @@ -112,9 +101,8 @@ }, } eval_params = { - "data_layer": LMTextDataLayer, + "data_layer": WKTDataLayer, "data_layer_params": { - # "data_root": data_root, "pad_vocab_to_eight": False, "shuffle": False, "repeat": False, @@ -125,9 +113,8 @@ } infer_params = { - "data_layer": LMTextDataLayer, + "data_layer": WKTDataLayer, "data_layer_params": { - # "data_root": data_root, "pad_vocab_to_eight": False, "shuffle": False, "repeat": False, @@ -137,4 +124,4 @@ "bptt": bptt, "seed_tokens": "something The only game", }, -} +} \ No newline at end of file diff --git a/example_configs/text2text/toy-reversal/nmt-reversal-CR.py b/example_configs/text2text/toy-reversal/nmt-reversal-CR.py index 30abd7ad5..71c439968 100644 --- a/example_configs/text2text/toy-reversal/nmt-reversal-CR.py +++ b/example_configs/text2text/toy-reversal/nmt-reversal-CR.py @@ -162,4 +162,5 @@ "delimiter": " ", "special_tokens_already_in_vocab": False, }, + } diff --git a/example_configs/transfer/imdb-from-scratch.py b/example_configs/transfer/imdb-from-scratch.py new file mode 100644 index 000000000..e59752552 --- /dev/null +++ b/example_configs/transfer/imdb-from-scratch.py @@ -0,0 +1,130 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import IMDBDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR IMDB DATA]" +processed_data_folder = 'imdb-processed-data-wkt2' + +base_model = LSTMLM +max_length = 256 +binary = True +steps = 40 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": False, + "num_gpus": 2, + + "batch_size_per_gpu": 160, + "num_epochs": 1500, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "logdir": "IMDB-START", + "lm_vocab_file": 'wkt2-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "processed_data_folder": processed_data_folder, + "eval_steps": steps * 2, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 9e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"automatic_loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 896, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + 
"encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.6, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.6, + "recurrent_keep_prob": 0.7, + 'encoder_emb_keep_prob': 0.37, + "encoder_use_skip_connections": False, + "emb_size": 256, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + }, +} +eval_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "binary": binary, + "max_length": max_length, + }, +} + +infer_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + }, +} \ No newline at end of file diff --git a/example_configs/transfer/imdb-wkt103.py b/example_configs/transfer/imdb-wkt103.py new file mode 100644 index 000000000..8deb1d351 --- /dev/null +++ b/example_configs/transfer/imdb-wkt103.py @@ -0,0 +1,135 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import IMDBDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR IMDB DATA]" +processed_data_folder = 'imdb-processed-data-wkt103' + +base_model = LSTMLM +max_length = 256 +binary = True +steps = 10 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": False, + "num_gpus": 1, + + "batch_size_per_gpu": 16, + "eval_batch_size_per_gpu": 64, + "num_epochs": 100, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "load_model": "WKT103-CPT", + "logdir": "IMDB-WKT103-EXP1", + "lm_vocab_file": 'wkt103-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "processed_data_folder": processed_data_folder, + "eval_steps": steps, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + 
"num_units": 1024, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.8, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.8, + "recurrent_keep_prob": 1.0, + 'encoder_emb_keep_prob': 0.6, + "encoder_use_skip_connections": False, + "emb_size": 256, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + "get_stats": True, + # "small": True, + }, +} +eval_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "binary": binary, + "max_length": max_length, + # "small": True, + }, +} + +infer_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + }, +} diff --git a/example_configs/transfer/imdb-wkt2.py b/example_configs/transfer/imdb-wkt2.py new file mode 100644 index 000000000..e162bfa01 --- /dev/null +++ b/example_configs/transfer/imdb-wkt2.py @@ -0,0 +1,133 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import IMDBDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR IMDB DATA]" +processed_data_folder = 'imdb-processed-data-wkt2' + +base_model = LSTMLM +max_length = 256 +binary = True +steps = 5 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": False, + "num_gpus": 2, + + "batch_size_per_gpu": 16, + "num_epochs": 25, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "load_model": "AWDLSTM-EXP69", + "logdir": "IMDB-WKT2", + "lm_vocab_file": 'wkt2-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "processed_data_folder": processed_data_folder, + "eval_steps": steps * 2, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-5 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": 
WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 896, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 1.0, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 1.0, + "recurrent_keep_prob": 1.0, + 'encoder_emb_keep_prob': 1.0, + "encoder_use_skip_connections": False, + "emb_size": 256, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-4, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + # "small": True, + }, +} +eval_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "binary": binary, + "max_length": max_length, + # "small": True, + }, +} + +infer_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + }, +} \ No newline at end of file diff --git a/example_configs/transfer/sst-wkt2-small.py b/example_configs/transfer/sst-wkt2-small.py new file mode 100644 index 000000000..c6edf7fe0 --- /dev/null +++ b/example_configs/transfer/sst-wkt2-small.py @@ -0,0 +1,129 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import SSTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSequenceLoss, CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +base_model = LSTMLM +steps = 10 + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR SST DATA]" +processed_data_folder = 'sst-processed-data-wkt2' +binary = True +max_length = 96 + +base_params = { + "restore_best_checkpoint": True, # best checkpoint is only saved when using train_eval mode + "use_horovod": False, + "num_gpus": 1, + + "batch_size_per_gpu": 20, + "eval_batch_size_per_gpu": 80, + "num_epochs": 120, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "load_model": "LSTM-FP32-2GPU-SMALL", + "lm_vocab_file": 'wkt2-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "logdir": "SST-WKT2-SMALL", + "processed_data_folder": processed_data_folder, + "eval_steps": steps, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": 
"Backoff", + "encoder": LMEncoder, + "encoder_params": { # will need to update + "initializer": tf.random_uniform_initializer, + "initializer_params": { # need different initializers for embeddings and for weights + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.8, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.8, + "recurrent_keep_prob": 1.0, + 'encoder_emb_keep_prob': 0.7, + "encoder_use_skip_connections": False, + "emb_size": 64, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + "use_cell_state": True, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": max_length, + "get_stats": True, + }, +} +eval_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": max_length, + }, +} + +infer_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": max_length, + }, +} \ No newline at end of file diff --git a/example_configs/transfer/sst-wkt2.py b/example_configs/transfer/sst-wkt2.py new file mode 100644 index 000000000..a5741aed4 --- /dev/null +++ b/example_configs/transfer/sst-wkt2.py @@ -0,0 +1,128 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import SSTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSequenceLoss, CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +base_model = LSTMLM +steps = 10 + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR SST DATA]" +processed_data_folder = 'sst-processed-data-wkt2' +binary = True +max_length = 96 + +base_params = { + "restore_best_checkpoint": True, # best checkpoint is only saved when using train_eval mode + "use_horovod": False, + "num_gpus": 1, + + "batch_size_per_gpu": 20, + "eval_batch_size_per_gpu": 80, + "num_epochs": 120, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "load_model": "AWDLSTM-EXP69", + "lm_vocab_file": 'wkt2-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "logdir": "SST-WKT2-EXP10", + "processed_data_folder": processed_data_folder, + "eval_steps": steps, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 
'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { # will need to update + "initializer": tf.random_uniform_initializer, + "initializer_params": { # need different initializers for embeddings and for weights + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 896, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.8, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.8, + "recurrent_keep_prob": 1.0, + 'encoder_emb_keep_prob': 0.7, + "encoder_use_skip_connections": False, + "emb_size": 256, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + "use_cell_state": True, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": max_length, + }, +} +eval_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": max_length, + }, +} + +infer_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": max_length, + }, +} \ No newline at end of file diff --git a/open_seq2seq/data/__init__.py b/open_seq2seq/data/__init__.py index 8d6ec1775..3f9450aa6 100644 --- a/open_seq2seq/data/__init__.py +++ b/open_seq2seq/data/__init__.py @@ -2,5 +2,5 @@ from .data_layer import DataLayer from .speech2text.speech2text import Speech2TextDataLayer from .image2label.image2label import ImagenetDataLayer -from .lm.lmdata import LMTextDataLayer, LMTextDataLayerGenerate +from .lm.lmdata import WKTDataLayer, IMDBDataLayer, SSTDataLayer from .text2speech.text2speech import Text2SpeechDataLayer diff --git a/open_seq2seq/data/lm/lmdata.py b/open_seq2seq/data/lm/lmdata.py index e63a4680b..cefffa365 100644 --- a/open_seq2seq/data/lm/lmdata.py +++ b/open_seq2seq/data/lm/lmdata.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017 NVIDIA Corporation +# Copyright (c) 2018 NVIDIA Corporation import random import numpy as np @@ -9,15 +9,21 @@ from open_seq2seq.data.utils import load_pre_existing_vocabulary, pad_vocab_to_eight from open_seq2seq.data.text2text.t2t import _read_and_batch_from_files -from open_seq2seq.data.lm.lmutils import Corpus - -class LMTextDataLayer(DataLayer): +from open_seq2seq.data.lm.lmutils import Dictionary, Corpus, IMDBCorpus, SSTCorpus + +class WKTDataLayer(DataLayer): + ''' + WKTDataLayer does the necessary pre-processing to make the WikiText datasets + ready to be fed into the model. We use the ``word_token`` method + available in the ``nltk`` package. 
+ You can download the datasets here: + https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/ + bptt: backpropagation through time - the length of the sequences used for training + rand_start: whether to start from a random starting index between (0, bptt) + ''' @staticmethod def get_required_params(): return dict(DataLayer.get_required_params(), **{ - # 'content_file': str, - # 'vocab_file': str, - 'shuffle': bool, 'repeat': bool, 'bptt': int, }) @@ -30,7 +36,6 @@ def get_optional_params(): 'small': bool, 'use_targets': bool, 'delimiter': str, - 'target_file': str, 'map_parallel_calls': int, 'prefetch_buffer_size': int, 'pad_lengths_to_eight': bool, @@ -41,21 +46,33 @@ def get_optional_params(): }) def __init__(self, params, model, num_workers=1, worker_id=0): - super(LMTextDataLayer, self).__init__(params, model, + super(WKTDataLayer, self).__init__(params, model, num_workers, worker_id) - self._processed_data_folder = self.params.get('processed_data_folder', 'processed_data') + self._processed_data_folder = self.params.get('processed_data_folder', 'wkt-processed_data') self._data_root = self.params.get('data_root', None) + self.corp = Corpus(self._data_root, self._processed_data_folder) + + seed_tokens = self.params.get('seed_tokens', 'The').split() + + self.end_token = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] + self.params['seed_tokens'] = [self.corp.dictionary.word2idx[seed_token] for seed_token in seed_tokens] + + if self.params['mode'] == 'infer': + self.corp.content = self.params['seed_tokens'] + if self.params['mode'] == 'train': - self._batch_size = self.params['batch_size'] + self.batch_size = self.params['batch_size'] self.corp.content = self.corp.train elif self.params['mode'] == 'eval': - self._batch_size = self.params['batch_size'] + self.batch_size = self.params['batch_size'] self.corp.content = self.corp.valid else: - self._batch_size = 1 - self.corp.content = self.corp.test + if len(self.corp.content) < self.params['batch_size']: + self.batch_size = len(self.corp.content) + else: + self.batch_size = self.params['batch_size'] self.vocab_file = (self._processed_data_folder, 'vocab.txt') self.bptt = self.params['bptt'] @@ -67,44 +84,23 @@ def __init__(self, params, model, num_workers=1, worker_id=0): self._shuffle_buffer_size = self.params.get('shuffle_buffer_size', -1) self._num_workers = num_workers self._worker_id = worker_id - self.params["delimiter"] = self.params.get("delimiter", " ") - self.params["small"] = self.params.get("small", False) + self.delimiter = self.params.get("delimiter", " ") + self._small = self.params.get("small", False) self.start = 0 - if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0): - raise ValueError("If padding to 8 in data layer, then " - "max_length should be multiple of 8") - # load source and target vocabularies to RAM - - seed_tokens = self.params.get('seed_tokens', 'The').split() - - self.params['end_token'] = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] - self.params['seed_tokens'] = [self.corp.dictionary.word2idx[seed_token] for seed_token in seed_tokens] - if self.params["small"]: + if self._small: if self.params['mode'] == 'eval': self.corp.content = self.corp.content[:200] else: self.corp.content = self.corp.content[:9004] - if self.params.get('pad_vocab_to_eight', False): self.corp.content = pad_vocab_to_eight(self.corp.content) - if self.params['mode'] == 'infer': - if len(self.corp.content) > self.bptt: - self.corp.content = 
self.corp.content[-self.bptt:]
-
     self.dataset_size = len(self.corp.content)
-
-    self.params['vocab_size'] = len(self.corp.dictionary.idx2word)
-    self.PAD_ID = self.params['vocab_size']
-    self.PAD = ''
-    self.corp.dictionary.idx2word.append(self.PAD)
-    self.corp.dictionary.word2idx[self.PAD] = self.PAD_ID
-
+    self.vocab_size = len(self.corp.dictionary.idx2word)
     self._input_tensors = {}
-    self._batch_size

   def gen(self):
     while True:
@@ -119,15 +115,17 @@ def gen(self):

   def gen_infer(self):
     while True:
-      yield (self.corp.content, self.corp.content)
-
+      for seed in self.corp.content:
+        yield ([seed], [seed])
+
   def build_graph(self):
     if self.params['mode'] == 'train' or self.params['mode'] == 'eval':
       gen = self.gen
       batch_shape = self.bptt
     else:
       gen = self.gen_infer
-      batch_shape = len(self.corp.content)
+      batch_shape = 1
+
     _src_tgt_dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32),
                               (tf.TensorShape([batch_shape]), tf.TensorShape([batch_shape])))
@@ -149,7 +147,7 @@ def build_graph(self):
     _src_tgt_dataset = _src_tgt_dataset.map(lambda x, y: ((x, tf.size(x)), (y, tf.size(y))),
                               num_parallel_calls=self._map_parallel_calls)
-    self.batched_dataset = _src_tgt_dataset.batch(self._batch_size)
+    self.batched_dataset = _src_tgt_dataset.batch(self.batch_size)

     self._iterator = self.batched_dataset.make_initializable_iterator()
@@ -166,7 +164,7 @@ def build_graph(self):
   def get_size_in_samples(self):
     if self.params['mode'] == 'train' or self.params['mode'] == 'eval':
       return (self.dataset_size - self.start) // self.bptt
-    return 1
+    return len(self.corp.content)

   @property
   def iterator(self):
@@ -176,84 +174,116 @@ def iterator(self):
   def input_tensors(self):
     return self._input_tensors

-class LMTextDataLayerGenerate(DataLayer):
+class TextClassificationDataLayer(DataLayer):
+  '''
+  The base class to process data for text classification tasks.
+  If the data has already been processed, it should load the processed
+  data instead of re-processing it.
+ ''' @staticmethod def get_required_params(): return dict(DataLayer.get_required_params(), **{ - 'vocab_file': str, - 'bptt': int, + 'lm_vocab_file': str, + 'shuffle': bool, + 'repeat': bool, + 'max_length': int, + 'processed_data_folder': str, }) @staticmethod def get_optional_params(): return dict(DataLayer.get_optional_params(), **{ + 'rand_start': bool, + 'small': bool, + 'use_targets': bool, 'delimiter': str, 'map_parallel_calls': int, 'prefetch_buffer_size': int, 'pad_lengths_to_eight': bool, 'pad_vocab_to_eight': bool, - 'seed_file': str, + 'shuffle_buffer_size': int, + 'data_root': str, + 'binary': bool, + 'num_classes': int, + 'get_stats': bool, }) def __init__(self, params, model, num_workers=1, worker_id=0): - super(LMTextDataLayerGenerate, self).__init__(params, model, + super(TextClassificationDataLayer, self).__init__(params, model, num_workers, worker_id) - self._batch_size = 1 - self.vocab_file = self.params['vocab_file'] - self.bptt = self.params['bptt'] + + self._data_root = self.params.get('data_root', None) + self._binary = self.params.get('binary', True) + self._get_stats = self.params.get('get_stats', False) + self._lm_vocab_file = self.params['lm_vocab_file'] + self._map_parallel_calls = self.params.get('map_parallel_calls', 8) self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False) self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', tf.contrib.data.AUTOTUNE) + self._shuffle_buffer_size = self.params.get('shuffle_buffer_size', -1) self._num_workers = num_workers self._worker_id = worker_id - self.params["delimiter"] = self.params.get("delimiter", " ") - self.seed_file = self.params.get("seed_file", None) - - # load source and target vocabularies to RAM - self.corp = Corpus(self.params['vocab_file']) - - if self.seed_file: - self.input_string = open(self.seed_file, 'r').read().strip() - else: - self.input_string = input('Please enter your seed string (case sensitive): ').strip() - - self.corp.content = self.corp.tokenize(self.input_string) - if len(self.corp.content) > self.bptt: - self.corp.content = self.corp.content[-self.bptt:] - - self.dataset_size = len(self.corp.content) - - self.params['vocab_size'] = len(self.corp.dictionary.idx2word) - self.PAD_ID = self.params['vocab_size'] - self.PAD = '' - self.corp.dictionary.idx2word.append(self.PAD) - self.corp.dictionary.word2idx[self.PAD] = self.PAD_ID + self._small = self.params.get("small", False) + self._max_length = self.params['max_length'] + self.delimiter = self.params.get("delimiter", " ") + self.EOS_ID = -1 + self.batch_size = self.params['batch_size'] + if self._pad_lengths_to_eight and not (self._max_length % 8 == 0): + raise ValueError("If padding to 8 in data layer, then " + "max_length should be multiple of 8") self._input_tensors = {} def gen(self): - yield self.corp.content + while True: + for review, raw_rating in self.corp.content: + if len(review) > self._max_length: + review = review[-self._max_length:] + rating = np.zeros(self.num_classes) + rating[raw_rating] = 1 + yield (review, rating) def build_graph(self): - _src_tgt_dataset = tf.data.Dataset.from_generator(self.gen, (tf.int32), - (tf.TensorShape([len(self.corp.content)]))) + _src_tgt_dataset = tf.data.Dataset.from_generator(self.gen, + (tf.int32, tf.int32), + (tf.TensorShape([None]), tf.TensorShape([self.num_classes]))) + + if self._num_workers > 1: + _src_tgt_dataset = _src_tgt_dataset\ + .shard(num_shards=self._num_workers, index=self._worker_id) - _src_tgt_dataset = _src_tgt_dataset.map(lambda x: 
((x, tf.size(x))), + if self.params['shuffle']: + bf_size = self.get_size_in_samples() if self._shuffle_buffer_size == -1 \ + else self._shuffle_buffer_size + _src_tgt_dataset = _src_tgt_dataset.shuffle(buffer_size=bf_size) + + if self.params['repeat']: + _src_tgt_dataset = _src_tgt_dataset.repeat() + + _src_tgt_dataset = _src_tgt_dataset.map(lambda x, y: ((x, tf.size(x)), (y, tf.size(y))), num_parallel_calls=self._map_parallel_calls) - self.batched_dataset = _src_tgt_dataset.batch(self._batch_size) + self.batched_dataset = _src_tgt_dataset.padded_batch( + self.batch_size, + padded_shapes=((tf.TensorShape([None]), + tf.TensorShape([])), + (tf.TensorShape([None]), + tf.TensorShape([]))), + padding_values=( + (self.EOS_ID, 0), + (self.EOS_ID, 0))).prefetch(buffer_size=self._prefetch_buffer_size) self._iterator = self.batched_dataset.make_initializable_iterator() - t1, _ = self.iterator.get_next() - t1 = tf.expand_dims(t1[0], axis=0) - print(t1) - self._input_tensors['source_tensors'] = [t1[0], t1[1]] - print(self._input_tensors) + t1, t2 = self.iterator.get_next() + x, x_length = t1[0], t1[1] + y, y_length = t2[0], t2[1] + self._input_tensors['source_tensors'] = [x, x_length] + self._input_tensors['target_tensors'] = [y, y_length] def get_size_in_samples(self): - return 1 + return self.dataset_size @property def iterator(self): @@ -261,4 +291,71 @@ def iterator(self): @property def input_tensors(self): - return self._input_tensors \ No newline at end of file + return self._input_tensors + +class IMDBDataLayer(TextClassificationDataLayer): + ''' + Data layer to process the raw IMDB data, which can be downloaded here: + http://ai.stanford.edu/~amaas/data/sentiment/ + + ''' + def __init__(self, params, model, num_workers=1, worker_id=0): + super(IMDBDataLayer, self).__init__(params, model, num_workers, worker_id) + self._processed_data_folder = self.params['processed_data_folder'] + + if self._binary: + self.num_classes = 2 + else: + self.num_classes = 10 + + self.corp = IMDBCorpus(self._data_root, + self._processed_data_folder, + self._lm_vocab_file, + self._binary, + get_stats=self._get_stats) + + if self.params['mode'] == 'train': + self.corp.content = self.corp.train + elif self.params['mode'] == 'eval': + self.corp.content = self.corp.valid + else: + self.corp.content = self.corp.test + + if self._small: + if self.params['mode'] == 'eval': + self.corp.content = self.corp.content[:self.batch_size * 2] + else: + self.corp.content = self.corp.content[:self.batch_size * 4] + + self.dataset_size = len(self.corp.content) + self.vocab_size = len(self.corp.dictionary.idx2word) + self.EOS_ID = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] + self.end_token = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] + +class SSTDataLayer(TextClassificationDataLayer): + ''' + Data layer to process the raw SST (Stanford Sentiment Treebank). 
+ Read about the dataset here: + https://nlp.stanford.edu/sentiment/ + Download the preprocessed version that can be used for this DataLayer here: + https://github.com/NVIDIA/sentiment-discovery/tree/master/data/binary_sst + ''' + def __init__(self, params, model, num_workers=1, worker_id=0): + super(SSTDataLayer, self).__init__(params, model, num_workers, worker_id) + self._processed_data_folder = self.params['processed_data_folder'] + self.corp = SSTCorpus(self._data_root, + self._processed_data_folder, + self._lm_vocab_file, + get_stats=self._get_stats) + + if self.params['mode'] == 'train': + self.corp.content = self.corp.train + elif self.params['mode'] == 'eval': + self.corp.content = self.corp.valid + else: + self.corp.content = self.corp.test + self.num_classes = 2 + self.dataset_size = len(self.corp.content) + self.vocab_size = len(self.corp.dictionary.idx2word) + self.EOS_ID = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] + self.end_token = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] \ No newline at end of file diff --git a/open_seq2seq/data/lm/lmutils.py b/open_seq2seq/data/lm/lmutils.py index d29ebfc45..3c278cce3 100644 --- a/open_seq2seq/data/lm/lmutils.py +++ b/open_seq2seq/data/lm/lmutils.py @@ -1,109 +1,494 @@ from collections import Counter +import glob import os import pathlib +import random +import re +import shutil +from nltk.tokenize import word_tokenize import numpy as np +import pandas as pd class Dictionary(object): + ''' + Adapted from salesforce's repo: + https://github.com/salesforce/awd-lstm-lm/blob/master/data.py + ''' + def __init__(self, limit=3, vocab_link=None): # do we need limit? + self.word2idx = {} + self.idx2word = [] + self.counter = Counter() + self.UNK = '' + self.EOS = '' + if vocab_link and os.path.isfile(vocab_link): + self.load_vocab(vocab_link) + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + token_id = self.word2idx[word] + self.counter[token_id] += 1 + return self.word2idx[word] + + def load_vocab(self, vocab_link): + vocab_file = open(vocab_link, 'r') + lines = vocab_file.readlines() + n = int(lines[-1].strip()) + self.idx2word = [0 for _ in range(n)] + for line in lines[:-1]: + parts = line.strip().split('\t') + token_id, word, count = int(parts[0]), parts[1], int(parts[2]) + self.word2idx[word] = token_id + self.idx2word[token_id] = word + self.counter[token_id] = count + if not self.UNK in self.word2idx: + self.add_word(self.UNK) + if not self.EOS in self.word2idx: + self.add_word(self.EOS) + + + def __len__(self): + return len(self.idx2word) + +def check_exist(proc_path): + filenames = ['train.ids', 'valid.ids', 'test.ids'] + paths = [os.path.join(proc_path, name) for name in filenames] + paths.append(proc_path) + for name in paths: + if not os.path.exists(name): + return False + return True + +def list2str(list): + return '\t'.join([str(num) for num in list]) + +def unzip(data): + tmp = [list(t) for t in zip(*data)] + return (tmp[0], tmp[1]) + +class Corpus(object): + def __init__(self, raw_path, proc_path, change_contraction=True, limit=3): + pathlib.Path(proc_path).mkdir(exist_ok=True) + self.limit = limit + self.dictionary = Dictionary(limit) + self.vocab_link = 'vocab.txt' + exists = check_exist(proc_path) + self.change_contraction = change_contraction + + if not exists: + print('Creating corpus from raw data ...') + if raw_path and 'raw' in raw_path: + self._change_names(raw_path) + if not raw_path: + raise 
ValueError("data_root [directory to the original data] must be specified") + self.preprocess(raw_path, proc_path) + self.create_dictionary(proc_path, os.path.join(proc_path, 'train.txt')) + self.dictionary = Dictionary(limit) + self.dictionary.load_vocab(os.path.join(proc_path, self.vocab_link)) + self.train = self.tokenize(proc_path, proc_path, 'train.txt') + self.valid = self.tokenize(proc_path, proc_path, 'valid.txt') + self.test = self.tokenize(proc_path, proc_path, 'test.txt') + else: + self.load_corpus(proc_path) + + def _change_names(self, raw_path): + if os.path.isfile(os.path.join(raw_path, 'wiki.train.raw')): + os.rename(os.path.join(raw_path, 'wiki.train.raw'), os.path.join(raw_path, 'train.txt')) + os.rename(os.path.join(raw_path, 'wiki.valid.raw'), os.path.join(raw_path, 'valid.txt')) + os.rename(os.path.join(raw_path, 'wiki.test.raw'), os.path.join(raw_path, 'test.txt')) + + def preprocess(self, raw_path, proc_path): + for filename in ['train.txt', 'valid.txt', 'test.txt']: + in_ = open(os.path.join(raw_path, filename), 'r') + out = open(os.path.join(proc_path, filename), 'w') + for line in in_: + line = re.sub('@-@', '-', line) + line = re.sub('-', ' - ', line) + line = re.sub('etc .', 'etc.', line) + if self.change_contraction: + line = re.sub("n 't", " n't", line) + tokens = [] + for token in line.split(): + tokens.append(token.strip()) + out.write(' '.join(tokens) + '\n') + + def create_dictionary(self, proc_path, filename): ''' - Adapted from salesforce's repo: - https://github.com/salesforce/awd-lstm-lm/blob/master/data.py + Add words to the dictionary only if it's in the train file ''' - def __init__(self): - self.word2idx = {} - self.idx2word = [] - self.counter = Counter() - self.UNK = '' - self.EOS = '' - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - token_id = self.word2idx[word] - self.counter[token_id] += 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) + self.dictionary.add_word(self.dictionary.UNK) + with open(filename, 'r') as f: + f.readline() + for line in f: + words = line.split() + [self.dictionary.EOS] + for word in words: + self.dictionary.add_word(word) -class Corpus(object): - def __init__(self, raw_path, proc_path): - pathlib.Path(proc_path).mkdir(exist_ok=True) - self.dictionary = Dictionary() - self.vocab_link = 'vocab.txt' - exists = self.check_exist(proc_path) - - if not exists: - print('Creating corpus from raw data ...') - if not raw_path: - raise ValueError("data_root [directory to the original data] must be specified") - self.create_dictionary(proc_path, os.path.join(raw_path, 'train.txt')) - self.train = self.tokenize(raw_path, proc_path, 'train.txt') - self.valid = self.tokenize(raw_path, proc_path, 'valid.txt') - self.test = self.tokenize(raw_path, proc_path, 'test.txt') + with open(os.path.join(proc_path, self.vocab_link), 'w') as f: + f.write('\t'.join(['0', self.dictionary.UNK, '0']) + '\n') + idx = 1 + for token_id, count in self.dictionary.counter.most_common(): + if count < self.limit: + f.write(str(idx) + '\n') + return + f.write('\t'.join([str(idx), + self.dictionary.idx2word[token_id], + str(count)]) + '\n') + idx += 1 + + def tokenize(self, raw_path, proc_path, filename): + unk_id = self.dictionary.word2idx[self.dictionary.UNK] + out = open(os.path.join(proc_path, filename[:-3] + 'ids'), 'w') + with open(os.path.join(raw_path, filename), 'r') as f: + ids = [] + for line in f: + words = line.split() + 
[self.dictionary.EOS]
+        for word in words:
+          ids.append(self.dictionary.word2idx.get(word, unk_id))
+      out.write(list2str(ids))
+    out.close()
+
+    return np.asarray(ids)
+
+  def load_ids(self, filename):
+    ids = open(filename, 'r').read().strip().split('\t')
+    return np.asarray([int(i) for i in ids])
+
+  def list2str(self, list):
+    return '\t'.join([str(num) for num in list])
+
+  def load_corpus(self, proc_path):
+    print('Loading corpus from processed data ...')
+    self.dictionary.load_vocab(os.path.join(proc_path, self.vocab_link))
+    self.train = self.load_ids(os.path.join(proc_path, 'train.ids'))
+    self.valid = self.load_ids(os.path.join(proc_path, 'valid.ids'))
+    self.test = self.load_ids(os.path.join(proc_path, 'test.ids'))
+
+class IMDBCorpus(object):
+  def __init__(self, raw_path, proc_path, lm_vocab_link, binary=True, get_stats=False):
+    exists = check_exist(proc_path)
+    pathlib.Path(proc_path).mkdir(exist_ok=True)
+    self.dictionary = Dictionary(vocab_link=lm_vocab_link)
+    self.binary = binary
+    self.raw_path = raw_path
+    self.proc_path = proc_path
+    self._get_stats = get_stats
+
+    if not exists:
+      print('Creating corpus from raw data ...')
+      if not raw_path:
+        raise ValueError("data_root [directory to the original data] must be specified")
+      self.preprocess()
+    else:
+      self.load_corpus(proc_path)
+
+  def check_oov(self, txt):
+    txt = txt.lower()
+    txt = re.sub('thats', "that's", txt)
+    txt = re.sub('wouldnt', "wouldn't", txt)
+    txt = re.sub('couldnt', "couldn't", txt)
+    txt = re.sub('cant', "can't", txt)
+    txt = re.sub('dont', "don't", txt)
+    txt = re.sub("didnt", "didn't", txt)
+    txt = re.sub("isnt", "isn't", txt)
+    txt = re.sub("wasnt", "wasn't", txt)
+    return word_tokenize(txt)
+
+  def tokenize(self, txt):
+    txt = re.sub('<br />
', ' ', txt) + txt = re.sub('–', ' ', txt) + txt = re.sub('—', ' ', txt) + txt = re.sub('-', ' - ', txt) + txt = re.sub('\.', ' . ', txt) + txt = re.sub('\+', ' + ', txt) + txt = re.sub('\*', ' * ', txt) + txt = re.sub('/', ' / ', txt) + txt = re.sub('`', "'", txt) + txt = re.sub(' ms \.', " ms.", txt) + txt = re.sub('Ms \.', "Ms.", txt) + + words = [] + for token in word_tokenize(txt): + if not token in self.dictionary.word2idx: + if token.startswith("'"): + words.append("'") + token = token[1:] + if not token in self.dictionary.word2idx: + tokens = self.check_oov(token) + words.extend(tokens) + else: + words.append(token) + else: + words.append(token) + + txt = ' '.join(words) + txt = re.sub("''", '"', txt) + txt = re.sub("' '", '"', txt) + txt = re.sub("``", '"', txt) + txt = re.sub('etc \.', 'etc. ', txt) + txt = re.sub(' etc ', ' etc. ', txt) + return txt + + def tokenize_folder(self, mode, token_file, rating_file): + review_outfile = open(token_file, 'w') + rating_outfile = open(rating_file, 'w') + for sent in ['pos', 'neg']: + files = glob.glob(os.path.join(self.raw_path, mode, sent, '*.txt')) + for file in files: + in_file = open(file, 'r') + txt = self.tokenize(in_file.read()) + review_outfile.write(txt + "\n") + if self.binary: + if sent == 'pos': + rating = "1" + else: + rating = "0" + else: + idx = file.rfind("_") + rating = str(int(file[idx + 1:-4]) - 1) + rating_outfile.write(rating + '\n') + in_file.close() + + def txt2ids(self, mode, token_file, rating_file): + if self._get_stats: + import matplotlib + matplotlib.use("TkAgg") + from matplotlib import pyplot as plt + rating_lines = open(rating_file, 'r').readlines() + ratings = [int(line.strip()) for line in rating_lines] + reviews = [] + unk_id = self.dictionary.word2idx[self.dictionary.UNK] + unseen = [] + all_tokens = 0 + all_unseen = 0 + for line in open(token_file, 'r'): + tokens = line.strip().split() + reviews.append([self.dictionary.word2idx.get(token, unk_id) for token in tokens]) + if self._get_stats: + for token in tokens: + all_tokens += 1 + if not token in self.dictionary.word2idx: + unseen.append(token) + all_unseen += 1 + + if self._get_stats: + counter = Counter(unseen) + + out = open(os.path.join(self.proc_path, mode + '_unseen.txt'), 'w') + for key, count in counter.most_common(): + out.write(key + '\t' + str(count) + '\n') + + lengths = np.asarray([len(review) for review in reviews]) + stat_file = open(os.path.join(self.proc_path, 'statistics.txt'), 'w') + stat_file.write(mode + '\n') + short_lengths = [l for l in lengths if l <= 256] + stat_file.write('\t'.join(['Min', 'Max', 'Mean', 'Median', 'STD', 'Total', '<=256']) + '\n') + stats = [np.min(lengths), np.max(lengths), np.mean(lengths), np.median(lengths), np.std(lengths), len(lengths), len(short_lengths)] + stat_file.write('\t'.join([str(t) for t in stats]) + '\n') + stat_file.write('Total {} unseen out of {} all tokens. Probability {}.\n'. 
+ format(all_unseen, all_tokens, all_unseen / all_tokens)) + plt.hist(lengths, bins=20) + plt.savefig(os.path.join(self.proc_path, mode + '_hist.png')) + plt.hist(short_lengths, bins=20) + plt.savefig(os.path.join(self.proc_path, mode + '_short_hist.png')) + + return list(zip(reviews, ratings)) + + def preprocess_folder(self, mode): + token_file = os.path.join(self.proc_path, mode + '.tok') + rating_file = os.path.join(self.proc_path, mode + '.inter.rat') + self.tokenize_folder(mode, token_file, rating_file) + return self.txt2ids(mode, token_file, rating_file) + + def partition(self, data, val_count=1000): + random.shuffle(data) + return data[val_count:], data[:val_count] + + def ids2file(self): + for mode in ['train', 'valid', 'test']: + data = getattr(self, mode) + review_out = open(os.path.join(self.proc_path, mode + '.ids'), 'w') + rating_out = open(os.path.join(self.proc_path, mode + '.rat'), 'w') + for review, rating in data: + review_out.write(list2str(review) + '\n') + rating_out.write(str(rating) + '\n') + + def preprocess(self): + os.makedirs(self.proc_path, exist_ok=True) + train = self.preprocess_folder('train') + self.train, self.valid = self.partition(train) + self.test = self.preprocess_folder('test') + self.ids2file() + + def load_ids(self, mode): + review_lines = open(os.path.join(self.proc_path, mode + '.ids')).readlines() + rating_lines = open(os.path.join(self.proc_path, mode + '.rat')).readlines() + ratings = [int(line.strip()) for line in rating_lines] + reviews = [[int(i) for i in line.strip().split('\t')] for line in review_lines] + return list(zip(reviews, ratings)) + + def load_corpus(self, proc_path): + print('Loading corpus from processed data ...') + self.train = self.load_ids('train') + self.valid = self.load_ids('valid') + self.test = self.load_ids('test') + +class SSTCorpus(object): + def __init__(self, raw_path, proc_path, lm_vocab_link, get_stats=False): + exists = check_exist(proc_path) + pathlib.Path(proc_path).mkdir(exist_ok=True) + self.dictionary = Dictionary(vocab_link=lm_vocab_link) + self.raw_path = raw_path + self.proc_path = proc_path + self._get_stats = get_stats + + if not exists: + print('Creating corpus from raw data ...') + if not raw_path: + raise ValueError("data_root [directory to the original data] must be specified") + self.preprocess() + else: + self.load_corpus(proc_path) + + def check_oov(self, txt): + txt = txt.lower() + txt = re.sub('thats', "that's", txt) + txt = re.sub('wouldnt', "wounldn't", txt) + txt = re.sub('couldnt', "couldn't", txt) + txt = re.sub('cant', "can't", txt) + txt = re.sub('dont', "don't", txt) + txt = re.sub("didnt", "didn't", txt) + txt = re.sub("isnt", "isn't", txt) + txt = re.sub("wasnt", "wasn't", txt) + return word_tokenize(txt) + + def tokenize(self, txt): + txt = re.sub('-', ' - ', txt) + txt = re.sub('\+', ' + ', txt) + txt = re.sub('\*', ' * ', txt) + txt = re.sub('/', ' / ', txt) + txt = re.sub('`', "'", txt) + + words = [] + for token in word_tokenize(txt): + if not token in self.dictionary.word2idx: + if token.startswith("'"): + words.append("'") + token = token[1:] + if not token in self.dictionary.word2idx: + tokens = self.check_oov(token) + words.extend(tokens) else: - self.load_corpus(proc_path) - - def check_exist(self, proc_path): - paths = [proc_path, proc_path + '/vocab.txt', proc_path + '/train.ids', - proc_path + '/valid.ids', proc_path + '/test.ids'] - for name in paths: - if not os.path.exists(name): - return False - return True - - def create_dictionary(self, proc_path, filename): - ''' 
- Add words to the dictionary only if it's train file - ''' - with open(filename, 'r') as f: - f.readline() - for line in f: - words = line.split() + [self.dictionary.EOS] - for word in words: - self.dictionary.add_word(word) - - with open(os.path.join(proc_path, self.vocab_link), 'w') as f: - f.write(str(len(self.dictionary)) + '\n') - for token_id, count in self.dictionary.counter.most_common(): - f.write('\t'.join([str(token_id), - self.dictionary.idx2word[token_id], - str(count)]) + '\n') - - - def tokenize(self, raw_path, proc_path, filename): - unk_id = self.dictionary.word2idx[self.dictionary.UNK] - out = open(os.path.join(proc_path, filename[:-3] + 'ids'), 'w') - with open(os.path.join(raw_path, filename), 'r') as f: - ids = [] - for line in f: - words = line.split() + [self.dictionary.EOS] - for word in words: - ids.append(self.dictionary.word2idx.get(word, unk_id)) - out.write(self.list2str(ids)) #TODO: change to pickle - out.close() - - return np.asarray(ids) - - def load_ids(self, filename): - ids = open(filename, 'r').read().strip().split('\t') - return np.asarray([int(i) for i in ids]) - - def list2str(self, list): - return '\t'.join([str(num) for num in list]) - - def load_corpus(self, proc_path): - print('Loading corpus from processed data ...') - vocab_file = open(os.path.join(proc_path, self.vocab_link), 'r') - n = int(vocab_file.readline().strip()) - self.dictionary.idx2word = [0 for _ in range(n)] - for line in vocab_file: - parts = line.strip().split('\t') - token_id, word, count = int(parts[0]), parts[1], int(parts[2]) - self.dictionary.word2idx[word] = token_id - self.dictionary.idx2word[token_id] = word - self.dictionary.counter[token_id] = count - self.train = self.load_ids(os.path.join(proc_path, 'train.ids')) - self.valid = self.load_ids(os.path.join(proc_path, 'valid.ids')) - self.test = self.load_ids(os.path.join(proc_path, 'test.ids')) \ No newline at end of file + words.append(token) + else: + words.append(token) + + txt = ' '.join(words) + txt = re.sub("''", '"', txt) + txt = re.sub("' '", '"', txt) + txt = re.sub("``", '"', txt) + txt = re.sub('etc \.', 'etc. ', txt) + txt = re.sub(' etc ', ' etc. 
', txt) + return txt + + def tokenize_file(self, mode): + data = pd.read_csv(os.path.join(self.raw_path, mode + '.csv')) + + if mode == 'val': + mode = 'valid' + review_file = open(os.path.join(self.proc_path, mode + '.tok'), 'w') + rating_file = open(os.path.join(self.proc_path, mode + '.rat'), 'w') + for _, row in data.iterrows(): + review = self.tokenize(row['sentence']) + review_file.write(review + '\n') + rating_file.write(str(row['label']) + '\n') + + def txt2ids(self, mode): + if self._get_stats: + import matplotlib + matplotlib.use("TkAgg") + from matplotlib import pyplot as plt + + reviews = [] + unk_id = self.dictionary.word2idx[self.dictionary.UNK] + unseen = [] + all_tokens = 0 + all_unseen = 0 + + rating_lines = open(os.path.join(self.proc_path, mode + '.rat'), 'r').readlines() + ratings = [int(line.strip()) for line in rating_lines] + + for line in open(os.path.join(self.proc_path, mode + '.tok'), 'r'): + tokens = line.strip().split() + reviews.append([self.dictionary.word2idx.get(token, unk_id) for token in tokens]) + if self._get_stats: + for token in tokens: + all_tokens += 1 + if not token in self.dictionary.word2idx: + unseen.append(token) + all_unseen += 1 + + if self._get_stats: + counter = Counter(unseen) + + out = open(os.path.join(self.proc_path, mode + '_unseen.txt'), 'w') + for key, count in counter.most_common(): + out.write(key + '\t' + str(count) + '\n') + + lengths = np.asarray([len(review) for review in reviews]) + stat_file = open(os.path.join(self.proc_path, 'statistics.txt'), 'a') + stat_file.write(mode + '\n') + short_lengths = [l for l in lengths if l <= 96] + stat_file.write('\t'.join(['Min', 'Max', 'Mean', 'Median', 'STD', 'Total', '<=96']) + '\n') + stats = [np.min(lengths), np.max(lengths), np.mean(lengths), np.median(lengths), np.std(lengths), len(lengths), len(short_lengths)] + stat_file.write('\t'.join([str(t) for t in stats]) + '\n') + stat_file.write('Total {} unseen out of {} all tokens. Probability {}.\n'. 
+ format(all_unseen, all_tokens, all_unseen / all_tokens)) + plt.hist(lengths, bins=20) + plt.savefig(os.path.join(self.proc_path, mode + '_hist.png')) + plt.hist(short_lengths, bins=20) + plt.savefig(os.path.join(self.proc_path, mode + '_short_hist.png')) + + return list(zip(reviews, ratings)) + + def preprocess_file(self, mode): + self.tokenize_file(mode) + if mode == 'val': + mode = 'valid' + return self.txt2ids(mode) + + def ids2file(self): + for mode in ['train', 'valid', 'test']: + data = getattr(self, mode) + review_out = open(os.path.join(self.proc_path, mode + '.ids'), 'w') + rating_out = open(os.path.join(self.proc_path, mode + '.rat'), 'w') + for review, rating in data: + review_out.write(list2str(review) + '\n') + rating_out.write(str(rating) + '\n') + + def preprocess(self): + os.makedirs(self.proc_path, exist_ok=True) + self.train = self.preprocess_file('train') + self.valid = self.preprocess_file('val') + self.test = self.preprocess_file('test') + self.ids2file() + + def load_ids(self, mode): + review_lines = open(os.path.join(self.proc_path, mode + '.ids')).readlines() + rating_lines = open(os.path.join(self.proc_path, mode + '.rat')).readlines() + ratings = [int(line.strip()) for line in rating_lines] + reviews = [[int(i) for i in line.strip().split('\t')] for line in review_lines] + return list(zip(reviews, ratings)) + + def load_corpus(self, proc_path): + print('Loading corpus from processed data ...') + self.train = self.load_ids('train') + self.valid = self.load_ids('valid') + self.test = self.load_ids('test') + +# SSTCorpus('/home/chipn/data/binary_sst', 'sst-processed-data-wkt2' , '/home/chipn/dev/OpenSeq2Seq/wkt2-processed-data/vocab.txt') +# SSTCorpus('/home/chipn/data/binary_sst', 'sst-processed-data-wkt103' , '/home/chipn/dev/OpenSeq2Seq/wkt103-processed-data/vocab.txt') +# IMDBCorpus('/home/chipn/data/aclImdb', 'imdb-processed-data-wkt103' , '/home/chipn/dev/OpenSeq2Seq/wkt103-processed-data/vocab.txt') +# IMDBCorpus('/home/chipn/data/aclImdb', 'imdb-processed-data-wkt2' , '/home/chipn/dev/OpenSeq2Seq/wkt2-processed-data/vocab.txt') \ No newline at end of file diff --git a/open_seq2seq/encoders/lm_encoders.py b/open_seq2seq/encoders/lm_encoders.py index 83fde4195..ebb50edc6 100644 --- a/open_seq2seq/encoders/lm_encoders.py +++ b/open_seq2seq/encoders/lm_encoders.py @@ -18,7 +18,7 @@ class LMEncoder(Encoder): """ - RNN-based encoder with embeddings + RNN-based encoder with embeddings for language modeling """ @staticmethod def get_required_params(): @@ -29,8 +29,6 @@ def get_required_params(): 'encoder_use_skip_connections': bool, 'core_cell': None, 'core_cell_params': dict, - 'last_cell_params': dict, - 'output_dim': int, 'end_token': int, "batch_size": int, }) @@ -41,7 +39,7 @@ def get_optional_params(): 'encoder_dp_input_keep_prob': float, 'encoder_dp_output_keep_prob': float, "encoder_last_input_keep_prob": float, - "encoder_last_output_keep_prob": float, # output droput at last layer is 0.4 + "encoder_last_output_keep_prob": float, 'encoder_emb_keep_prob': float, 'variational_recurrent': bool, 'time_major': bool, @@ -61,6 +59,8 @@ def get_optional_params(): "weight_variational": bool, "dropout_seed": int, "num_sampled": int, + "fc_dim": int, + "use_cell_state": bool, }) def __init__(self, params, model, @@ -68,19 +68,48 @@ def __init__(self, params, model, """ Initializes bi-directional encoder with embeddings :param params: dictionary with encoder parameters + + Many of the techniques in this implementation is taken from the paper + "Regularizing and 
Optimizing LSTM Language Models" (Merity et al., 2017) + https://arxiv.org/pdf/1708.02182.pdf + Must define: * vocab_size - data vocabulary size * emb_size - size of embedding to use * encoder_cell_units - number of units in RNN cell * encoder_cell_type - cell type: lstm, gru, etc. * encoder_layers - number of layers - * encoder_dp_input_keep_prob - - * encoder_dp_output_keep_prob - * encoder_use_skip_connections - true/false * time_major (optional) * use_swap_memory (optional) * mode - train or infer - ... add any cell-specific parameters here as well + * input_weight_keep_prob: keep probability for dropout of W + (kernel used to multiply with the input tensor) + * recurrent_weight_keep_prob: keep probability for dropout of U + (kernel used to multiply with last hidden state tensor) + * recurrent_keep_prob: keep probability for dropout + when applying tanh for the input transform step + * weight_variational: whether to keep the same weight dropout mask + at every timestep. This feature is not yet implemented. + * emb_keep_prob: keep probability for dropout of the embedding matrix + * encoder_dp_input_keep_prob: keep probability for dropout on input of a LSTM cell + in the layer which is not the last layer + * encoder_dp_output_keep_prob: keep probability for dropout on output of a LSTM cell + in the layer which is not the last layer + * encoder_last_input_keep_prob: like ``encoder_dp_input_keep_prob`` but for the + cell in the last layer + * encoder_dp_output_keep_prob: like ``encoder_dp_output_keep_prob`` but for the + cell in the last layer + * weight_tied: whether to tie the embedding matrix to the last linear layer. + can only do so if the dimension of the last output layer is + the same as the vocabulary size + * use_cell_state: if set to True, concat the last hidden state and + the last cell state to input into the last output layer. + This only works for the text classification task, not the + language modeling phase. + For different ways to do dropout for LSTM cells, please read this article: + https://medium.com/@bingobee01/a-review-of-dropout-as-applied-to-rnns-72e79ecd5b7b + :param encoder_params: """ super(LMEncoder, self).__init__( @@ -101,16 +130,15 @@ def __init__(self, params, model, self.params['recurrent_weight_keep_prob'] = self.params.get('recurrent_weight_keep_prob', 1.0) self.params['weight_variational'] = self.params.get('weight_variational', False) self.params['dropout_seed'] = self.params.get('dropout_seed', 1822) - self._num_sampled = self.params.get('num_sampled', self._vocab_size) # if num_sampled not define then just take full softmax - - if mode == 'infer': - self.num_tokens_gen = self.params.get('num_tokens_gen', 1) + self._fc_dim = self.params.get('fc_dim', self._vocab_size) + self._num_sampled = self.params.get('num_sampled', self._fc_dim) # if num_sampled not defined, take full softmax + self._lm_phase = self._fc_dim == self._vocab_size + self._num_tokens_gen = self.params.get('num_tokens_gen', 200) + self._batch_size = self.params['batch_size'] + + if mode == 'infer' and self._lm_phase: self._batch_size = len(self.params['seed_tokens']) - else: - self.num_tokens_gen = 1 - self._batch_size = self.params['batch_size'] - # if self._vocab_size > 100000 and mode == 'eval': - # self._batch_size = 2 + self._use_cell_state = self.params.get('use_cell_state', False) def encode(self, input_dict): """Wrapper around :meth:`self._encode() <_encode>` method. 
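Note on weight tying: the docstring above describes tying the embedding matrix to the final linear layer, which is why the `_encode` changes below force the last LSTM layer's `num_units` to `emb_size`. A minimal TF 1.x sketch of the idea (the vocabulary/embedding sizes are made-up, and the variable-scope lookup only mirrors the approach used in `_encode`; it is not the encoder's actual code path):

    import tensorflow as tf

    vocab_size, emb_size = 10000, 64
    output_layer = tf.layers.Dense(vocab_size, use_bias=True)

    # Apply the layer once to a dummy input so its variables get created
    # under the default "dense" scope.
    _ = output_layer.apply(tf.zeros(shape=(1, emb_size)))
    with tf.variable_scope("dense", reuse=True):
        kernel = tf.get_variable("kernel")    # shape [emb_size, vocab_size]

    # The transposed projection kernel doubles as the embedding matrix.
    enc_emb_w = tf.transpose(kernel)          # shape [vocab_size, emb_size]
    embedded = tf.nn.embedding_lookup(enc_emb_w, tf.constant([[1, 2, 3]]))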
@@ -193,21 +221,36 @@ def _encode(self, input_dict): emb_keep_prob, recurrent_keep_prob = 1.0, 1.0 input_weight_keep_prob, recurrent_weight_keep_prob = 1.0, 1.0 + self._output_layer = tf.layers.Dense( - self._vocab_size, + self._fc_dim, kernel_regularizer=regularizer, kernel_initializer=initializer, use_bias=fc_use_bias, + dtype=self._params['dtype'] ) if self._weight_tied: - fake_input = tf.zeros(shape=(1, self._emb_size)) - fake_output = self._output_layer.apply(fake_input) - with tf.variable_scope("dense", reuse=True): - dense_weights = tf.get_variable("kernel") - dense_biases = tf.get_variable("bias") + last_cell_params = copy.deepcopy(self.params['core_cell_params']) + last_cell_params['num_units'] = self._emb_size + else: + last_cell_params = self.params['core_cell_params'] + + last_output_dim = last_cell_params['num_units'] + + if self._use_cell_state: + last_output_dim = 2 * last_output_dim + + + fake_input = tf.zeros(shape=(1, last_output_dim), + dtype=self._params['dtype']) + fake_output = self._output_layer.apply(fake_input) + with tf.variable_scope("dense", reuse=True): + dense_weights = tf.get_variable("kernel") + dense_biases = tf.get_variable("bias") + + if self._weight_tied and self._lm_phase: enc_emb_w = tf.transpose(dense_weights) - else: enc_emb_w = tf.get_variable( name="EncoderEmbeddingMatrix", @@ -217,11 +260,6 @@ def _encode(self, input_dict): self._enc_emb_w = tf.nn.dropout(enc_emb_w, keep_prob=emb_keep_prob) - if self._weight_tied: - last_cell_params = self.params['last_cell_params'] - else: - last_cell_params = self.params['core_cell_params'] - fwd_cells = [ single_cell(cell_class=self.params['core_cell'], cell_params=self.params['core_cell_params'], @@ -261,7 +299,8 @@ def _encode(self, input_dict): source_sequence = input_dict['source_tensors'][0] source_length = input_dict['source_tensors'][1] - if self._mode == 'train' or self._mode == 'eval': + # Inference for language modeling requires a different graph + if (not self._lm_phase) or self._mode == 'train' or self._mode == 'eval': embedded_inputs = tf.cast(tf.nn.embedding_lookup( self.enc_emb_w, source_sequence, @@ -273,21 +312,26 @@ def _encode(self, input_dict): sequence_length=source_length, time_major=time_major, swap_memory=use_swap_memory, - dtype=embedded_inputs.dtype, + dtype=self._params['dtype'], scope='decoder', ) - if self._mode == 'eval' or self._num_sampled >= self._vocab_size: - logits = self._output_layer.apply(encoder_outputs) # full softmax - output_dict = {'logits': logits, 'outputs': [tf.argmax(logits, axis=-1)]} - else: + if not self._lm_phase: + if self._use_cell_state: + encoder_outputs = tf.concat([encoder_state[-1].h, encoder_state[-1].c], axis=1) + else: + encoder_outputs = encoder_state[-1].h + + if self._mode == 'train' and self._num_sampled < self._fc_dim: # sampled softmax output_dict = {'weights': enc_emb_w, 'bias': dense_biases, 'inputs': encoder_outputs, 'logits': encoder_outputs, 'outputs': [encoder_outputs], 'num_sampled': self._num_sampled} - - else: + else: # full softmax + logits = self._output_layer.apply(encoder_outputs) + output_dict = {'logits': logits, 'outputs': [logits]} + else: # infer in LM phase embedding_fn = lambda ids: tf.cast(tf.nn.embedding_lookup( self.enc_emb_w, ids, @@ -306,7 +350,7 @@ def _encode(self, input_dict): ), output_layer=self._output_layer, ) - maximum_iterations = tf.constant(200) + maximum_iterations = tf.constant(self._num_tokens_gen) final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode( decoder=decoder, diff 
--git a/open_seq2seq/losses/cross_entropy_loss.py b/open_seq2seq/losses/cross_entropy_loss.py index 33b59da28..ded3caa1d 100644 --- a/open_seq2seq/losses/cross_entropy_loss.py +++ b/open_seq2seq/losses/cross_entropy_loss.py @@ -17,5 +17,4 @@ def __init__(self, params, model, name="cross_entropy_loss"): def _compute_loss(self, input_dict): logits = input_dict['decoder_output']['logits'] labels = input_dict['target_tensors'][0] - loss = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels) - return loss + return tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels) diff --git a/open_seq2seq/losses/sequence_loss.py b/open_seq2seq/losses/sequence_loss.py index a6d0edeca..81ce3bc88 100644 --- a/open_seq2seq/losses/sequence_loss.py +++ b/open_seq2seq/losses/sequence_loss.py @@ -262,7 +262,7 @@ def _compute_loss(self, input_dict): labels = input_dict["target_tensors"][0] def _pad_tensors_to_same_length(x, y): - """Pad x and y so that the results have the + """ Pad x and y so that the results have the same length (second dimension). """ with tf.name_scope("pad_to_same_length"): @@ -311,7 +311,9 @@ def _pad_tensors_to_same_length(x, y): class BasicSampledSequenceLoss(Loss): """ - Basic sequence-to-sequence loss. This one does not use one-hot encodings + Basic sampled sequence-to-sequence loss. This is used when the full softmax + is computational prohibitive. + This one does not use one-hot encodings. """ @staticmethod def get_required_params(): @@ -371,19 +373,29 @@ def _compute_loss(self, input_dict): tgt_lengths = input_dict['target_tensors'][1] if 'weights' in input_dict['decoder_output']: - print('DOING SAMPLED LOSS') + print("Because 'weights' is in the input_dict, we are using sampled softmax loss.") inputs = input_dict["decoder_output"]['inputs'] self._hid_dim = inputs.get_shape().as_list()[-1] inputs = tf.reshape(inputs, (-1, self._hid_dim)) targets = tf.reshape(target_sequence, (-1, 1)) - crossent = tf.nn.sampled_softmax_loss(input_dict["decoder_output"]['weights'], - input_dict[ - "decoder_output"]['bias'], - targets, - inputs, - input_dict['decoder_output'][ - 'num_sampled'], - self._tgt_vocab_size) + + weights = input_dict["decoder_output"]['weights'] + biases = input_dict["decoder_output"]['bias'] + + if inputs.dtype.base_dtype != tf.float32: + inputs = tf.cast(inputs, tf.float32) + if weights.dtype.base_dtype != tf.float32: + weights = tf.cast(weights, tf.float32) + if biases.dtype.base_dtype != tf.float32: + biases = tf.cast(biases, tf.float32) + crossent = tf.nn.sampled_softmax_loss(weights, + biases, + targets, + inputs, + input_dict['decoder_output']['num_sampled'], + self._tgt_vocab_size) + + if self._average_across_timestep: loss = tf.reduce_mean(crossent) else: @@ -391,6 +403,7 @@ def _compute_loss(self, input_dict): loss /= self._batch_size else: + print("Because 'weights' is not in the input_dict, we are using normal softmax loss.") logits = input_dict["decoder_output"]["logits"] if self._offset_target_by_one: diff --git a/open_seq2seq/models/encoder_decoder.py b/open_seq2seq/models/encoder_decoder.py index da8cb7c7f..08b258c35 100644 --- a/open_seq2seq/models/encoder_decoder.py +++ b/open_seq2seq/models/encoder_decoder.py @@ -7,7 +7,6 @@ from open_seq2seq.models.model import Model from open_seq2seq.utils.utils import deco_print - class EncoderDecoderModel(Model): """ Standard encoder-decoder class with one encoder and one decoder. 
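Aside on the mixed-precision handling in `BasicSampledSequenceLoss` above: the loss casts its inputs, weights, and bias to float32 before calling `tf.nn.sampled_softmax_loss`, so the sampled loss is always evaluated in full precision even when the rest of the model runs with the "mixed" dtype. A self-contained sketch of the same pattern (the wrapper function, tensor names, and shapes are illustrative assumptions, not the loss class itself):

    import tensorflow as tf

    def sampled_sequence_loss(weights, biases, inputs, targets,
                              num_sampled, vocab_size):
        # Force float32: the sampled softmax is computed in full precision
        # even if the encoder produced float16 activations.
        if inputs.dtype.base_dtype != tf.float32:
            inputs = tf.cast(inputs, tf.float32)
        if weights.dtype.base_dtype != tf.float32:
            weights = tf.cast(weights, tf.float32)
        if biases.dtype.base_dtype != tf.float32:
            biases = tf.cast(biases, tf.float32)
        crossent = tf.nn.sampled_softmax_loss(
            weights=weights,       # [vocab_size, hidden_dim]
            biases=biases,         # [vocab_size]
            labels=targets,        # [batch * time, 1]
            inputs=inputs,         # [batch * time, hidden_dim]
            num_sampled=num_sampled,
            num_classes=vocab_size)
        return tf.reduce_mean(crossent)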
@@ -64,8 +63,8 @@ def __init__(self, params, mode="train", hvd=None): * **loss_params** (dict) --- dictionary with loss configuration. For complete list of possible parameters see the corresponding class docs. """ - super(EncoderDecoderModel, self).__init__( - params=params, mode=mode, hvd=hvd) + super(EncoderDecoderModel, self).__init__(params=params, mode=mode, hvd=hvd) + if 'encoder_params' not in self.params: self.params['encoder_params'] = {} if 'decoder_params' not in self.params: diff --git a/open_seq2seq/models/lstm_lm.py b/open_seq2seq/models/lstm_lm.py index b01df9174..075d1102e 100644 --- a/open_seq2seq/models/lstm_lm.py +++ b/open_seq2seq/models/lstm_lm.py @@ -1,61 +1,113 @@ +import random +import numpy as np import tensorflow as tf from .encoder_decoder import EncoderDecoderModel +from open_seq2seq.data import WKTDataLayer from open_seq2seq.utils.utils import deco_print, array_to_string +from open_seq2seq.utils import metrics class LSTMLM(EncoderDecoderModel): """ - An example class implementing classical text-to-text model. + An example class implementing an LSTM language model. """ + def __init__(self, params, mode="train", hvd=None): + super(EncoderDecoderModel, self).__init__(params=params, mode=mode, hvd=hvd) + + if 'encoder_params' not in self.params: + self.params['encoder_params'] = {} + if 'decoder_params' not in self.params: + self.params['decoder_params'] = {} + if 'loss_params' not in self.params: + self.params['loss_params'] = {} + + self._lm_phase = isinstance(self.get_data_layer(), WKTDataLayer) + + self._encoder = self._create_encoder() + self._decoder = self._create_decoder() + if self.mode == 'train' or self.mode == 'eval': + self._loss_computator = self._create_loss() + else: + self._loss_computator = None + + self.delimiter = self.get_data_layer().delimiter def _create_encoder(self): + self._print_f1 = False self.params['encoder_params']['vocab_size'] = ( - self.get_data_layer().params['vocab_size'] - ) - self.params['encoder_params']['output_dim'] = ( - self.get_data_layer().params['vocab_size'] + self.get_data_layer().vocab_size ) self.params['encoder_params']['end_token'] = ( - self.get_data_layer().params['end_token'] - ) - self.params['encoder_params']['seed_tokens'] = ( - self.get_data_layer().params['seed_tokens'] + self.get_data_layer().end_token ) self.params['encoder_params']['batch_size'] = ( - self.get_data_layer().params['batch_size'] + self.get_data_layer().batch_size ) + if not self._lm_phase: + self.params['encoder_params']['fc_dim'] = ( + self.get_data_layer().num_classes + ) + if self.params['encoder_params']['fc_dim'] == 2: + self._print_f1 = True + if self._lm_phase: + self.params['encoder_params']['seed_tokens'] = ( + self.get_data_layer().params['seed_tokens'] + ) return super(LSTMLM, self)._create_encoder() def _create_loss(self): - # self.params['loss_params']['batch_size'] = self.params['batch_size_per_gpu'] - self.params['loss_params']['batch_size'] = self.get_data_layer().params['batch_size'] - - self.params['loss_params']['tgt_vocab_size'] = ( - self.get_data_layer().params['vocab_size'] - ) + if self._lm_phase: + self.params['loss_params']['batch_size'] = ( + self.get_data_layer().batch_size + ) + self.params['loss_params']['tgt_vocab_size'] = ( + self.get_data_layer().vocab_size + ) return super(LSTMLM, self)._create_loss() def infer(self, input_values, output_values): - vocab = self.get_data_layer().corp.dictionary.idx2word - seed_tokens = self.params['encoder_params']['seed_tokens'] - for i in range(len(seed_tokens)): - 
print(output_values[0][i].shape) - print('Seed:', vocab[seed_tokens[i]] + '\n') - deco_print( - "Output: " + array_to_string( - output_values[0][i], + if self._lm_phase: + vocab = self.get_data_layer().corp.dictionary.idx2word + seed_tokens = self.params['encoder_params']['seed_tokens'] + for i in range(len(seed_tokens)): + print('Seed:', vocab[seed_tokens[i]] + '\n') + deco_print( + "Output: " + array_to_string( + output_values[0][i], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + return [] + else: + ex, elen_x = input_values['source_tensors'] + ey, elen_y = None, None + if 'target_tensors' in input_values: + ey, elen_y = input_values['target_tensors'] + + n_samples = len(ex) + results = [] + for i in range(n_samples): + current_x = array_to_string( + ex[i][:elen_x[i]], vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], + delim=self.delimiter, ), - offset=4, - ) + current_pred = np.argmax(output_values[0][i]) + curret_y = None + if ey is not None: + current_y = np.argmax(ey[i]) + + results.append((current_x[0], current_pred, current_y)) + return results + def maybe_print_logs(self, input_values, output_values, training_step): x, len_x = input_values['source_tensors'] - y, len_y = input_values['target_tensors'] - # samples = output_values[0][0] + y, len_y = input_values['target_tensors'] x_sample = x[0] len_x_sample = len_x[0] @@ -66,19 +118,49 @@ def maybe_print_logs(self, input_values, output_values, training_step): "Train Source[0]: " + array_to_string( x_sample[:len_x_sample], vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], - ), - offset=4, - ) - deco_print( - "Train Target[0]: " + array_to_string( - y_sample[:len_y_sample], - vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], + delim=self.delimiter, ), offset=4, ) + if self._lm_phase: + deco_print( + "Train Target[0]: " + array_to_string( + y_sample[:len_y_sample], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + else: + deco_print( + "TRAIN Target[0]: " + str(np.argmax(y_sample)), + offset=4, + ) + samples = output_values[0][0] + deco_print( + "TRAIN Prediction[0]: " + str(samples), + offset=4, + ) + labels = np.argmax(y, 1) + preds = np.argmax(output_values[0], axis=-1) + print('Labels', labels) + print('Preds', preds) + + deco_print( + "Accuracy: {:.4f}".format(metrics.accuracy(labels, preds)), + offset = 4, + ) + + if self._print_f1: + deco_print( + "Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f}" + .format(metrics.precision(labels, preds), + metrics.recall(labels, preds), + metrics.f1(labels, preds)), + offset = 4, + ) + return {} def evaluate(self, input_values, output_values): @@ -89,33 +171,129 @@ def evaluate(self, input_values, output_values): len_x_sample = elen_x[0] y_sample = ey[0] len_y_sample = elen_y[0] + + return_values = {} + + if self._lm_phase: + flip = random.random() + if flip <= 0.9: + return return_values + + deco_print( + "*****EVAL Source[0]: " + array_to_string( + x_sample[:len_x_sample], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + samples = np.argmax(output_values[0][0], axis=-1) + deco_print( + "*****EVAL Target[0]: " + array_to_string( + y_sample[:len_y_sample], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + + deco_print( 
+ "*****EVAL Prediction[0]: " + array_to_string( + samples, + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + else: + deco_print( + "*****EVAL Source[0]: " + array_to_string( + x_sample[:len_x_sample], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + samples = output_values[0][0] + deco_print( + "EVAL Target[0]: " + str(np.argmax(y_sample)), + offset=4, + ) + deco_print( + "EVAL Prediction[0]: " + str(samples), + offset=4, + ) + + labels = np.argmax(ey, 1) + preds = np.argmax(output_values[0], axis=-1) + print('Labels', labels) + print('Preds', preds) + + return_values['accuracy'] = metrics.accuracy(labels, preds) + + if self._print_f1: + return_values['true_pos'] = metrics.true_positives(labels, preds) + return_values['pred_pos'] = np.sum(preds) + return_values['actual_pos'] = np.sum(labels) + + return return_values + + def finalize_evaluation(self, results_per_batch, training_step=None): + accuracies = [] + true_pos, pred_pos, actual_pos = 0.0, 0.0, 0.0 + + for results in results_per_batch: + if not 'accuracy' in results: + return {} + accuracies.append(results['accuracy']) + if 'true_pos' in results: + true_pos += results['true_pos'] + pred_pos += results['pred_pos'] + actual_pos += results['actual_pos'] deco_print( - "*****EVAL Source[0]: " + array_to_string( - x_sample[:len_x_sample], - vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], - ), - offset=4, - ) - deco_print( - "*****EVAL Target[0]: " + array_to_string( - y_sample[:len_y_sample], - vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], - ), - offset=4, - ) - samples = output_values[0][0] - deco_print( - "*****EVAL Prediction[0]: " + array_to_string( - samples, - vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], - ), - offset=4, + "EVAL Accuracy: {:.4f}".format(np.mean(accuracies)), + offset = 4, ) + if true_pos > 0: + prec = true_pos / pred_pos + rec = true_pos / actual_pos + f1 = 2.0 * prec * rec / (rec + prec) + deco_print( + "EVAL Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f} | True pos: {}" + .format(prec, rec, f1, true_pos), + offset = 4, + ) + return {} + + def finalize_inference(self, results_per_batch, output_file): + out = open(output_file, 'w') + out.write('\t'.join(['Source', 'Pred', 'Label']) + '\n') + preds, labels = [], [] + + for results in results_per_batch: + for x, pred, y in results: + out.write('\t'.join([x, str(pred), str(y)]) + '\n') + preds.append(pred) + labels.append(y) + + if len(labels) > 0 and labels[0] is not None: + preds = np.asarray(preds) + labels = np.asarray(labels) + deco_print( + "TEST Accuracy: {:.4f}".format(metrics.accuracy(labels, preds)), + offset = 4, + ) + deco_print( + "TEST Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f}" + .format(metrics.precision(labels, preds), + metrics.recall(labels, preds), + metrics.f1(labels, preds)), + offset = 4, + ) + return {} + def _get_num_objects_per_step(self, worker_id=0): """Returns number of source tokens + number of target tokens in batch.""" data_layer = self.get_data_layer(worker_id) diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 489cc4410..81b9b2abd 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -57,14 +57,15 @@ class :meth:`__init__` method. 
'num_gpus': int, # cannot be used when gpu_ids is specified 'gpu_ids': list, # cannot be used when num_gpus is specified + 'load_model': str, + 'save_summaries_steps': None, # could be int or None 'print_loss_steps': None, # could be int or None 'print_samples_steps': None, # could be int or None 'print_bench_info_steps': None, # could be int or None 'save_checkpoint_steps': None, # could be int or None - 'restore_best_checkpoint': bool, # whether to restore best check point + 'restore_best_checkpoint': bool, # if True,restore best check point instead of latest checkpoint 'eval_steps': int, - 'base_logdir': str, 'finetune': bool, 'eval_batch_size_per_gpu': int, @@ -89,6 +90,8 @@ class :meth:`__init__` method. 'loss_scaling_params': dict, 'summaries': list, 'iter_size': int, + 'lm_vocab_file': str, #TODO: move this paramters to lstm_lm.py + 'processed_data_folder': str, } def __init__(self, params, mode="train", hvd=None): @@ -121,6 +124,20 @@ def __init__(self, params, mode="train", hvd=None): used if ``num_gpus`` is specified. When ``use_horovod`` is True this parameter is ignored. * **batch_size_per_gpu** (int) --- batch size to use for each GPU. + * **eval_batch_size_per_gpu** (int) --- batch size to use for each GPU during + inference. This is for when training and inference have different computation + and memory requirements, such as when training uses sampled softmax and + inference uses full softmax. If not specified, it's set + to ``batch_size_per_gpu``. + * **restore_best_checkpoint** (bool) --- if set to True, when doing evaluation + and inference, the model will load the best checkpoint instead of the latest + checkpoint. Best checkpoint is evaluated based on evaluation results, so + it's only available when the model is trained untder ``train_eval`` mode. + Default to False. + * **load_model** (str) --- points to the location of the pretrained model for + transfer learning. If specified, during training, the system will look + into the checkpoint in this folder and restore all variables whose names and + shapes match a variable in the new model. * **num_epochs** (int) --- number of epochs to run training for. This parameter cannot be used if ``max_steps`` is specified. * **max_steps** (int) --- number of steps to run training for. @@ -234,7 +251,9 @@ class docs. self._params['print_bench_info_steps'] = None self._params['finetune'] = self._params.get('finetune', False) - self._params['base_logdir'] = self._params.get('base_logdir', None) + # self._params['base_logdir'] = self._params.get('base_logdir', None) + self._params['load_model'] = self._params.get('load_model', None) + self._params['load_fc'] = self._params.get('load_fc', False) self._params['eval_batch_size_per_gpu'] = self._params.get( 'eval_batch_size_per_gpu', self._params['batch_size_per_gpu'] @@ -277,9 +296,14 @@ class docs. 
dl_params['batch_size'] = self._params['batch_size_per_gpu'] else: dl_params['batch_size'] = self._params['eval_batch_size_per_gpu'] + if 'lm_vocab_file' in self._params: + dl_params['lm_vocab_file'] = self._params['lm_vocab_file'] + if 'processed_data_folder' in self._params: + dl_params['processed_data_folder'] = self._params['processed_data_folder'] dl_params['mode'] = self._mode dl_params['interactive'] = self._interactive + if self.on_horovod: self._data_layer = self._params['data_layer']( params=dl_params, model=self, @@ -353,7 +377,6 @@ def compile(self, force_var_reuse=False, checkpoint=None, use_trt=False, precisi else: self.get_data_layer(gpu_cnt).build_graph() input_tensors = self.get_data_layer(gpu_cnt).input_tensors - loss, self._outputs[gpu_cnt] = self.build_forward_pass_graph( input_tensors, gpu_id=gpu_cnt, @@ -366,6 +389,7 @@ def compile(self, force_var_reuse=False, checkpoint=None, use_trt=False, precisi raise ValueError('Decoder outputs have to be either None or list') if self._mode == "train" or self._mode == "eval": losses.append(loss) + # end of for gpu_ind loop if self._mode == "train": self.loss = tf.reduce_mean(losses) @@ -386,8 +410,13 @@ def compile(self, force_var_reuse=False, checkpoint=None, use_trt=False, precisi self.get_data_layer().build_graph() input_tensors = self.get_data_layer().input_tensors - loss, self._output = self._build_forward_pass_graph(input_tensors, + all_loss, self._output = self._build_forward_pass_graph(input_tensors, gpu_id=0) + if isinstance(all_loss, (dict,)): + loss = all_loss['loss'] + else: + loss = all_loss + if self._output is not None and not isinstance(self._output, list): raise ValueError('Decoder outputs have to be either None or list') diff --git a/open_seq2seq/models/text2text.py b/open_seq2seq/models/text2text.py index bbe42042c..5a816dd1a 100644 --- a/open_seq2seq/models/text2text.py +++ b/open_seq2seq/models/text2text.py @@ -93,7 +93,7 @@ def infer(self, input_values, output_values): SpecialTextTokens.EOS_ID), PAD_ID=self.decoder.params.get('PAD_SYMBOL', SpecialTextTokens.PAD_ID), - ignore_special=True, delim=' ', + ignore_special=True, delim=' ', )) input_strings.append(text_ids_to_string( input_sample[i], @@ -101,7 +101,7 @@ def infer(self, input_values, output_values): S_ID=self.decoder.params.get('GO_SYMBOL', SpecialTextTokens.S_ID.value), EOS_ID=self.decoder.params.get('END_SYMBOL', - SpecialTextTokens.EOS_ID.value), + SpecialTextTokens.EOS_ID.value), PAD_ID=self.decoder.params.get('PAD_SYMBOL', SpecialTextTokens.PAD_ID), ignore_special=True, delim=' ', diff --git a/open_seq2seq/parts/rnns/utils.py b/open_seq2seq/parts/rnns/utils.py index 7d4a1d9b5..a2d167cba 100644 --- a/open_seq2seq/parts/rnns/utils.py +++ b/open_seq2seq/parts/rnns/utils.py @@ -52,8 +52,6 @@ def single_cell( if awd_initializer: val = 1.0/math.sqrt(cell_params['num_units']) cell_params['initializer'] = tf.random_uniform_initializer(minval=-val, maxval=val) - # else: - # cell_params['initializer'] = tf.contrib.layers.xavier_initializer() if 'WeightDropLayerNormBasicLSTMCell' in str(cell_class): if recurrent_keep_prob < 1.0: cell_params['recurrent_keep_prob'] = recurrent_keep_prob diff --git a/open_seq2seq/parts/rnns/weight_drop.py b/open_seq2seq/parts/rnns/weight_drop.py index 3280bc496..bc25b3198 100644 --- a/open_seq2seq/parts/rnns/weight_drop.py +++ b/open_seq2seq/parts/rnns/weight_drop.py @@ -2,6 +2,7 @@ class WeightDropLayerNormBasicLSTMCell(tf.contrib.rnn.RNNCell): """LSTM unit with layer normalization, weight dropout, and recurrent dropout. 
+ This is based on the standard implementation of LayerNormBasicLSTMCell. This class adds layer normalization and recurrent dropout to a basic LSTM unit. Layer normalization implementation is based on: https://arxiv.org/abs/1607.06450. @@ -32,8 +33,6 @@ def __init__(self, dtype=None): """Initializes the basic LSTM cell. Args: - input_weight is W - recurrent_weight is U num_units: int, The number of units in the LSTM cell. forget_bias: float, The bias added to forget gates (see above). input_size: Deprecated and unused. @@ -43,9 +42,14 @@ def __init__(self, `layer_norm` has been set to `False`, this argument will be ignored. norm_shift: float, The layer normalization shift initial value. If `layer_norm` has been set to `False`, this argument will be ignored. - dropout_keep_prob: unit Tensor or float between 0 and 1 representing the - recurrent dropout probability value. If float and 1.0, no dropout will - be applied. + input_weight_keep_prob: keep probability for dropout of W + (kernel used to multiply with the input tensor) + recurrent_weight_keep_prob: keep probability for dropout of U + (kernel used to multiply with last hidden state tensor) + recurrent_keep_prob: keep probability for dropout + when applying tanh for the input transform step + weight_variational: whether to keep the same weight dropout mask + at every timestep. This feature is not yet implemented. dropout_prob_seed: (optional) integer, the randomness seed. reuse: (optional) Python boolean describing whether to reuse variables in an existing scope. If not `True`, and the existing scope already has @@ -88,7 +92,7 @@ def _norm(self, inp, scope, dtype=tf.float32): shape = inp.get_shape()[-1:] gamma_init = tf.constant_initializer(self._norm_gain) beta_init = tf.constant_initializer(self._norm_shift) - with tf.variable_scope(scope): # replace vs with tf. vs stands for va + with tf.variable_scope(scope): # Initialize beta and gamma for use by layer_norm.
tf.get_variable("gamma", shape=shape, initializer=gamma_init, dtype=dtype) tf.get_variable("beta", shape=shape, initializer=beta_init, dtype=dtype) @@ -117,6 +121,9 @@ def _linear(self, args, inputs_shape, h_shape): return out def _variational_dropout(self, values, noise, keep_prob): + ''' + TODO: Implement variational dropout for weight dropout + ''' return tf.nn.dropout(values, keep_prob, seed=self._dropout_seed) def _dropout(self, values, dropout_noise, keep_prob): diff --git a/open_seq2seq/utils/funcs.py b/open_seq2seq/utils/funcs.py index a81e59309..cce788bd5 100644 --- a/open_seq2seq/utils/funcs.py +++ b/open_seq2seq/utils/funcs.py @@ -14,7 +14,8 @@ collect_if_horovod from .hooks import PrintSamplesHook, RunEvaluationHook, PrintLossAndTimeHook, \ BroadcastGlobalVariablesHook -from open_seq2seq.models import LSTMLM +from .helpers import TransferMonitoredTrainingSession, TransferScaffold +from open_seq2seq.data import WKTDataLayer def train(train_model, eval_model=None, debug_port=None): @@ -42,8 +43,10 @@ def train(train_model, eval_model=None, debug_port=None): if master_worker: checkpoint_dir = train_model.params['logdir'] + base_ckpt_dir = train_model.params['load_model'] else: checkpoint_dir = None + base_ckpt_dir = None if eval_model is not None: # noinspection PyTypeChecker @@ -52,7 +55,7 @@ def train(train_model, eval_model=None, debug_port=None): every_steps=eval_model.params['eval_steps'], model=eval_model, last_step=train_model.last_step, - print_ppl=isinstance(eval_model, LSTMLM), + print_ppl=isinstance(eval_model.get_data_layer(), WKTDataLayer), ), ) @@ -70,7 +73,7 @@ def train(train_model, eval_model=None, debug_port=None): hooks.append(PrintLossAndTimeHook( every_steps=train_model.params['print_loss_steps'], model=train_model, - print_ppl=isinstance(train_model, LSTMLM) + print_ppl=isinstance(train_model.get_data_layer(), WKTDataLayer), )) if train_model.params['print_samples_steps'] is not None: # noinspection PyTypeChecker @@ -94,10 +97,16 @@ def train(train_model, eval_model=None, debug_port=None): [train_model.get_data_layer(i).iterator.initializer for i in range(train_model.num_gpus)] ) - - scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer) - ) + + fine_tuning = (not base_ckpt_dir) or tf.train.latest_checkpoint(checkpoint_dir) + if fine_tuning: + scaffold = tf.train.Scaffold( + local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer) + ) + else: + scaffold = TransferScaffold( + local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer) + ) fetches = [train_model.train_op] try: total_objects = 0.0 @@ -109,7 +118,8 @@ def train(train_model, eval_model=None, debug_port=None): "train model does not define get_num_objects_per_step method.") # starting training - with tf.train.MonitoredTrainingSession( + if fine_tuning: + sess = TransferMonitoredTrainingSession( scaffold=scaffold, checkpoint_dir=checkpoint_dir, save_summaries_steps=train_model.params['save_summaries_steps'], @@ -118,43 +128,55 @@ def train(train_model, eval_model=None, debug_port=None): log_step_count_steps=train_model.params['save_summaries_steps'], stop_grace_period_secs=300, hooks=hooks, - ) as sess: - step = 0 - num_bench_updates = 0 - while True: - if sess.should_stop(): - break - tm = time.time() - try: - feed_dict = {} - iter_size = train_model.params.get('iter_size', 1) - if iter_size > 1: - feed_dict[train_model.skip_update_ph] = step % iter_size != 0 - if step % iter_size == 0: - if step >= bench_start: - 
num_bench_updates += 1 - fetches_vals = sess.run(fetches, feed_dict) - else: - # necessary to skip "no-update" steps when iter_size > 1 - def run_with_no_hooks(step_context): - return step_context.session.run(fetches, feed_dict) - fetches_vals = sess.run_step_fn(run_with_no_hooks) - except tf.errors.OutOfRangeError: - break - if step >= bench_start: - total_time += time.time() - tm - if len(fetches) > 1: - for i in range(train_model.num_gpus): - total_objects += np.sum(fetches_vals[i + 1]) - if train_model.params['print_bench_info_steps'] is not None: - if step % train_model.params['print_bench_info_steps'] == 0: - total_objects_cur = collect_if_horovod(total_objects, hvd, - mode="sum") - if master_worker: - avg_objects = 1.0 * total_objects_cur / total_time - deco_print("Avg objects per second: {:.3f}".format(avg_objects)) - - step += 1 + base_ckpt_dir=base_ckpt_dir, + load_fc=train_model.params['load_fc']) + else: + sess = tf.train.MonitoredTrainingSession( + scaffold=scaffold, + checkpoint_dir=checkpoint_dir, + save_summaries_steps=train_model.params['save_summaries_steps'], + config=sess_config, + save_checkpoint_secs=None, + log_step_count_steps=train_model.params['save_summaries_steps'], + stop_grace_period_secs=300, + hooks=hooks) + step = 0 + num_bench_updates = 0 + while True: + if sess.should_stop(): + break + tm = time.time() + try: + feed_dict = {} + iter_size = train_model.params.get('iter_size', 1) + if iter_size > 1: + feed_dict[train_model.skip_update_ph] = step % iter_size != 0 + if step % iter_size == 0: + if step >= bench_start: + num_bench_updates += 1 + fetches_vals = sess.run(fetches, feed_dict) + else: + # necessary to skip "no-update" steps when iter_size > 1 + def run_with_no_hooks(step_context): + return step_context.session.run(fetches, feed_dict) + fetches_vals = sess.run_step_fn(run_with_no_hooks) + except tf.errors.OutOfRangeError: + break + if step >= bench_start: + total_time += time.time() - tm + if len(fetches) > 1: + for i in range(train_model.num_gpus): + total_objects += np.sum(fetches_vals[i + 1]) + if train_model.params['print_bench_info_steps'] is not None: + if step % train_model.params['print_bench_info_steps'] == 0: + total_objects_cur = collect_if_horovod(total_objects, hvd, + mode="sum") + if master_worker: + avg_objects = 1.0 * total_objects_cur / total_time + deco_print("Avg objects per second: {:.3f}".format(avg_objects)) + + step += 1 + sess.close() if len(fetches) > 1: total_objects = collect_if_horovod(total_objects, hvd, mode="sum") diff --git a/open_seq2seq/utils/helpers.py b/open_seq2seq/utils/helpers.py new file mode 100644 index 000000000..476a330f1 --- /dev/null +++ b/open_seq2seq/utils/helpers.py @@ -0,0 +1,480 @@ +''' +This file modifies standard TensorFlow modules necessary for transfer learning, +such as MonitoredTrainingSession, ChiefSessionCreator, Scaffold, SessionManager +''' + +import time + +import tensorflow as tf +from tensorflow.python.tools import inspect_checkpoint as chkp +from tensorflow.python import pywrap_tensorflow +from tensorflow.python.ops import resources +from tensorflow.python.training import saver as training_saver + +# Value that indicates no value was provided. 
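The `load_model` flow that `helpers.py` supports amounts to restoring only the variables whose names and shapes exist in the base checkpoint, leaving the remaining variables (for example a new classification head) at their fresh initialization. The actual restore path goes through `restore_certain_variables` inside the `TransferSessionManager` machinery below; the standalone helper here is only an illustration of the idea, with an assumed name:

    import tensorflow as tf
    from tensorflow.python import pywrap_tensorflow

    def restore_matching_variables(sess, ckpt_file):
        reader = pywrap_tensorflow.NewCheckpointReader(ckpt_file)
        ckpt_shapes = reader.get_variable_to_shape_map()
        # Keep only graph variables that the checkpoint can actually fill.
        to_restore = [
            var for var in tf.global_variables()
            if var.op.name in ckpt_shapes
            and var.get_shape().as_list() == ckpt_shapes[var.op.name]
        ]
        tf.train.Saver(var_list=to_restore).restore(sess, ckpt_file)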
+USE_DEFAULT = object() + +def TransferMonitoredTrainingSession(master='', # pylint: disable=invalid-name + is_chief=True, + checkpoint_dir=None, + scaffold=None, + hooks=None, + chief_only_hooks=None, + save_checkpoint_secs=USE_DEFAULT, + save_summaries_steps=USE_DEFAULT, + save_summaries_secs=USE_DEFAULT, + config=None, + stop_grace_period_secs=120, + log_step_count_steps=100, + max_wait_secs=7200, + save_checkpoint_steps=USE_DEFAULT, + summary_dir=None, + base_ckpt_dir=None, + load_fc=False): + """Creates a `MonitoredSession` for training. + For a chief, this utility sets proper session initializer/restorer. It also + creates hooks related to checkpoint and summary saving. For workers, this + utility sets proper session creator which waits for the chief to + initialize/restore. Please check `tf.train.MonitoredSession` for more + information. + Args: + master: `String` the TensorFlow master to use. + is_chief: If `True`, it will take care of initialization and recovery the + underlying TensorFlow session. If `False`, it will wait on a chief to + initialize or recover the TensorFlow session. + checkpoint_dir: A string. Optional path to a directory where to restore + variables. + scaffold: A `Scaffold` used for gathering or building supportive ops. If + not specified, a default one is created. It's used to finalize the graph. + hooks: Optional list of `SessionRunHook` objects. + chief_only_hooks: list of `SessionRunHook` objects. Activate these hooks if + `is_chief==True`, ignore otherwise. + save_checkpoint_secs: The frequency, in seconds, that a checkpoint is saved + using a default checkpoint saver. If both `save_checkpoint_steps` and + `save_checkpoint_secs` are set to `None`, then the default checkpoint + saver isn't used. If both are provided, then only `save_checkpoint_secs` + is used. Default 600. + save_summaries_steps: The frequency, in number of global steps, that the + summaries are written to disk using a default summary saver. If both + `save_summaries_steps` and `save_summaries_secs` are set to `None`, then + the default summary saver isn't used. Default 100. + save_summaries_secs: The frequency, in secs, that the summaries are written + to disk using a default summary saver. If both `save_summaries_steps` and + `save_summaries_secs` are set to `None`, then the default summary saver + isn't used. Default not enabled. + config: an instance of `tf.ConfigProto` proto used to configure the session. + It's the `config` argument of constructor of `tf.Session`. + stop_grace_period_secs: Number of seconds given to threads to stop after + `close()` has been called. + log_step_count_steps: The frequency, in number of global steps, that the + global step/sec is logged. + max_wait_secs: Maximum time workers should wait for the session to + become available. This should be kept relatively short to help detect + incorrect code, but sometimes may need to be increased if the chief takes + a while to start up. + save_checkpoint_steps: The frequency, in number of global steps, that a + checkpoint is saved using a default checkpoint saver. If both + `save_checkpoint_steps` and `save_checkpoint_secs` are set to `None`, then + the default checkpoint saver isn't used. If both are provided, then only + `save_checkpoint_secs` is used. Default not enabled. + summary_dir: A string. Optional path to a directory where to + save summaries. If None, checkpoint_dir is used instead. + Returns: + A `MonitoredSession` object. 
+ """ + if save_summaries_steps == USE_DEFAULT and save_summaries_secs == USE_DEFAULT: + save_summaries_steps = 100 + save_summaries_secs = None + elif save_summaries_secs == USE_DEFAULT: + save_summaries_secs = None + elif save_summaries_steps == USE_DEFAULT: + save_summaries_steps = None + + if (save_checkpoint_steps == USE_DEFAULT and + save_checkpoint_secs == USE_DEFAULT): + save_checkpoint_steps = None + save_checkpoint_secs = 600 + elif save_checkpoint_secs == USE_DEFAULT: + save_checkpoint_secs = None + elif save_checkpoint_steps == USE_DEFAULT: + save_checkpoint_steps = None + + if not is_chief: + session_creator = tf.train.WorkerSessionCreator( + scaffold=scaffold, + master=master, + config=config, + max_wait_secs=max_wait_secs) + return tf.train.MonitoredSession(session_creator=session_creator, hooks=hooks or [], + stop_grace_period_secs=stop_grace_period_secs) + + all_hooks = [] + if chief_only_hooks: + all_hooks.extend(chief_only_hooks) + + if not base_ckpt_dir or tf.train.latest_checkpoint(checkpoint_dir): + # if no base checkpoint or if checkpoint for the current model already exists + session_creator = tf.train.ChiefSessionCreator( + scaffold=scaffold, + checkpoint_dir=checkpoint_dir, + master=master, + config=config) + + else: # load variables from the base model's checkpoint + print("Loading the base model") + session_creator = TransferChiefSessionCreator( + scaffold=scaffold, + checkpoint_dir=base_ckpt_dir, + master=master, + config=config, + load_fc=load_fc) + + summary_dir = summary_dir or checkpoint_dir + if summary_dir: + if log_step_count_steps and log_step_count_steps > 0: + all_hooks.append( + tf.train.StepCounterHook( + output_dir=summary_dir, every_n_steps=log_step_count_steps)) + + if (save_summaries_steps and save_summaries_steps > 0) or ( + save_summaries_secs and save_summaries_secs > 0): + all_hooks.append(tf.train.SummarySaverHook( + scaffold=scaffold, + save_steps=save_summaries_steps, + save_secs=save_summaries_secs, + output_dir=summary_dir)) + + if checkpoint_dir: + if (save_checkpoint_secs and save_checkpoint_secs > 0) or ( + save_checkpoint_steps and save_checkpoint_steps > 0): + all_hooks.append(tf.train.CheckpointSaverHook( + checkpoint_dir, + save_steps=save_checkpoint_steps, + save_secs=save_checkpoint_secs, + scaffold=scaffold)) + + if hooks: + all_hooks.extend(hooks) + return tf.train.MonitoredSession(session_creator=session_creator, hooks=all_hooks, + stop_grace_period_secs=stop_grace_period_secs) + +class TransferChiefSessionCreator(tf.train.SessionCreator): + def __init__(self, + scaffold=None, + master='', + config=None, + checkpoint_dir=None, + checkpoint_filename_with_path=None, + load_fc=False): + """Initializes a chief session creator. + Args: + scaffold: A `Scaffold` used for gathering or building supportive ops. If + not specified a default one is created. It's used to finalize the graph. + master: `String` representation of the TensorFlow master to use. + config: `ConfigProto` proto used to configure the session. + checkpoint_dir: A string. Optional path to a directory where to restore + variables. + checkpoint_filename_with_path: Full file name path to the checkpoint file. 
+ """ + self._checkpoint_dir = checkpoint_dir + self._checkpoint_filename_with_path = checkpoint_filename_with_path + self._scaffold = scaffold or TransferScaffold() + self._session_manager = None + self._master = master + self._config = config + self._load_fc = load_fc + + def _get_session_manager(self): + if self._session_manager: + return self._session_manager + + self._session_manager = TransferSessionManager( + local_init_op=self._scaffold.local_init_op, + ready_op=self._scaffold.ready_op, + ready_for_local_init_op=self._scaffold.ready_for_local_init_op, + graph=tf.get_default_graph()) + return self._session_manager + + def create_session(self): + print('SCAFFOLD TYPE:', type(self._scaffold)) + self._scaffold.finalize() + # tf.get_default_graph()._unsafe_unfinalize() + + return self._get_session_manager().prepare_session( + self._master, + saver=self._scaffold.saver, + checkpoint_dir=self._checkpoint_dir, + checkpoint_filename_with_path=self._checkpoint_filename_with_path, + config=self._config, + init_op=self._scaffold.init_op, + init_feed_dict=self._scaffold.init_feed_dict, + init_fn=self._scaffold.init_fn, + load_fc=self._load_fc) + +class TransferScaffold(tf.train.Scaffold): + def finalize(self): + """Creates operations if needed and finalizes the graph.""" + if self._init_op is None: + def default_init_op(): + return tf.group( + tf.global_variables_initializer(), + resources.initialize_resources(resources.shared_resources())) + self._init_op = TransferScaffold.get_or_default( + 'init_op', + tf.GraphKeys.INIT_OP, + default_init_op) + if self._ready_op is None: + def default_ready_op(): + return tf.concat([ + tf.report_uninitialized_variables(), + resources.report_uninitialized_resources() + ], 0) + self._ready_op = TransferScaffold.get_or_default( + 'ready_op', tf.GraphKeys.READY_OP, + default_ready_op) + if self._ready_for_local_init_op is None: + def default_ready_for_local_init_op(): + return tf.report_uninitialized_variables( + tf.global_variables()) + self._ready_for_local_init_op = TransferScaffold.get_or_default( + 'ready_for_local_init_op', tf.GraphKeys.READY_FOR_LOCAL_INIT_OP, + default_ready_for_local_init_op) + if self._local_init_op is None: + self._local_init_op = TransferScaffold.get_or_default( + 'local_init_op', tf.GraphKeys.LOCAL_INIT_OP, + TransferScaffold.default_local_init_op) + if self._summary_op is None: + self._summary_op = TransferScaffold.get_or_default('summary_op', + tf.GraphKeys.SUMMARY_OP, + tf.summary.merge_all) + # pylint: disable=g-long-lambda + if self._saver is None: + self._saver = training_saver._get_saver_or_default() # pylint: disable=protected-access + # pylint: enable=g-long-lambda + self._saver.build() + + # ops.get_default_graph().finalize() + # logging.info('Graph was finalized.') + return self + +class TransferSessionManager(tf.train.SessionManager): + def _restore_checkpoint(self, + master, + sess, + saver=None, + checkpoint_dir=None, + checkpoint_filename_with_path=None, + wait_for_checkpoint=False, + max_wait_secs=7200, + config=None, + load_fc=False): + """Creates a `Session`, and tries to restore a checkpoint. + Args: + master: `String` representation of the TensorFlow master to use. + saver: A `Saver` object used to restore a model. + checkpoint_dir: Path to the checkpoint files. The latest checkpoint in the + dir will be used to restore. + checkpoint_filename_with_path: Full file name path to the checkpoint file. + wait_for_checkpoint: Whether to wait for checkpoint to become available. 
+ max_wait_secs: Maximum time to wait for checkpoints to become available. + config: Optional `ConfigProto` proto used to configure the session. + Returns: + A pair (sess, is_restored) where 'is_restored' is `True` if + the session could be restored, `False` otherwise. + Raises: + ValueError: If both checkpoint_dir and checkpoint_filename_with_path are + set. + """ + self._target = master + # sess = tf.Session(self._target, graph=self._graph, config=config) + + if checkpoint_dir and checkpoint_filename_with_path: + raise ValueError("Can not provide both checkpoint_dir and " + "checkpoint_filename_with_path.") + # If either saver or checkpoint_* is not specified, cannot restore. Just + # return. + print('checkpoint_dir', checkpoint_dir) + print('checkpoint_filename_with_path', checkpoint_filename_with_path) + if not saver or not (checkpoint_dir or checkpoint_filename_with_path): + return sess, False + + if checkpoint_filename_with_path: + # saver.restore(sess, checkpoint_filename_with_path) + restore_certain_variables(sess, checkpoint_filename_with_path) + return sess, True + + # Waits up until max_wait_secs for checkpoint to become available. + wait_time = 0 + ckpt = tf.train.get_checkpoint_state(checkpoint_dir) + while not ckpt or not ckpt.model_checkpoint_path: + if wait_for_checkpoint and wait_time < max_wait_secs: + tf.logging.info("Waiting for checkpoint to be available.") + time.sleep(self._recovery_wait_secs) + wait_time += self._recovery_wait_secs + ckpt = tf.train.get_checkpoint_state(checkpoint_dir) + else: + return sess, False + + # Loads the checkpoint. + ckpt_file = ckpt.model_checkpoint_path + restore_certain_variables(sess, ckpt_file) + saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths) + return sess, True + + def prepare_session(self, + master, + init_op=None, + saver=None, + checkpoint_dir=None, + checkpoint_filename_with_path=None, + wait_for_checkpoint=False, + max_wait_secs=7200, + config=None, + init_feed_dict=None, + init_fn=None, + load_fc=False): + """Creates a `Session`. Makes sure the model is ready to be used. + Creates a `Session` on 'master'. If a `saver` object is passed in, and + `checkpoint_dir` points to a directory containing valid checkpoint + files, then it will try to recover the model from checkpoint. If + no checkpoint files are available, and `wait_for_checkpoint` is + `True`, then the process would check every `recovery_wait_secs`, + up to `max_wait_secs`, for recovery to succeed. + If the model cannot be recovered successfully then it is initialized by + running the `init_op` and calling `init_fn` if they are provided. + The `local_init_op` is also run after init_op and init_fn, regardless of + whether the model was recovered successfully, but only if + `ready_for_local_init_op` passes. + If the model is recovered from a checkpoint it is assumed that all + global variables have been initialized, in particular neither `init_op` + nor `init_fn` will be executed. + It is an error if the model cannot be recovered and no `init_op` + or `init_fn` or `local_init_op` are passed. + Args: + master: `String` representation of the TensorFlow master to use. + init_op: Optional `Operation` used to initialize the model. + saver: A `Saver` object used to restore a model. + checkpoint_dir: Path to the checkpoint files. The latest checkpoint in the + dir will be used to restore. + checkpoint_filename_with_path: Full file name path to the checkpoint file. + wait_for_checkpoint: Whether to wait for checkpoint to become available. 
+ max_wait_secs: Maximum time to wait for checkpoints to become available.
+ config: Optional `ConfigProto` proto used to configure the session.
+ init_feed_dict: Optional dictionary that maps `Tensor` objects to feed
+ values. This feed dictionary is passed to the session `run()` call when
+ running the init op.
+ init_fn: Optional callable used to initialize the model. Called after the
+ optional `init_op` is called. The callable must accept one argument,
+ the session being initialized.
+ Returns:
+ A `Session` object that can be used to drive the model.
+ Raises:
+ RuntimeError: If the model cannot be initialized or recovered.
+ ValueError: If both checkpoint_dir and checkpoint_filename_with_path are
+ set.
+ """
+ sess = tf.Session(master, graph=self._graph, config=config)
+ if init_op is None and not init_fn and self._local_init_op is None:
+ raise RuntimeError("Model is not initialized and no init_op or "
+ "init_fn or local_init_op was given")
+ if init_op is not None:
+ sess.run(init_op, feed_dict=init_feed_dict)
+ if init_fn:
+ init_fn(sess)
+ sess.run(tf.local_variables_initializer())  # local variables have to exist before the partial restore below
+ print("LOCAL INIT OP", self._local_init_op)
+ sess, is_loaded_from_checkpoint = self._restore_checkpoint(
+ master,
+ sess,
+ saver,
+ checkpoint_dir=checkpoint_dir,
+ checkpoint_filename_with_path=checkpoint_filename_with_path,
+ wait_for_checkpoint=wait_for_checkpoint,
+ max_wait_secs=max_wait_secs,
+ config=config,
+ load_fc=load_fc)
+
+
+ local_init_success, msg = self._try_run_local_init_op(sess)
+ if not local_init_success:
+ raise RuntimeError(
+ "Init operations did not make model ready for local_init. "
+ "Init op: %s, init fn: %s, error: %s" % (_maybe_name(init_op),
+ init_fn,
+ msg))
+
+ is_ready, msg = self._model_ready(sess)
+ if not is_ready:
+ raise RuntimeError(
+ "Init operations did not make model ready. "
+ "Init op: %s, init fn: %s, local_init_op: %s, error: %s" %
+ (_maybe_name(init_op), init_fn, self._local_init_op, msg))
+ return sess
+
+def _restore_embed(embed_var, var_to_shape_map, reader):
+ has_embed = len([var for var in var_to_shape_map if 'EmbeddingMatrix' in var]) > 0
+ if has_embed:
+ return None, False # assume the embedding is stored under the same name
+ for var in var_to_shape_map:
+ if var.endswith('dense/kernel') and var_to_shape_map[var] == tf.transpose(embed_var).shape:
+ print('Assigning', var, 'to', embed_var.name)
+ return embed_var.assign(reader.get_tensor(var).T), True
+ return None, False
+
+def restore_certain_variables(sess, filename):
+ print('Restoring only the variables found in the checkpoint')
+ trainables = {v.name: v for v in tf.trainable_variables()}
+ assign_ops = []
+ vars_to_initialize = []
+
+ try:
+ reader = tf.train.NewCheckpointReader(filename)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ non_loss_var = {var: var_to_shape_map[var] for var in var_to_shape_map if 'Loss_Optimization' not in var}
+ for var in var_to_shape_map:
+ if 'global_step' in var:
+ print('Restoring from the step', reader.get_tensor(var))
+ for name in trainables:
+ idx = name.find(":")
+ if idx != -1:
+ true_name = name[:idx]
+ # if name.endswith(':0'):
+ # true_name = name[:-2]
+ if true_name in var_to_shape_map and trainables[name].shape == var_to_shape_map[true_name]:
+ print('Restoring value to', true_name)
+ assign_ops.append(trainables[name].assign(reader.get_tensor(true_name)))
+ if 'EmbeddingMatrix' in true_name:
+ embed_op, has_embed_op = _restore_embed(trainables[name], var_to_shape_map, reader)
+ if has_embed_op:
+ assign_ops.append(embed_op)
+
+ print('assign_ops', assign_ops)
+ except Exception as e: # pylint: disable=broad-except
+ print(str(e))
+ if "corrupted compressed block contents" in str(e):
+ print("It's likely that your checkpoint file has been compressed "
+ "with SNAPPY.")
+ if ("Data loss" in str(e) and
+ (any([ext in filename for ext in [".index", ".meta", ".data"]]))):
+ proposed_file = ".".join(filename.split(".")[0:-1])
+ v2_file_error_template = """
+ It's likely that this is a V2 checkpoint and you need to provide the filename
+ *prefix*. Try removing the '.' and extension. Try:
+ inspect checkpoint --file_name = {}"""
+ print(v2_file_error_template.format(proposed_file))
+ sess.run(assign_ops)
+
+def _maybe_name(obj):
+ """Returns object name if it has one, or a message otherwise.
+ This is useful for names that appear in error messages.
+ Args:
+ obj: Object to get the name of.
+ Returns:
+ name, "None", or a "no name" message.
+ """ + if obj is None: + return "None" + elif hasattr(obj, "name"): + return obj.name + else: + return "" % type(obj) diff --git a/open_seq2seq/utils/hooks.py b/open_seq2seq/utils/hooks.py index f8222bd17..c8a65bb12 100644 --- a/open_seq2seq/utils/hooks.py +++ b/open_seq2seq/utils/hooks.py @@ -143,12 +143,14 @@ def after_run(self, run_context, run_values): loss = results[0] if not self._model.on_horovod or self._model.hvd.rank() == 0: if self._print_ppl: - deco_print("loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}" + deco_print("Train loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}" .format(loss, math.exp(loss), loss/math.log(2)), start="", end=", ") else: - deco_print("loss: {:.4f} ".format(loss), start="", end=", ") + deco_print( + "Train loss: {:.4f} ".format(loss), + offset=4) tm = (time.time() - self._last_time) / self._every_steps m, s = divmod(tm, 60) @@ -199,6 +201,7 @@ def after_run(self, run_context, run_values): self._model, run_context.session, mode="eval", compute_loss=True, ) + if not self._model.on_horovod or self._model.hvd.rank() == 0: if self._print_ppl: deco_print("Validation loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}" diff --git a/open_seq2seq/utils/metrics.py b/open_seq2seq/utils/metrics.py new file mode 100644 index 000000000..63861ed6b --- /dev/null +++ b/open_seq2seq/utils/metrics.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 NVIDIA Corporation +''' +This file implements function to calcuate basic metrics. +''' +import numpy as np +import tensorflow as tf + +def true_positives(labels, preds): + return np.sum(np.logical_and(labels, preds)) + +def accuracy(labels, preds): + return np.sum(np.equal(labels, preds)) / len(preds) + +def recall(labels, preds): + return true_positives(labels, preds) / np.sum(labels) + +def precision(labels, preds): + return true_positives(labels, preds) / np.sum(preds) + +def f1(labels, preds): + rec = recall(labels, preds) + pre = precision(labels, preds) + if rec == 0 or pre == 0: + return 0 + return 2 * rec * pre / (rec + pre) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index 2d9a0bde9..18038b275 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -197,8 +197,7 @@ def iterate_data(model, sess, compute_loss, mode, verbose, num_steps=None): if verbose: if size_defined: - data_size = int(np.sum(np.ceil(np.array(dl_sizes) / - model.params['batch_size_per_gpu']))) + data_size = int(np.sum(np.ceil(np.array(dl_sizes) / batch_size))) if step == 0 or len(fetches_vals) == 0 or \ (data_size > 10 and processed_batches % (data_size // 10) == 0): deco_print("Processed {}/{} batches{}".format( @@ -229,6 +228,7 @@ def iterate_data(model, sess, compute_loss, mode, verbose, num_steps=None): else: deco_print("Not enough steps for benchmarking{}".format(ending)) + if compute_loss: return results_per_batch, total_loss, np.sum(total_samples) else: @@ -486,7 +486,7 @@ def get_base_config(args): parser.add_argument("--mode", default='train', help="Could be \"train\", \"eval\", " "\"train_eval\" or \"infer\"") - parser.add_argument("--infer_output_file", + parser.add_argument("--infer_output_file", default='infer-out.txt', help="Path to the output of inference") parser.add_argument('--continue_learning', dest='continue_learning', action='store_true', help="whether to continue learning") @@ -568,6 +568,7 @@ def check_logdir(args, base_config, restore_best_checkpoint=False): ckpt_dir = os.path.join(logdir, 'logs') else: ckpt_dir = logdir + if args.mode == 'train' or args.mode == 'train_eval': if os.path.isfile(logdir): raise 
IOError("There is a file with the same name as \"logdir\" " @@ -595,11 +596,16 @@ def check_logdir(args, base_config, restore_best_checkpoint=False): elif (args.mode == 'infer' or args.mode == 'eval' or args.mode == 'interactive_infer'): if os.path.isdir(logdir) and os.listdir(logdir) != []: - if restore_best_checkpoint: - deco_print("Restoring from best checkpoint") - checkpoint = tf.train.latest_checkpoint(ckpt_dir + '/best_models') + # if os.path.isdir(logdir) and 'checkpoint' in os.listdir(logdir): + best_ckpt_dir = os.path.join(ckpt_dir, 'best_models') + if restore_best_checkpoint and os.path.isdir(best_ckpt_dir): + deco_print("Restoring from the best checkpoint") + checkpoint = tf.train.latest_checkpoint(best_ckpt_dir) + ckpt_dir = best_ckpt_dir else: + deco_print("Restoring from the latest checkpoint") checkpoint = tf.train.latest_checkpoint(ckpt_dir) + if checkpoint is None: raise IOError( "There is no valid TensorFlow checkpoint in the " @@ -620,6 +626,40 @@ def check_logdir(args, base_config, restore_best_checkpoint=False): return checkpoint +def check_base_model_logdir(base_logdir, restore_best_checkpoint=False): + """A helper function that ensures the logdir is setup correctly + + Args: + args (dict): Dictionary as returned from get_base_config() + base_config (dict): Dictionary as returned from get_base_config() + restore_best_checkpoint (bool): If True, will look for ckpt_dir + /best_models + Returns: + checkpoint: Either None if continue-learning is not set and training, or + the name of the checkpoint used to restore the model + """ + # checking that everything is correct with log directory + if not base_logdir: + return '' + + if (not os.path.isdir(base_logdir)) or len(os.listdir(base_logdir)) == 0: + raise IOError("The log directory for the base model is empty or does not exist.") + + ckpt_dir = os.path.join(base_logdir, 'logs') + if not os.path.isdir(ckpt_dir): + raise IOError("There's no folder 'logs' in the base model logdir. \ + If checkpoints exist, put them in the 'logs' folder.") + + if restore_best_checkpoint and os.path.isdir(os.path.join(ckpt_dir, 'best_models')): + ckpt_dir = os.path.join(ckpt_dir, 'best_models') + + checkpoint = tf.train.latest_checkpoint(ckpt_dir) + if checkpoint is None: + raise IOError( + "There is no valid TensorFlow checkpoint in the \ + {} directory. Can't load model".format(ckpt_dir)) + + return ckpt_dir + def create_logdir(args, base_config): """A helper function that ensures the logdir and log files are setup corretly. Only called in --enable_logs is set. @@ -663,7 +703,7 @@ def create_logdir(args, base_config): return old_stdout, old_stderr, stdout_log, stderr_log -def create_model(args, base_config, config_module, base_model, hvd): +def create_model(args, base_config, config_module, base_model, hvd, restore_best_checkpoint=False): """A helpful function that creates the train, eval, and infer models as needed. 
@@ -749,7 +789,9 @@ def create_model(args, base_config, config_module, base_model, hvd): model.compile(force_var_reuse=False) else: model = base_model(params=infer_config, mode=args.mode, hvd=hvd) - checkpoint = check_logdir(args, base_config) + if base_config['logdir'].endswith('logs'): + base_config['logdir'] = base_config['logdir'][:-5] + checkpoint = check_logdir(args, base_config, restore_best_checkpoint) model.compile(checkpoint=checkpoint, use_trt=args.use_trt, precision=args.precision) return model diff --git a/requirements.txt b/requirements.txt index 7848adc17..7a2fad808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ librosa==0.6.1 matplotlib joblib==0.11 sentencepiece -sacrebleu +sacrebleu \ No newline at end of file diff --git a/run.py b/run.py index 804ea3353..cba148e2d 100644 --- a/run.py +++ b/run.py @@ -8,7 +8,7 @@ import sys import tensorflow as tf from open_seq2seq.utils.utils import deco_print, get_base_config, check_logdir,\ - create_logdir, create_model + create_logdir, create_model, check_base_model_logdir from open_seq2seq.utils import train, infer, evaluate def main(): @@ -34,6 +34,15 @@ def main(): else: hvd = None + load_model = base_config.get('load_model', None) + restore_best_checkpoint = base_config.get('restore_best_checkpoint', False) + + + base_ckpt_dir = check_base_model_logdir(load_model, restore_best_checkpoint) + base_config['load_model'] = base_ckpt_dir + + # Check logdir and create it if necessary + checkpoint = check_logdir(args, base_config, restore_best_checkpoint) if args.enable_logs: if hvd is None or hvd.rank() == 0: @@ -46,7 +55,10 @@ def main(): if args.mode == 'train' or args.mode == 'train_eval' or args.benchmark: if hvd is None or hvd.rank() == 0: if checkpoint is None or args.benchmark: - deco_print("Starting training from scratch") + if base_ckpt_dir: + deco_print("Starting training from the base model") + else: + deco_print("Starting training from scratch") else: deco_print( "Restored checkpoint from {}. 
Resuming training".format(checkpoint), @@ -57,7 +69,7 @@ def main(): # Create model and train/eval/infer with tf.Graph().as_default(): - model = create_model(args, base_config, config_module, base_model, hvd) + model = create_model(args, base_config, config_module, base_model, hvd, restore_best_checkpoint) if args.mode == "train_eval": train(model[0], model[1], debug_port=args.debug_port) elif args.mode == "train": diff --git a/scripts/get_best_accuracy.py b/scripts/get_best_accuracy.py new file mode 100644 index 000000000..87d040aaa --- /dev/null +++ b/scripts/get_best_accuracy.py @@ -0,0 +1,46 @@ +''' +Return the best evaluation accuracy from a file +output-ed by the sentiment analysis model +''' +import sys + +def get_best_accuracy(output_file): + output = open(output_file, 'r') + keyword = "*** EVAL Accuracy: " + best_acc = 0.0 + loss, stat, step = '', '', '' + get_stat = False + n = len(keyword) + m = len("*** Validation loss: ") + last = '' + get_step = False + for line in output.readlines(): + line = line.strip() + if get_stat: + stat = line + get_stat = False + get_step = True + elif get_step: + step = line + get_step = False + else: + idx = line.find(keyword) + if idx != -1: + acc = float(line[n:]) + if acc > best_acc: + best_acc = acc + loss = last + get_stat = True + last = line + + + print("*** Best accuracy:", str(best_acc)) + print(loss) + print(stat) + print(step) + +if __name__ == '__main__': + if len(sys.argv) < 2: + raise ValueError('No output file provided to analyze') + output_file = sys.argv[1] + get_best_accuracy(output_file) \ No newline at end of file diff --git a/scripts/multi-bleu.perl b/scripts/multi-bleu.perl old mode 100644 new mode 100755