diff --git a/.gitignore b/.gitignore index fb220d169..9964b7635 100644 --- a/.gitignore +++ b/.gitignore @@ -87,4 +87,4 @@ ENV/ .spyderproject # Rope project settings -.ropeproject +.ropeproject \ No newline at end of file diff --git a/example_configs/lm/lstm-test-small-mixed.py b/example_configs/lm/lstm-test-small-mixed.py new file mode 100644 index 000000000..52b54d138 --- /dev/null +++ b/example_configs/lm/lstm-test-small-mixed.py @@ -0,0 +1,131 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import WKTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WikiText-2-raw DATA]" +processed_data_folder = 'wkt2-processed-folder' +base_model = LSTMLM +bptt = 12 +steps = 40 + +base_params = { + "restore_best_checkpoint": True, + "processed_data_folder": processed_data_folder, + "use_horovod": False, + "num_gpus": 2, + + "batch_size_per_gpu": 160, + "num_epochs": 1500, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "logdir": "LSTM-FP32-2GPU-SMALL-MIXED", + "eval_steps": steps * 2, + + "optimizer": "Adam", + "optimizer_params": {}, + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 9e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + # "dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, + "encoder_layers": 2, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.6, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.6, + "recurrent_keep_prob": 0.7, + 'encoder_emb_keep_prob': 0.37, + "encoder_use_skip_connections": False, + "emb_size": 64, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": False, + "average_across_timestep": True, + "do_mask": False, + } +} + +train_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "data_root": data_root, + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "rand_start": True, + "shuffle": False, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "small": True, + }, +} +eval_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "bptt": bptt, + "small": True, + }, +} + +infer_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + 
"shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "seed_tokens": "something The only game", + }, +} diff --git a/example_configs/lm/lstm-test-small.py b/example_configs/lm/lstm-test-small.py new file mode 100644 index 000000000..860f228aa --- /dev/null +++ b/example_configs/lm/lstm-test-small.py @@ -0,0 +1,132 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import WKTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WikiText-2-raw DATA]" +processed_data_folder = 'wkt2-processed-data' + +base_model = LSTMLM +bptt = 12 +steps = 10 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": False, + "num_gpus": 2, + + "batch_size_per_gpu": 160, + "num_epochs": 1500, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "processed_data_folder": processed_data_folder, + "logdir": "LSTM-FP32-2GPU-SMALL", + "eval_steps": steps * 2, + + "optimizer": "Adam", + "optimizer_params": {}, + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 9e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "dtype": tf.float32, + # "dtype": "mixed", + # "loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, + "encoder_layers": 2, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.6, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.6, + "recurrent_keep_prob": 0.7, + 'encoder_emb_keep_prob': 0.37, + "encoder_use_skip_connections": False, + "emb_size": 64, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": False, + "average_across_timestep": True, + "do_mask": False, + } +} + +train_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "data_root": data_root, + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "rand_start": True, + "shuffle": False, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "small": True, + }, +} +eval_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "bptt": bptt, + "small": True, + }, +} + +infer_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "processed_data_folder": processed_data_folder, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + 
"map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "seed_tokens": "something The only game", + }, +} diff --git a/example_configs/lm/lstm-wkt103-mixed.py b/example_configs/lm/lstm-wkt103-mixed.py new file mode 100644 index 000000000..f077dd2f7 --- /dev/null +++ b/example_configs/lm/lstm-wkt103-mixed.py @@ -0,0 +1,128 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import WKTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSampledSequenceLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WikiText-103-raw DATA]" +processed_data_folder = 'wkt103-processed-data' + +base_model = LSTMLM +bptt = 96 +steps = 40 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": True, + "num_gpus": 8, + + "batch_size_per_gpu": 224, + "eval_batch_size_per_gpu": 56, + "num_epochs": 1500, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "logdir": "LSTM-WKT103-MIXED", + "processed_data_folder": processed_data_folder, + "eval_steps": steps * 4, + + "optimizer": "Adam", + "optimizer_params": {}, + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-3 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + # "dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.85, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.85, + "recurrent_keep_prob": 0.7, + 'encoder_emb_keep_prob': 0.8, + "encoder_use_skip_connections": False, + "emb_size": 320, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + "num_sampled": 8192, + }, + + "decoder": FakeDecoder, + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": BasicSampledSequenceLoss, + "loss_params": { + "offset_target_by_one": False, + "average_across_timestep": True, + "do_mask": False, + } +} + +train_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "rand_start": True, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + }, +} +eval_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "bptt": bptt, + }, +} + +infer_params = { + "data_layer": WKTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "bptt": bptt, + "seed_tokens": "something The only game", + }, +} \ No newline at end of 
file diff --git a/example_configs/lstmlm/lstmlm-test.py b/example_configs/lm/lstm-wkt2-fp32.py similarity index 61% rename from example_configs/lstmlm/lstmlm-test.py rename to example_configs/lm/lstm-wkt2-fp32.py index 187373a44..e80d6d53f 100644 --- a/example_configs/lstmlm/lstmlm-test.py +++ b/example_configs/lm/lstm-wkt2-fp32.py @@ -2,78 +2,68 @@ from open_seq2seq.models import LSTMLM from open_seq2seq.encoders import LMEncoder -# from open_seq2seq.encoders import BidirectionalRNNEncoderWithEmbedding from open_seq2seq.decoders import FakeDecoder -from open_seq2seq.data import LMTextDataLayer, LMTextDataLayerGenerate +from open_seq2seq.data import WKTDataLayer from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell -# from open_seq2seq.losses import CrossEntropyLoss from open_seq2seq.losses import BasicSequenceLoss from open_seq2seq.optimizers.lr_policies import fixed_lr -# from open_seq2seq.data.text2text.text2text import SpecialTextTokens -# from open_seq2seq.optimizers.lr_policies import exp_decay -data_root = "[REPLACE THIS TO THE PATH WITH YOUR WIKITEXT DATA]" +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WikiText-2-raw DATA]" +processed_data_folder = 'wkt2-processed-data' base_model = LSTMLM -bptt = 72 +bptt = 96 steps = 40 base_params = { - "restore_best_checkpoint": True, # best checkpoint is only saved when using train_eval mode - "use_horovod": False, + "restore_best_checkpoint": True, + "use_horovod": True, "num_gpus": 2, - "batch_size_per_gpu": 160, + "batch_size_per_gpu": 160, "num_epochs": 1500, "save_summaries_steps": steps, "print_loss_steps": steps, "print_samples_steps": steps, "save_checkpoint_steps": steps, - "logdir": "LSTM-FP32-2GPU", + "logdir": "LSTM-WKT2-FP32", + "processed_data_folder": processed_data_folder, "eval_steps": steps * 2, - "optimizer": "Adam", # need to change to NT-ASGD + "optimizer": "Adam", "optimizer_params": {}, - # luong10 decay scheme "lr_policy": fixed_lr, "lr_policy_params": { - "learning_rate": 9e-4 + "learning_rate": 4e-4 }, - "summaries": ['learning_rate', 'variables', 'gradients', 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - + # "max_grad_norm": 0.25, "dtype": tf.float32, #"dtype": "mixed", #"automatic_loss_scaling": "Backoff", "encoder": LMEncoder, - # "encoder": BidirectionalRNNEncoderWithEmbedding, - "encoder_params": { # will need to update + "encoder_params": { "initializer": tf.random_uniform_initializer, - "initializer_params": { # need different initializers for embeddings and for weights + "initializer_params": { "minval": -0.1, "maxval": 0.1, }, "core_cell": WeightDropLayerNormBasicLSTMCell, "core_cell_params": { - "num_units": 800, # paper 1150 - "forget_bias": 1.0, - }, - "last_cell_params": { - "num_units": 320, + "num_units": 896, "forget_bias": 1.0, }, "encoder_layers": 3, "encoder_dp_input_keep_prob": 1.0, - "encoder_dp_output_keep_prob": 0.6, # output dropout for middle layer 0.3 + "encoder_dp_output_keep_prob": 0.6, "encoder_last_input_keep_prob": 1.0, - "encoder_last_output_keep_prob": 0.6, # output droput at last layer is 0.4 + "encoder_last_output_keep_prob": 0.6, "recurrent_keep_prob": 0.7, 'encoder_emb_keep_prob': 0.37, "encoder_use_skip_connections": False, - "emb_size": 320, - "vocab_size": 33278, + "emb_size": 256, "num_tokens_gen": 10, "sampling_prob": 0.0, # 0 is always use the ground truth "fc_use_bias": True, @@ -81,14 +71,13 @@ "awd_initializer": False, }, - "decoder": FakeDecoder, # need a new decoder with AR and TAR + "decoder": FakeDecoder, "regularizer": 
tf.contrib.layers.l2_regularizer, "regularizer_params": { - 'scale': 2e-6, # alpha + 'scale': 2e-6, }, - # "loss": CrossEntropyLoss, # will need to write new loss + regularizer "loss": BasicSequenceLoss, "loss_params": { "offset_target_by_one": False, @@ -98,12 +87,12 @@ } train_params = { - "data_layer": LMTextDataLayer, + "data_layer": WKTDataLayer, "data_layer_params": { "data_root": data_root, "pad_vocab_to_eight": False, "rand_start": True, - "shuffle": False, + "shuffle": True, "shuffle_buffer_size": 25000, "repeat": True, "map_parallel_calls": 16, @@ -112,9 +101,8 @@ }, } eval_params = { - "data_layer": LMTextDataLayer, + "data_layer": WKTDataLayer, "data_layer_params": { - # "data_root": data_root, "pad_vocab_to_eight": False, "shuffle": False, "repeat": False, @@ -125,9 +113,8 @@ } infer_params = { - "data_layer": LMTextDataLayer, + "data_layer": WKTDataLayer, "data_layer_params": { - # "data_root": data_root, "pad_vocab_to_eight": False, "shuffle": False, "repeat": False, @@ -137,4 +124,4 @@ "bptt": bptt, "seed_tokens": "something The only game", }, -} +} \ No newline at end of file diff --git a/example_configs/text2text/toy-reversal/nmt-reversal-CR.py b/example_configs/text2text/toy-reversal/nmt-reversal-CR.py index 30abd7ad5..71c439968 100644 --- a/example_configs/text2text/toy-reversal/nmt-reversal-CR.py +++ b/example_configs/text2text/toy-reversal/nmt-reversal-CR.py @@ -162,4 +162,5 @@ "delimiter": " ", "special_tokens_already_in_vocab": False, }, + } diff --git a/example_configs/transfer/imdb-from-scratch.py b/example_configs/transfer/imdb-from-scratch.py new file mode 100644 index 000000000..e59752552 --- /dev/null +++ b/example_configs/transfer/imdb-from-scratch.py @@ -0,0 +1,130 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import IMDBDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR IMDB DATA]" +processed_data_folder = 'imdb-processed-data-wkt2' + +base_model = LSTMLM +max_length = 256 +binary = True +steps = 40 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": False, + "num_gpus": 2, + + "batch_size_per_gpu": 160, + "num_epochs": 1500, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "logdir": "IMDB-START", + "lm_vocab_file": 'wkt2-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "processed_data_folder": processed_data_folder, + "eval_steps": steps * 2, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 9e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"automatic_loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 896, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + 
"encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.6, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.6, + "recurrent_keep_prob": 0.7, + 'encoder_emb_keep_prob': 0.37, + "encoder_use_skip_connections": False, + "emb_size": 256, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + }, +} +eval_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "binary": binary, + "max_length": max_length, + }, +} + +infer_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + }, +} \ No newline at end of file diff --git a/example_configs/transfer/imdb-wkt103.py b/example_configs/transfer/imdb-wkt103.py new file mode 100644 index 000000000..8deb1d351 --- /dev/null +++ b/example_configs/transfer/imdb-wkt103.py @@ -0,0 +1,135 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import IMDBDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR IMDB DATA]" +processed_data_folder = 'imdb-processed-data-wkt103' + +base_model = LSTMLM +max_length = 256 +binary = True +steps = 10 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": False, + "num_gpus": 1, + + "batch_size_per_gpu": 16, + "eval_batch_size_per_gpu": 64, + "num_epochs": 100, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "load_model": "WKT103-CPT", + "logdir": "IMDB-WKT103-EXP1", + "lm_vocab_file": 'wkt103-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "processed_data_folder": processed_data_folder, + "eval_steps": steps, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + 
"num_units": 1024, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.8, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.8, + "recurrent_keep_prob": 1.0, + 'encoder_emb_keep_prob': 0.6, + "encoder_use_skip_connections": False, + "emb_size": 256, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + "get_stats": True, + # "small": True, + }, +} +eval_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "binary": binary, + "max_length": max_length, + # "small": True, + }, +} + +infer_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + }, +} diff --git a/example_configs/transfer/imdb-wkt2.py b/example_configs/transfer/imdb-wkt2.py new file mode 100644 index 000000000..e162bfa01 --- /dev/null +++ b/example_configs/transfer/imdb-wkt2.py @@ -0,0 +1,133 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import IMDBDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR IMDB DATA]" +processed_data_folder = 'imdb-processed-data-wkt2' + +base_model = LSTMLM +max_length = 256 +binary = True +steps = 5 + +base_params = { + "restore_best_checkpoint": True, + "use_horovod": False, + "num_gpus": 2, + + "batch_size_per_gpu": 16, + "num_epochs": 25, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "load_model": "AWDLSTM-EXP69", + "logdir": "IMDB-WKT2", + "lm_vocab_file": 'wkt2-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "processed_data_folder": processed_data_folder, + "eval_steps": steps * 2, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-5 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": 
WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 896, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 1.0, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 1.0, + "recurrent_keep_prob": 1.0, + 'encoder_emb_keep_prob': 1.0, + "encoder_use_skip_connections": False, + "emb_size": 256, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-4, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + # "small": True, + }, +} +eval_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "binary": binary, + "max_length": max_length, + # "small": True, + }, +} + +infer_params = { + "data_layer": IMDBDataLayer, + "data_layer_params": { + # "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "binary": binary, + "max_length": max_length, + }, +} \ No newline at end of file diff --git a/example_configs/transfer/sst-wkt2-small.py b/example_configs/transfer/sst-wkt2-small.py new file mode 100644 index 000000000..c6edf7fe0 --- /dev/null +++ b/example_configs/transfer/sst-wkt2-small.py @@ -0,0 +1,129 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import SSTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSequenceLoss, CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +base_model = LSTMLM +steps = 10 + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR SST DATA]" +processed_data_folder = 'sst-processed-data-wkt2' +binary = True +max_length = 96 + +base_params = { + "restore_best_checkpoint": True, # best checkpoint is only saved when using train_eval mode + "use_horovod": False, + "num_gpus": 1, + + "batch_size_per_gpu": 20, + "eval_batch_size_per_gpu": 80, + "num_epochs": 120, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "load_model": "LSTM-FP32-2GPU-SMALL", + "lm_vocab_file": 'wkt2-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "logdir": "SST-WKT2-SMALL", + "processed_data_folder": processed_data_folder, + "eval_steps": steps, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": 
"Backoff", + "encoder": LMEncoder, + "encoder_params": { # will need to update + "initializer": tf.random_uniform_initializer, + "initializer_params": { # need different initializers for embeddings and for weights + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.8, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.8, + "recurrent_keep_prob": 1.0, + 'encoder_emb_keep_prob': 0.7, + "encoder_use_skip_connections": False, + "emb_size": 64, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + "use_cell_state": True, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": max_length, + "get_stats": True, + }, +} +eval_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": max_length, + }, +} + +infer_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": max_length, + }, +} \ No newline at end of file diff --git a/example_configs/transfer/sst-wkt2.py b/example_configs/transfer/sst-wkt2.py new file mode 100644 index 000000000..a5741aed4 --- /dev/null +++ b/example_configs/transfer/sst-wkt2.py @@ -0,0 +1,128 @@ +import tensorflow as tf + +from open_seq2seq.models import LSTMLM +from open_seq2seq.encoders import LMEncoder +from open_seq2seq.decoders import FakeDecoder +from open_seq2seq.data import SSTDataLayer +from open_seq2seq.parts.rnns.weight_drop import WeightDropLayerNormBasicLSTMCell +from open_seq2seq.losses import BasicSequenceLoss, CrossEntropyLoss +from open_seq2seq.optimizers.lr_policies import fixed_lr + +base_model = LSTMLM +steps = 10 + +data_root = "[REPLACE THIS TO THE PATH WITH YOUR SST DATA]" +processed_data_folder = 'sst-processed-data-wkt2' +binary = True +max_length = 96 + +base_params = { + "restore_best_checkpoint": True, # best checkpoint is only saved when using train_eval mode + "use_horovod": False, + "num_gpus": 1, + + "batch_size_per_gpu": 20, + "eval_batch_size_per_gpu": 80, + "num_epochs": 120, + "save_summaries_steps": steps, + "print_loss_steps": steps, + "print_samples_steps": steps, + "save_checkpoint_steps": steps, + "load_model": "AWDLSTM-EXP69", + "lm_vocab_file": 'wkt2-processed-data/vocab.txt', + # "lm_vocab_file": '[LINK TO THE VOCAB FILE IN THE PROCESSED DATA USED TO TRAIN THE BASE LM]' + "logdir": "SST-WKT2-EXP10", + "processed_data_folder": processed_data_folder, + "eval_steps": steps, + + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + + "lr_policy": fixed_lr, + "lr_policy_params": { + "learning_rate": 1e-4 + }, + + "summaries": ['learning_rate', 'variables', 'gradients', + 'variable_norm', 
'gradient_norm', 'global_gradient_norm'], + # "max_grad_norm": 0.25, + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", + "encoder": LMEncoder, + "encoder_params": { # will need to update + "initializer": tf.random_uniform_initializer, + "initializer_params": { # need different initializers for embeddings and for weights + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": WeightDropLayerNormBasicLSTMCell, + "core_cell_params": { + "num_units": 896, + "forget_bias": 1.0, + }, + "encoder_layers": 3, + "encoder_dp_input_keep_prob": 1.0, + "encoder_dp_output_keep_prob": 0.8, + "encoder_last_input_keep_prob": 1.0, + "encoder_last_output_keep_prob": 0.8, + "recurrent_keep_prob": 1.0, + 'encoder_emb_keep_prob': 0.7, + "encoder_use_skip_connections": False, + "emb_size": 256, + "num_tokens_gen": 10, + "sampling_prob": 0.0, # 0 is always use the ground truth + "fc_use_bias": True, + "weight_tied": True, + "awd_initializer": False, + "use_cell_state": True, + }, + + "decoder": FakeDecoder, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 2e-6, + }, + + "loss": CrossEntropyLoss, +} + +train_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "data_root": data_root, + "pad_vocab_to_eight": False, + "shuffle": True, + "shuffle_buffer_size": 25000, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": max_length, + }, +} +eval_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": max_length, + }, +} + +infer_params = { + "data_layer": SSTDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "shuffle": False, + "repeat": False, + "rand_start": False, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": max_length, + }, +} \ No newline at end of file diff --git a/open_seq2seq/data/__init__.py b/open_seq2seq/data/__init__.py index 8d6ec1775..3f9450aa6 100644 --- a/open_seq2seq/data/__init__.py +++ b/open_seq2seq/data/__init__.py @@ -2,5 +2,5 @@ from .data_layer import DataLayer from .speech2text.speech2text import Speech2TextDataLayer from .image2label.image2label import ImagenetDataLayer -from .lm.lmdata import LMTextDataLayer, LMTextDataLayerGenerate +from .lm.lmdata import WKTDataLayer, IMDBDataLayer, SSTDataLayer from .text2speech.text2speech import Text2SpeechDataLayer diff --git a/open_seq2seq/data/lm/lmdata.py b/open_seq2seq/data/lm/lmdata.py index e63a4680b..cefffa365 100644 --- a/open_seq2seq/data/lm/lmdata.py +++ b/open_seq2seq/data/lm/lmdata.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017 NVIDIA Corporation +# Copyright (c) 2018 NVIDIA Corporation import random import numpy as np @@ -9,15 +9,21 @@ from open_seq2seq.data.utils import load_pre_existing_vocabulary, pad_vocab_to_eight from open_seq2seq.data.text2text.t2t import _read_and_batch_from_files -from open_seq2seq.data.lm.lmutils import Corpus - -class LMTextDataLayer(DataLayer): +from open_seq2seq.data.lm.lmutils import Dictionary, Corpus, IMDBCorpus, SSTCorpus + +class WKTDataLayer(DataLayer): + ''' + WKTDataLayer does the necessary pre-processing to make the WikiText datasets + ready to be fed into the model. We use the ``word_token`` method + available in the ``nltk`` package. 
+ You can download the datasets here: + https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/ + bptt: backpropagation through time - the length of the sequences used for training + rand_start: whether to start from a random starting index between (0, bptt) + ''' @staticmethod def get_required_params(): return dict(DataLayer.get_required_params(), **{ - # 'content_file': str, - # 'vocab_file': str, - 'shuffle': bool, 'repeat': bool, 'bptt': int, }) @@ -30,7 +36,6 @@ def get_optional_params(): 'small': bool, 'use_targets': bool, 'delimiter': str, - 'target_file': str, 'map_parallel_calls': int, 'prefetch_buffer_size': int, 'pad_lengths_to_eight': bool, @@ -41,21 +46,33 @@ def get_optional_params(): }) def __init__(self, params, model, num_workers=1, worker_id=0): - super(LMTextDataLayer, self).__init__(params, model, + super(WKTDataLayer, self).__init__(params, model, num_workers, worker_id) - self._processed_data_folder = self.params.get('processed_data_folder', 'processed_data') + self._processed_data_folder = self.params.get('processed_data_folder', 'wkt-processed_data') self._data_root = self.params.get('data_root', None) + self.corp = Corpus(self._data_root, self._processed_data_folder) + + seed_tokens = self.params.get('seed_tokens', 'The').split() + + self.end_token = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] + self.params['seed_tokens'] = [self.corp.dictionary.word2idx[seed_token] for seed_token in seed_tokens] + + if self.params['mode'] == 'infer': + self.corp.content = self.params['seed_tokens'] + if self.params['mode'] == 'train': - self._batch_size = self.params['batch_size'] + self.batch_size = self.params['batch_size'] self.corp.content = self.corp.train elif self.params['mode'] == 'eval': - self._batch_size = self.params['batch_size'] + self.batch_size = self.params['batch_size'] self.corp.content = self.corp.valid else: - self._batch_size = 1 - self.corp.content = self.corp.test + if len(self.corp.content) < self.params['batch_size']: + self.batch_size = len(self.corp.content) + else: + self.batch_size = self.params['batch_size'] self.vocab_file = (self._processed_data_folder, 'vocab.txt') self.bptt = self.params['bptt'] @@ -67,44 +84,23 @@ def __init__(self, params, model, num_workers=1, worker_id=0): self._shuffle_buffer_size = self.params.get('shuffle_buffer_size', -1) self._num_workers = num_workers self._worker_id = worker_id - self.params["delimiter"] = self.params.get("delimiter", " ") - self.params["small"] = self.params.get("small", False) + self.delimiter = self.params.get("delimiter", " ") + self._small = self.params.get("small", False) self.start = 0 - if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0): - raise ValueError("If padding to 8 in data layer, then " - "max_length should be multiple of 8") - # load source and target vocabularies to RAM - - seed_tokens = self.params.get('seed_tokens', 'The').split() - - self.params['end_token'] = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] - self.params['seed_tokens'] = [self.corp.dictionary.word2idx[seed_token] for seed_token in seed_tokens] - if self.params["small"]: + if self._small: if self.params['mode'] == 'eval': self.corp.content = self.corp.content[:200] else: self.corp.content = self.corp.content[:9004] - if self.params.get('pad_vocab_to_eight', False): self.corp.content = pad_vocab_to_eight(self.corp.content) - if self.params['mode'] == 'infer': - if len(self.corp.content) > self.bptt: - self.corp.content = 
self.corp.content[-self.bptt:]
-
     self.dataset_size = len(self.corp.content)
-
-    self.params['vocab_size'] = len(self.corp.dictionary.idx2word)
-    self.PAD_ID = self.params['vocab_size']
-    self.PAD = ''
-    self.corp.dictionary.idx2word.append(self.PAD)
-    self.corp.dictionary.word2idx[self.PAD] = self.PAD_ID
-
+    self.vocab_size = len(self.corp.dictionary.idx2word)
     self._input_tensors = {}
-    self._batch_size

   def gen(self):
     while True:
@@ -119,15 +115,17 @@ def gen(self):

   def gen_infer(self):
     while True:
-      yield (self.corp.content, self.corp.content)
-
+      for seed in self.corp.content:
+        yield ([seed], [seed])
+
   def build_graph(self):
     if self.params['mode'] == 'train' or self.params['mode'] == 'eval':
       gen = self.gen
       batch_shape = self.bptt
     else:
       gen = self.gen_infer
-      batch_shape = len(self.corp.content)
+      batch_shape = 1
+
     _src_tgt_dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32),
                               (tf.TensorShape([batch_shape]), tf.TensorShape([batch_shape])))
@@ -149,7 +147,7 @@ def build_graph(self):
     _src_tgt_dataset = _src_tgt_dataset.map(lambda x, y: ((x, tf.size(x)), (y, tf.size(y))),
                               num_parallel_calls=self._map_parallel_calls)
-    self.batched_dataset = _src_tgt_dataset.batch(self._batch_size)
+    self.batched_dataset = _src_tgt_dataset.batch(self.batch_size)

     self._iterator = self.batched_dataset.make_initializable_iterator()
@@ -166,7 +164,7 @@ def build_graph(self):
   def get_size_in_samples(self):
     if self.params['mode'] == 'train' or self.params['mode'] == 'eval':
       return (self.dataset_size - self.start) // self.bptt
-    return 1
+    return len(self.corp.content)

   @property
   def iterator(self):
@@ -176,84 +174,116 @@ def iterator(self):
   def input_tensors(self):
     return self._input_tensors

-class LMTextDataLayerGenerate(DataLayer):
+class TextClassificationDataLayer(DataLayer):
+  '''
+  The base class to process data for text classification tasks.
+  If the data has already been processed, it should load the processed
+  data instead of re-processing it.
+ ''' @staticmethod def get_required_params(): return dict(DataLayer.get_required_params(), **{ - 'vocab_file': str, - 'bptt': int, + 'lm_vocab_file': str, + 'shuffle': bool, + 'repeat': bool, + 'max_length': int, + 'processed_data_folder': str, }) @staticmethod def get_optional_params(): return dict(DataLayer.get_optional_params(), **{ + 'rand_start': bool, + 'small': bool, + 'use_targets': bool, 'delimiter': str, 'map_parallel_calls': int, 'prefetch_buffer_size': int, 'pad_lengths_to_eight': bool, 'pad_vocab_to_eight': bool, - 'seed_file': str, + 'shuffle_buffer_size': int, + 'data_root': str, + 'binary': bool, + 'num_classes': int, + 'get_stats': bool, }) def __init__(self, params, model, num_workers=1, worker_id=0): - super(LMTextDataLayerGenerate, self).__init__(params, model, + super(TextClassificationDataLayer, self).__init__(params, model, num_workers, worker_id) - self._batch_size = 1 - self.vocab_file = self.params['vocab_file'] - self.bptt = self.params['bptt'] + + self._data_root = self.params.get('data_root', None) + self._binary = self.params.get('binary', True) + self._get_stats = self.params.get('get_stats', False) + self._lm_vocab_file = self.params['lm_vocab_file'] + self._map_parallel_calls = self.params.get('map_parallel_calls', 8) self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False) self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', tf.contrib.data.AUTOTUNE) + self._shuffle_buffer_size = self.params.get('shuffle_buffer_size', -1) self._num_workers = num_workers self._worker_id = worker_id - self.params["delimiter"] = self.params.get("delimiter", " ") - self.seed_file = self.params.get("seed_file", None) - - # load source and target vocabularies to RAM - self.corp = Corpus(self.params['vocab_file']) - - if self.seed_file: - self.input_string = open(self.seed_file, 'r').read().strip() - else: - self.input_string = input('Please enter your seed string (case sensitive): ').strip() - - self.corp.content = self.corp.tokenize(self.input_string) - if len(self.corp.content) > self.bptt: - self.corp.content = self.corp.content[-self.bptt:] - - self.dataset_size = len(self.corp.content) - - self.params['vocab_size'] = len(self.corp.dictionary.idx2word) - self.PAD_ID = self.params['vocab_size'] - self.PAD = '' - self.corp.dictionary.idx2word.append(self.PAD) - self.corp.dictionary.word2idx[self.PAD] = self.PAD_ID + self._small = self.params.get("small", False) + self._max_length = self.params['max_length'] + self.delimiter = self.params.get("delimiter", " ") + self.EOS_ID = -1 + self.batch_size = self.params['batch_size'] + if self._pad_lengths_to_eight and not (self._max_length % 8 == 0): + raise ValueError("If padding to 8 in data layer, then " + "max_length should be multiple of 8") self._input_tensors = {} def gen(self): - yield self.corp.content + while True: + for review, raw_rating in self.corp.content: + if len(review) > self._max_length: + review = review[-self._max_length:] + rating = np.zeros(self.num_classes) + rating[raw_rating] = 1 + yield (review, rating) def build_graph(self): - _src_tgt_dataset = tf.data.Dataset.from_generator(self.gen, (tf.int32), - (tf.TensorShape([len(self.corp.content)]))) + _src_tgt_dataset = tf.data.Dataset.from_generator(self.gen, + (tf.int32, tf.int32), + (tf.TensorShape([None]), tf.TensorShape([self.num_classes]))) + + if self._num_workers > 1: + _src_tgt_dataset = _src_tgt_dataset\ + .shard(num_shards=self._num_workers, index=self._worker_id) - _src_tgt_dataset = _src_tgt_dataset.map(lambda x: 
((x, tf.size(x))), + if self.params['shuffle']: + bf_size = self.get_size_in_samples() if self._shuffle_buffer_size == -1 \ + else self._shuffle_buffer_size + _src_tgt_dataset = _src_tgt_dataset.shuffle(buffer_size=bf_size) + + if self.params['repeat']: + _src_tgt_dataset = _src_tgt_dataset.repeat() + + _src_tgt_dataset = _src_tgt_dataset.map(lambda x, y: ((x, tf.size(x)), (y, tf.size(y))), num_parallel_calls=self._map_parallel_calls) - self.batched_dataset = _src_tgt_dataset.batch(self._batch_size) + self.batched_dataset = _src_tgt_dataset.padded_batch( + self.batch_size, + padded_shapes=((tf.TensorShape([None]), + tf.TensorShape([])), + (tf.TensorShape([None]), + tf.TensorShape([]))), + padding_values=( + (self.EOS_ID, 0), + (self.EOS_ID, 0))).prefetch(buffer_size=self._prefetch_buffer_size) self._iterator = self.batched_dataset.make_initializable_iterator() - t1, _ = self.iterator.get_next() - t1 = tf.expand_dims(t1[0], axis=0) - print(t1) - self._input_tensors['source_tensors'] = [t1[0], t1[1]] - print(self._input_tensors) + t1, t2 = self.iterator.get_next() + x, x_length = t1[0], t1[1] + y, y_length = t2[0], t2[1] + self._input_tensors['source_tensors'] = [x, x_length] + self._input_tensors['target_tensors'] = [y, y_length] def get_size_in_samples(self): - return 1 + return self.dataset_size @property def iterator(self): @@ -261,4 +291,71 @@ def iterator(self): @property def input_tensors(self): - return self._input_tensors \ No newline at end of file + return self._input_tensors + +class IMDBDataLayer(TextClassificationDataLayer): + ''' + Data layer to process the raw IMDB data, which can be downloaded here: + http://ai.stanford.edu/~amaas/data/sentiment/ + + ''' + def __init__(self, params, model, num_workers=1, worker_id=0): + super(IMDBDataLayer, self).__init__(params, model, num_workers, worker_id) + self._processed_data_folder = self.params['processed_data_folder'] + + if self._binary: + self.num_classes = 2 + else: + self.num_classes = 10 + + self.corp = IMDBCorpus(self._data_root, + self._processed_data_folder, + self._lm_vocab_file, + self._binary, + get_stats=self._get_stats) + + if self.params['mode'] == 'train': + self.corp.content = self.corp.train + elif self.params['mode'] == 'eval': + self.corp.content = self.corp.valid + else: + self.corp.content = self.corp.test + + if self._small: + if self.params['mode'] == 'eval': + self.corp.content = self.corp.content[:self.batch_size * 2] + else: + self.corp.content = self.corp.content[:self.batch_size * 4] + + self.dataset_size = len(self.corp.content) + self.vocab_size = len(self.corp.dictionary.idx2word) + self.EOS_ID = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] + self.end_token = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] + +class SSTDataLayer(TextClassificationDataLayer): + ''' + Data layer to process the raw SST (Stanford Sentiment Treebank). 
+ Read about the dataset here: + https://nlp.stanford.edu/sentiment/ + Download the preprocessed version that can be used for this DataLayer here: + https://github.com/NVIDIA/sentiment-discovery/tree/master/data/binary_sst + ''' + def __init__(self, params, model, num_workers=1, worker_id=0): + super(SSTDataLayer, self).__init__(params, model, num_workers, worker_id) + self._processed_data_folder = self.params['processed_data_folder'] + self.corp = SSTCorpus(self._data_root, + self._processed_data_folder, + self._lm_vocab_file, + get_stats=self._get_stats) + + if self.params['mode'] == 'train': + self.corp.content = self.corp.train + elif self.params['mode'] == 'eval': + self.corp.content = self.corp.valid + else: + self.corp.content = self.corp.test + self.num_classes = 2 + self.dataset_size = len(self.corp.content) + self.vocab_size = len(self.corp.dictionary.idx2word) + self.EOS_ID = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] + self.end_token = self.corp.dictionary.word2idx[self.corp.dictionary.EOS] \ No newline at end of file diff --git a/open_seq2seq/data/lm/lmutils.py b/open_seq2seq/data/lm/lmutils.py index d29ebfc45..3c278cce3 100644 --- a/open_seq2seq/data/lm/lmutils.py +++ b/open_seq2seq/data/lm/lmutils.py @@ -1,109 +1,494 @@ from collections import Counter +import glob import os import pathlib +import random +import re +import shutil +from nltk.tokenize import word_tokenize import numpy as np +import pandas as pd class Dictionary(object): + ''' + Adapted from salesforce's repo: + https://github.com/salesforce/awd-lstm-lm/blob/master/data.py + ''' + def __init__(self, limit=3, vocab_link=None): # do we need limit? + self.word2idx = {} + self.idx2word = [] + self.counter = Counter() + self.UNK = '' + self.EOS = '' + if vocab_link and os.path.isfile(vocab_link): + self.load_vocab(vocab_link) + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + token_id = self.word2idx[word] + self.counter[token_id] += 1 + return self.word2idx[word] + + def load_vocab(self, vocab_link): + vocab_file = open(vocab_link, 'r') + lines = vocab_file.readlines() + n = int(lines[-1].strip()) + self.idx2word = [0 for _ in range(n)] + for line in lines[:-1]: + parts = line.strip().split('\t') + token_id, word, count = int(parts[0]), parts[1], int(parts[2]) + self.word2idx[word] = token_id + self.idx2word[token_id] = word + self.counter[token_id] = count + if not self.UNK in self.word2idx: + self.add_word(self.UNK) + if not self.EOS in self.word2idx: + self.add_word(self.EOS) + + + def __len__(self): + return len(self.idx2word) + +def check_exist(proc_path): + filenames = ['train.ids', 'valid.ids', 'test.ids'] + paths = [os.path.join(proc_path, name) for name in filenames] + paths.append(proc_path) + for name in paths: + if not os.path.exists(name): + return False + return True + +def list2str(list): + return '\t'.join([str(num) for num in list]) + +def unzip(data): + tmp = [list(t) for t in zip(*data)] + return (tmp[0], tmp[1]) + +class Corpus(object): + def __init__(self, raw_path, proc_path, change_contraction=True, limit=3): + pathlib.Path(proc_path).mkdir(exist_ok=True) + self.limit = limit + self.dictionary = Dictionary(limit) + self.vocab_link = 'vocab.txt' + exists = check_exist(proc_path) + self.change_contraction = change_contraction + + if not exists: + print('Creating corpus from raw data ...') + if raw_path and 'raw' in raw_path: + self._change_names(raw_path) + if not raw_path: + raise 
ValueError("data_root [directory to the original data] must be specified") + self.preprocess(raw_path, proc_path) + self.create_dictionary(proc_path, os.path.join(proc_path, 'train.txt')) + self.dictionary = Dictionary(limit) + self.dictionary.load_vocab(os.path.join(proc_path, self.vocab_link)) + self.train = self.tokenize(proc_path, proc_path, 'train.txt') + self.valid = self.tokenize(proc_path, proc_path, 'valid.txt') + self.test = self.tokenize(proc_path, proc_path, 'test.txt') + else: + self.load_corpus(proc_path) + + def _change_names(self, raw_path): + if os.path.isfile(os.path.join(raw_path, 'wiki.train.raw')): + os.rename(os.path.join(raw_path, 'wiki.train.raw'), os.path.join(raw_path, 'train.txt')) + os.rename(os.path.join(raw_path, 'wiki.valid.raw'), os.path.join(raw_path, 'valid.txt')) + os.rename(os.path.join(raw_path, 'wiki.test.raw'), os.path.join(raw_path, 'test.txt')) + + def preprocess(self, raw_path, proc_path): + for filename in ['train.txt', 'valid.txt', 'test.txt']: + in_ = open(os.path.join(raw_path, filename), 'r') + out = open(os.path.join(proc_path, filename), 'w') + for line in in_: + line = re.sub('@-@', '-', line) + line = re.sub('-', ' - ', line) + line = re.sub('etc .', 'etc.', line) + if self.change_contraction: + line = re.sub("n 't", " n't", line) + tokens = [] + for token in line.split(): + tokens.append(token.strip()) + out.write(' '.join(tokens) + '\n') + + def create_dictionary(self, proc_path, filename): ''' - Adapted from salesforce's repo: - https://github.com/salesforce/awd-lstm-lm/blob/master/data.py + Add words to the dictionary only if it's in the train file ''' - def __init__(self): - self.word2idx = {} - self.idx2word = [] - self.counter = Counter() - self.UNK = '' - self.EOS = '' - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - token_id = self.word2idx[word] - self.counter[token_id] += 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) + self.dictionary.add_word(self.dictionary.UNK) + with open(filename, 'r') as f: + f.readline() + for line in f: + words = line.split() + [self.dictionary.EOS] + for word in words: + self.dictionary.add_word(word) -class Corpus(object): - def __init__(self, raw_path, proc_path): - pathlib.Path(proc_path).mkdir(exist_ok=True) - self.dictionary = Dictionary() - self.vocab_link = 'vocab.txt' - exists = self.check_exist(proc_path) - - if not exists: - print('Creating corpus from raw data ...') - if not raw_path: - raise ValueError("data_root [directory to the original data] must be specified") - self.create_dictionary(proc_path, os.path.join(raw_path, 'train.txt')) - self.train = self.tokenize(raw_path, proc_path, 'train.txt') - self.valid = self.tokenize(raw_path, proc_path, 'valid.txt') - self.test = self.tokenize(raw_path, proc_path, 'test.txt') + with open(os.path.join(proc_path, self.vocab_link), 'w') as f: + f.write('\t'.join(['0', self.dictionary.UNK, '0']) + '\n') + idx = 1 + for token_id, count in self.dictionary.counter.most_common(): + if count < self.limit: + f.write(str(idx) + '\n') + return + f.write('\t'.join([str(idx), + self.dictionary.idx2word[token_id], + str(count)]) + '\n') + idx += 1 + + def tokenize(self, raw_path, proc_path, filename): + unk_id = self.dictionary.word2idx[self.dictionary.UNK] + out = open(os.path.join(proc_path, filename[:-3] + 'ids'), 'w') + with open(os.path.join(raw_path, filename), 'r') as f: + ids = [] + for line in f: + words = line.split() + 
[self.dictionary.EOS]
+        for word in words:
+          ids.append(self.dictionary.word2idx.get(word, unk_id))
+      out.write(list2str(ids))
+    out.close()
+
+    return np.asarray(ids)
+
+  def load_ids(self, filename):
+    ids = open(filename, 'r').read().strip().split('\t')
+    return np.asarray([int(i) for i in ids])
+
+  def list2str(self, list):
+    return '\t'.join([str(num) for num in list])
+
+  def load_corpus(self, proc_path):
+    print('Loading corpus from processed data ...')
+    self.dictionary.load_vocab(os.path.join(proc_path, self.vocab_link))
+    self.train = self.load_ids(os.path.join(proc_path, 'train.ids'))
+    self.valid = self.load_ids(os.path.join(proc_path, 'valid.ids'))
+    self.test = self.load_ids(os.path.join(proc_path, 'test.ids'))
+
+class IMDBCorpus(object):
+  def __init__(self, raw_path, proc_path, lm_vocab_link, binary=True, get_stats=False):
+    exists = check_exist(proc_path)
+    pathlib.Path(proc_path).mkdir(exist_ok=True)
+    self.dictionary = Dictionary(vocab_link=lm_vocab_link)
+    self.binary = binary
+    self.raw_path = raw_path
+    self.proc_path = proc_path
+    self._get_stats = get_stats
+
+    if not exists:
+      print('Creating corpus from raw data ...')
+      if not raw_path:
+        raise ValueError("data_root [directory to the original data] must be specified")
+      self.preprocess()
+    else:
+      self.load_corpus(proc_path)
+
+  def check_oov(self, txt):
+    txt = txt.lower()
+    txt = re.sub('thats', "that's", txt)
+    txt = re.sub('wouldnt', "wouldn't", txt)
+    txt = re.sub('couldnt', "couldn't", txt)
+    txt = re.sub('cant', "can't", txt)
+    txt = re.sub('dont', "don't", txt)
+    txt = re.sub("didnt", "didn't", txt)
+    txt = re.sub("isnt", "isn't", txt)
+    txt = re.sub("wasnt", "wasn't", txt)
+    return word_tokenize(txt)
+
+  def tokenize(self, txt):
+    txt = re.sub('<br />
', ' ', txt) + txt = re.sub('–', ' ', txt) + txt = re.sub('—', ' ', txt) + txt = re.sub('-', ' - ', txt) + txt = re.sub('\.', ' . ', txt) + txt = re.sub('\+', ' + ', txt) + txt = re.sub('\*', ' * ', txt) + txt = re.sub('/', ' / ', txt) + txt = re.sub('`', "'", txt) + txt = re.sub(' ms \.', " ms.", txt) + txt = re.sub('Ms \.', "Ms.", txt) + + words = [] + for token in word_tokenize(txt): + if not token in self.dictionary.word2idx: + if token.startswith("'"): + words.append("'") + token = token[1:] + if not token in self.dictionary.word2idx: + tokens = self.check_oov(token) + words.extend(tokens) + else: + words.append(token) + else: + words.append(token) + + txt = ' '.join(words) + txt = re.sub("''", '"', txt) + txt = re.sub("' '", '"', txt) + txt = re.sub("``", '"', txt) + txt = re.sub('etc \.', 'etc. ', txt) + txt = re.sub(' etc ', ' etc. ', txt) + return txt + + def tokenize_folder(self, mode, token_file, rating_file): + review_outfile = open(token_file, 'w') + rating_outfile = open(rating_file, 'w') + for sent in ['pos', 'neg']: + files = glob.glob(os.path.join(self.raw_path, mode, sent, '*.txt')) + for file in files: + in_file = open(file, 'r') + txt = self.tokenize(in_file.read()) + review_outfile.write(txt + "\n") + if self.binary: + if sent == 'pos': + rating = "1" + else: + rating = "0" + else: + idx = file.rfind("_") + rating = str(int(file[idx + 1:-4]) - 1) + rating_outfile.write(rating + '\n') + in_file.close() + + def txt2ids(self, mode, token_file, rating_file): + if self._get_stats: + import matplotlib + matplotlib.use("TkAgg") + from matplotlib import pyplot as plt + rating_lines = open(rating_file, 'r').readlines() + ratings = [int(line.strip()) for line in rating_lines] + reviews = [] + unk_id = self.dictionary.word2idx[self.dictionary.UNK] + unseen = [] + all_tokens = 0 + all_unseen = 0 + for line in open(token_file, 'r'): + tokens = line.strip().split() + reviews.append([self.dictionary.word2idx.get(token, unk_id) for token in tokens]) + if self._get_stats: + for token in tokens: + all_tokens += 1 + if not token in self.dictionary.word2idx: + unseen.append(token) + all_unseen += 1 + + if self._get_stats: + counter = Counter(unseen) + + out = open(os.path.join(self.proc_path, mode + '_unseen.txt'), 'w') + for key, count in counter.most_common(): + out.write(key + '\t' + str(count) + '\n') + + lengths = np.asarray([len(review) for review in reviews]) + stat_file = open(os.path.join(self.proc_path, 'statistics.txt'), 'w') + stat_file.write(mode + '\n') + short_lengths = [l for l in lengths if l <= 256] + stat_file.write('\t'.join(['Min', 'Max', 'Mean', 'Median', 'STD', 'Total', '<=256']) + '\n') + stats = [np.min(lengths), np.max(lengths), np.mean(lengths), np.median(lengths), np.std(lengths), len(lengths), len(short_lengths)] + stat_file.write('\t'.join([str(t) for t in stats]) + '\n') + stat_file.write('Total {} unseen out of {} all tokens. Probability {}.\n'. 
+ format(all_unseen, all_tokens, all_unseen / all_tokens)) + plt.hist(lengths, bins=20) + plt.savefig(os.path.join(self.proc_path, mode + '_hist.png')) + plt.hist(short_lengths, bins=20) + plt.savefig(os.path.join(self.proc_path, mode + '_short_hist.png')) + + return list(zip(reviews, ratings)) + + def preprocess_folder(self, mode): + token_file = os.path.join(self.proc_path, mode + '.tok') + rating_file = os.path.join(self.proc_path, mode + '.inter.rat') + self.tokenize_folder(mode, token_file, rating_file) + return self.txt2ids(mode, token_file, rating_file) + + def partition(self, data, val_count=1000): + random.shuffle(data) + return data[val_count:], data[:val_count] + + def ids2file(self): + for mode in ['train', 'valid', 'test']: + data = getattr(self, mode) + review_out = open(os.path.join(self.proc_path, mode + '.ids'), 'w') + rating_out = open(os.path.join(self.proc_path, mode + '.rat'), 'w') + for review, rating in data: + review_out.write(list2str(review) + '\n') + rating_out.write(str(rating) + '\n') + + def preprocess(self): + os.makedirs(self.proc_path, exist_ok=True) + train = self.preprocess_folder('train') + self.train, self.valid = self.partition(train) + self.test = self.preprocess_folder('test') + self.ids2file() + + def load_ids(self, mode): + review_lines = open(os.path.join(self.proc_path, mode + '.ids')).readlines() + rating_lines = open(os.path.join(self.proc_path, mode + '.rat')).readlines() + ratings = [int(line.strip()) for line in rating_lines] + reviews = [[int(i) for i in line.strip().split('\t')] for line in review_lines] + return list(zip(reviews, ratings)) + + def load_corpus(self, proc_path): + print('Loading corpus from processed data ...') + self.train = self.load_ids('train') + self.valid = self.load_ids('valid') + self.test = self.load_ids('test') + +class SSTCorpus(object): + def __init__(self, raw_path, proc_path, lm_vocab_link, get_stats=False): + exists = check_exist(proc_path) + pathlib.Path(proc_path).mkdir(exist_ok=True) + self.dictionary = Dictionary(vocab_link=lm_vocab_link) + self.raw_path = raw_path + self.proc_path = proc_path + self._get_stats = get_stats + + if not exists: + print('Creating corpus from raw data ...') + if not raw_path: + raise ValueError("data_root [directory to the original data] must be specified") + self.preprocess() + else: + self.load_corpus(proc_path) + + def check_oov(self, txt): + txt = txt.lower() + txt = re.sub('thats', "that's", txt) + txt = re.sub('wouldnt', "wounldn't", txt) + txt = re.sub('couldnt', "couldn't", txt) + txt = re.sub('cant', "can't", txt) + txt = re.sub('dont', "don't", txt) + txt = re.sub("didnt", "didn't", txt) + txt = re.sub("isnt", "isn't", txt) + txt = re.sub("wasnt", "wasn't", txt) + return word_tokenize(txt) + + def tokenize(self, txt): + txt = re.sub('-', ' - ', txt) + txt = re.sub('\+', ' + ', txt) + txt = re.sub('\*', ' * ', txt) + txt = re.sub('/', ' / ', txt) + txt = re.sub('`', "'", txt) + + words = [] + for token in word_tokenize(txt): + if not token in self.dictionary.word2idx: + if token.startswith("'"): + words.append("'") + token = token[1:] + if not token in self.dictionary.word2idx: + tokens = self.check_oov(token) + words.extend(tokens) else: - self.load_corpus(proc_path) - - def check_exist(self, proc_path): - paths = [proc_path, proc_path + '/vocab.txt', proc_path + '/train.ids', - proc_path + '/valid.ids', proc_path + '/test.ids'] - for name in paths: - if not os.path.exists(name): - return False - return True - - def create_dictionary(self, proc_path, filename): - ''' 
- Add words to the dictionary only if it's train file - ''' - with open(filename, 'r') as f: - f.readline() - for line in f: - words = line.split() + [self.dictionary.EOS] - for word in words: - self.dictionary.add_word(word) - - with open(os.path.join(proc_path, self.vocab_link), 'w') as f: - f.write(str(len(self.dictionary)) + '\n') - for token_id, count in self.dictionary.counter.most_common(): - f.write('\t'.join([str(token_id), - self.dictionary.idx2word[token_id], - str(count)]) + '\n') - - - def tokenize(self, raw_path, proc_path, filename): - unk_id = self.dictionary.word2idx[self.dictionary.UNK] - out = open(os.path.join(proc_path, filename[:-3] + 'ids'), 'w') - with open(os.path.join(raw_path, filename), 'r') as f: - ids = [] - for line in f: - words = line.split() + [self.dictionary.EOS] - for word in words: - ids.append(self.dictionary.word2idx.get(word, unk_id)) - out.write(self.list2str(ids)) #TODO: change to pickle - out.close() - - return np.asarray(ids) - - def load_ids(self, filename): - ids = open(filename, 'r').read().strip().split('\t') - return np.asarray([int(i) for i in ids]) - - def list2str(self, list): - return '\t'.join([str(num) for num in list]) - - def load_corpus(self, proc_path): - print('Loading corpus from processed data ...') - vocab_file = open(os.path.join(proc_path, self.vocab_link), 'r') - n = int(vocab_file.readline().strip()) - self.dictionary.idx2word = [0 for _ in range(n)] - for line in vocab_file: - parts = line.strip().split('\t') - token_id, word, count = int(parts[0]), parts[1], int(parts[2]) - self.dictionary.word2idx[word] = token_id - self.dictionary.idx2word[token_id] = word - self.dictionary.counter[token_id] = count - self.train = self.load_ids(os.path.join(proc_path, 'train.ids')) - self.valid = self.load_ids(os.path.join(proc_path, 'valid.ids')) - self.test = self.load_ids(os.path.join(proc_path, 'test.ids')) \ No newline at end of file + words.append(token) + else: + words.append(token) + + txt = ' '.join(words) + txt = re.sub("''", '"', txt) + txt = re.sub("' '", '"', txt) + txt = re.sub("``", '"', txt) + txt = re.sub('etc \.', 'etc. ', txt) + txt = re.sub(' etc ', ' etc. 
', txt) + return txt + + def tokenize_file(self, mode): + data = pd.read_csv(os.path.join(self.raw_path, mode + '.csv')) + + if mode == 'val': + mode = 'valid' + review_file = open(os.path.join(self.proc_path, mode + '.tok'), 'w') + rating_file = open(os.path.join(self.proc_path, mode + '.rat'), 'w') + for _, row in data.iterrows(): + review = self.tokenize(row['sentence']) + review_file.write(review + '\n') + rating_file.write(str(row['label']) + '\n') + + def txt2ids(self, mode): + if self._get_stats: + import matplotlib + matplotlib.use("TkAgg") + from matplotlib import pyplot as plt + + reviews = [] + unk_id = self.dictionary.word2idx[self.dictionary.UNK] + unseen = [] + all_tokens = 0 + all_unseen = 0 + + rating_lines = open(os.path.join(self.proc_path, mode + '.rat'), 'r').readlines() + ratings = [int(line.strip()) for line in rating_lines] + + for line in open(os.path.join(self.proc_path, mode + '.tok'), 'r'): + tokens = line.strip().split() + reviews.append([self.dictionary.word2idx.get(token, unk_id) for token in tokens]) + if self._get_stats: + for token in tokens: + all_tokens += 1 + if not token in self.dictionary.word2idx: + unseen.append(token) + all_unseen += 1 + + if self._get_stats: + counter = Counter(unseen) + + out = open(os.path.join(self.proc_path, mode + '_unseen.txt'), 'w') + for key, count in counter.most_common(): + out.write(key + '\t' + str(count) + '\n') + + lengths = np.asarray([len(review) for review in reviews]) + stat_file = open(os.path.join(self.proc_path, 'statistics.txt'), 'a') + stat_file.write(mode + '\n') + short_lengths = [l for l in lengths if l <= 96] + stat_file.write('\t'.join(['Min', 'Max', 'Mean', 'Median', 'STD', 'Total', '<=96']) + '\n') + stats = [np.min(lengths), np.max(lengths), np.mean(lengths), np.median(lengths), np.std(lengths), len(lengths), len(short_lengths)] + stat_file.write('\t'.join([str(t) for t in stats]) + '\n') + stat_file.write('Total {} unseen out of {} all tokens. Probability {}.\n'. 
+ format(all_unseen, all_tokens, all_unseen / all_tokens)) + plt.hist(lengths, bins=20) + plt.savefig(os.path.join(self.proc_path, mode + '_hist.png')) + plt.hist(short_lengths, bins=20) + plt.savefig(os.path.join(self.proc_path, mode + '_short_hist.png')) + + return list(zip(reviews, ratings)) + + def preprocess_file(self, mode): + self.tokenize_file(mode) + if mode == 'val': + mode = 'valid' + return self.txt2ids(mode) + + def ids2file(self): + for mode in ['train', 'valid', 'test']: + data = getattr(self, mode) + review_out = open(os.path.join(self.proc_path, mode + '.ids'), 'w') + rating_out = open(os.path.join(self.proc_path, mode + '.rat'), 'w') + for review, rating in data: + review_out.write(list2str(review) + '\n') + rating_out.write(str(rating) + '\n') + + def preprocess(self): + os.makedirs(self.proc_path, exist_ok=True) + self.train = self.preprocess_file('train') + self.valid = self.preprocess_file('val') + self.test = self.preprocess_file('test') + self.ids2file() + + def load_ids(self, mode): + review_lines = open(os.path.join(self.proc_path, mode + '.ids')).readlines() + rating_lines = open(os.path.join(self.proc_path, mode + '.rat')).readlines() + ratings = [int(line.strip()) for line in rating_lines] + reviews = [[int(i) for i in line.strip().split('\t')] for line in review_lines] + return list(zip(reviews, ratings)) + + def load_corpus(self, proc_path): + print('Loading corpus from processed data ...') + self.train = self.load_ids('train') + self.valid = self.load_ids('valid') + self.test = self.load_ids('test') + +# SSTCorpus('/home/chipn/data/binary_sst', 'sst-processed-data-wkt2' , '/home/chipn/dev/OpenSeq2Seq/wkt2-processed-data/vocab.txt') +# SSTCorpus('/home/chipn/data/binary_sst', 'sst-processed-data-wkt103' , '/home/chipn/dev/OpenSeq2Seq/wkt103-processed-data/vocab.txt') +# IMDBCorpus('/home/chipn/data/aclImdb', 'imdb-processed-data-wkt103' , '/home/chipn/dev/OpenSeq2Seq/wkt103-processed-data/vocab.txt') +# IMDBCorpus('/home/chipn/data/aclImdb', 'imdb-processed-data-wkt2' , '/home/chipn/dev/OpenSeq2Seq/wkt2-processed-data/vocab.txt') \ No newline at end of file diff --git a/open_seq2seq/encoders/lm_encoders.py b/open_seq2seq/encoders/lm_encoders.py index 83fde4195..ebb50edc6 100644 --- a/open_seq2seq/encoders/lm_encoders.py +++ b/open_seq2seq/encoders/lm_encoders.py @@ -18,7 +18,7 @@ class LMEncoder(Encoder): """ - RNN-based encoder with embeddings + RNN-based encoder with embeddings for language modeling """ @staticmethod def get_required_params(): @@ -29,8 +29,6 @@ def get_required_params(): 'encoder_use_skip_connections': bool, 'core_cell': None, 'core_cell_params': dict, - 'last_cell_params': dict, - 'output_dim': int, 'end_token': int, "batch_size": int, }) @@ -41,7 +39,7 @@ def get_optional_params(): 'encoder_dp_input_keep_prob': float, 'encoder_dp_output_keep_prob': float, "encoder_last_input_keep_prob": float, - "encoder_last_output_keep_prob": float, # output droput at last layer is 0.4 + "encoder_last_output_keep_prob": float, 'encoder_emb_keep_prob': float, 'variational_recurrent': bool, 'time_major': bool, @@ -61,6 +59,8 @@ def get_optional_params(): "weight_variational": bool, "dropout_seed": int, "num_sampled": int, + "fc_dim": int, + "use_cell_state": bool, }) def __init__(self, params, model, @@ -68,19 +68,48 @@ def __init__(self, params, model, """ Initializes bi-directional encoder with embeddings :param params: dictionary with encoder parameters + + Many of the techniques in this implementation is taken from the paper + "Regularizing and 
Optimizing LSTM Language Models" (Merity et al., 2017) + https://arxiv.org/pdf/1708.02182.pdf + Must define: * vocab_size - data vocabulary size * emb_size - size of embedding to use * encoder_cell_units - number of units in RNN cell * encoder_cell_type - cell type: lstm, gru, etc. * encoder_layers - number of layers - * encoder_dp_input_keep_prob - - * encoder_dp_output_keep_prob - * encoder_use_skip_connections - true/false * time_major (optional) * use_swap_memory (optional) * mode - train or infer - ... add any cell-specific parameters here as well + * input_weight_keep_prob: keep probability for dropout of W + (kernel used to multiply with the input tensor) + * recurrent_weight_keep_prob: keep probability for dropout of U + (kernel used to multiply with last hidden state tensor) + * recurrent_keep_prob: keep probability for dropout + when applying tanh for the input transform step + * weight_variational: whether to keep the same weight dropout mask + at every timestep. This feature is not yet implemented. + * emb_keep_prob: keep probability for dropout of the embedding matrix + * encoder_dp_input_keep_prob: keep probability for dropout on input of a LSTM cell + in the layer which is not the last layer + * encoder_dp_output_keep_prob: keep probability for dropout on output of a LSTM cell + in the layer which is not the last layer + * encoder_last_input_keep_prob: like ``encoder_dp_input_keep_prob`` but for the + cell in the last layer + * encoder_dp_output_keep_prob: like ``encoder_dp_output_keep_prob`` but for the + cell in the last layer + * weight_tied: whether to tie the embedding matrix to the last linear layer. + can only do so if the dimension of the last output layer is + the same as the vocabulary size + * use_cell_state: if set to True, concat the last hidden state and + the last cell state to input into the last output layer. + This only works for the text classification task, not the + language modeling phase. + For different ways to do dropout for LSTM cells, please read this article: + https://medium.com/@bingobee01/a-review-of-dropout-as-applied-to-rnns-72e79ecd5b7b + :param encoder_params: """ super(LMEncoder, self).__init__( @@ -101,16 +130,15 @@ def __init__(self, params, model, self.params['recurrent_weight_keep_prob'] = self.params.get('recurrent_weight_keep_prob', 1.0) self.params['weight_variational'] = self.params.get('weight_variational', False) self.params['dropout_seed'] = self.params.get('dropout_seed', 1822) - self._num_sampled = self.params.get('num_sampled', self._vocab_size) # if num_sampled not define then just take full softmax - - if mode == 'infer': - self.num_tokens_gen = self.params.get('num_tokens_gen', 1) + self._fc_dim = self.params.get('fc_dim', self._vocab_size) + self._num_sampled = self.params.get('num_sampled', self._fc_dim) # if num_sampled not defined, take full softmax + self._lm_phase = self._fc_dim == self._vocab_size + self._num_tokens_gen = self.params.get('num_tokens_gen', 200) + self._batch_size = self.params['batch_size'] + + if mode == 'infer' and self._lm_phase: self._batch_size = len(self.params['seed_tokens']) - else: - self.num_tokens_gen = 1 - self._batch_size = self.params['batch_size'] - # if self._vocab_size > 100000 and mode == 'eval': - # self._batch_size = 2 + self._use_cell_state = self.params.get('use_cell_state', False) def encode(self, input_dict): """Wrapper around :meth:`self._encode() <_encode>` method. 
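Note on weight tying: the docstring above describes tying the embedding matrix to the final linear layer, which is why the `_encode` changes below force the last LSTM layer's `num_units` to `emb_size`. A minimal TF 1.x sketch of the idea (the vocabulary/embedding sizes are made-up, and the variable-scope lookup only mirrors the approach used in `_encode`; it is not the encoder's actual code path):

    import tensorflow as tf

    vocab_size, emb_size = 10000, 64
    output_layer = tf.layers.Dense(vocab_size, use_bias=True)

    # Apply the layer once to a dummy input so its variables get created
    # under the default "dense" scope.
    _ = output_layer.apply(tf.zeros(shape=(1, emb_size)))
    with tf.variable_scope("dense", reuse=True):
        kernel = tf.get_variable("kernel")    # shape [emb_size, vocab_size]

    # The transposed projection kernel doubles as the embedding matrix.
    enc_emb_w = tf.transpose(kernel)          # shape [vocab_size, emb_size]
    embedded = tf.nn.embedding_lookup(enc_emb_w, tf.constant([[1, 2, 3]]))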
@@ -193,21 +221,36 @@ def _encode(self, input_dict): emb_keep_prob, recurrent_keep_prob = 1.0, 1.0 input_weight_keep_prob, recurrent_weight_keep_prob = 1.0, 1.0 + self._output_layer = tf.layers.Dense( - self._vocab_size, + self._fc_dim, kernel_regularizer=regularizer, kernel_initializer=initializer, use_bias=fc_use_bias, + dtype=self._params['dtype'] ) if self._weight_tied: - fake_input = tf.zeros(shape=(1, self._emb_size)) - fake_output = self._output_layer.apply(fake_input) - with tf.variable_scope("dense", reuse=True): - dense_weights = tf.get_variable("kernel") - dense_biases = tf.get_variable("bias") + last_cell_params = copy.deepcopy(self.params['core_cell_params']) + last_cell_params['num_units'] = self._emb_size + else: + last_cell_params = self.params['core_cell_params'] + + last_output_dim = last_cell_params['num_units'] + + if self._use_cell_state: + last_output_dim = 2 * last_output_dim + + + fake_input = tf.zeros(shape=(1, last_output_dim), + dtype=self._params['dtype']) + fake_output = self._output_layer.apply(fake_input) + with tf.variable_scope("dense", reuse=True): + dense_weights = tf.get_variable("kernel") + dense_biases = tf.get_variable("bias") + + if self._weight_tied and self._lm_phase: enc_emb_w = tf.transpose(dense_weights) - else: enc_emb_w = tf.get_variable( name="EncoderEmbeddingMatrix", @@ -217,11 +260,6 @@ def _encode(self, input_dict): self._enc_emb_w = tf.nn.dropout(enc_emb_w, keep_prob=emb_keep_prob) - if self._weight_tied: - last_cell_params = self.params['last_cell_params'] - else: - last_cell_params = self.params['core_cell_params'] - fwd_cells = [ single_cell(cell_class=self.params['core_cell'], cell_params=self.params['core_cell_params'], @@ -261,7 +299,8 @@ def _encode(self, input_dict): source_sequence = input_dict['source_tensors'][0] source_length = input_dict['source_tensors'][1] - if self._mode == 'train' or self._mode == 'eval': + # Inference for language modeling requires a different graph + if (not self._lm_phase) or self._mode == 'train' or self._mode == 'eval': embedded_inputs = tf.cast(tf.nn.embedding_lookup( self.enc_emb_w, source_sequence, @@ -273,21 +312,26 @@ def _encode(self, input_dict): sequence_length=source_length, time_major=time_major, swap_memory=use_swap_memory, - dtype=embedded_inputs.dtype, + dtype=self._params['dtype'], scope='decoder', ) - if self._mode == 'eval' or self._num_sampled >= self._vocab_size: - logits = self._output_layer.apply(encoder_outputs) # full softmax - output_dict = {'logits': logits, 'outputs': [tf.argmax(logits, axis=-1)]} - else: + if not self._lm_phase: + if self._use_cell_state: + encoder_outputs = tf.concat([encoder_state[-1].h, encoder_state[-1].c], axis=1) + else: + encoder_outputs = encoder_state[-1].h + + if self._mode == 'train' and self._num_sampled < self._fc_dim: # sampled softmax output_dict = {'weights': enc_emb_w, 'bias': dense_biases, 'inputs': encoder_outputs, 'logits': encoder_outputs, 'outputs': [encoder_outputs], 'num_sampled': self._num_sampled} - - else: + else: # full softmax + logits = self._output_layer.apply(encoder_outputs) + output_dict = {'logits': logits, 'outputs': [logits]} + else: # infer in LM phase embedding_fn = lambda ids: tf.cast(tf.nn.embedding_lookup( self.enc_emb_w, ids, @@ -306,7 +350,7 @@ def _encode(self, input_dict): ), output_layer=self._output_layer, ) - maximum_iterations = tf.constant(200) + maximum_iterations = tf.constant(self._num_tokens_gen) final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode( decoder=decoder, diff 
--git a/open_seq2seq/losses/cross_entropy_loss.py b/open_seq2seq/losses/cross_entropy_loss.py index 33b59da28..ded3caa1d 100644 --- a/open_seq2seq/losses/cross_entropy_loss.py +++ b/open_seq2seq/losses/cross_entropy_loss.py @@ -17,5 +17,4 @@ def __init__(self, params, model, name="cross_entropy_loss"): def _compute_loss(self, input_dict): logits = input_dict['decoder_output']['logits'] labels = input_dict['target_tensors'][0] - loss = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels) - return loss + return tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels) diff --git a/open_seq2seq/losses/sequence_loss.py b/open_seq2seq/losses/sequence_loss.py index a6d0edeca..81ce3bc88 100644 --- a/open_seq2seq/losses/sequence_loss.py +++ b/open_seq2seq/losses/sequence_loss.py @@ -262,7 +262,7 @@ def _compute_loss(self, input_dict): labels = input_dict["target_tensors"][0] def _pad_tensors_to_same_length(x, y): - """Pad x and y so that the results have the + """ Pad x and y so that the results have the same length (second dimension). """ with tf.name_scope("pad_to_same_length"): @@ -311,7 +311,9 @@ def _pad_tensors_to_same_length(x, y): class BasicSampledSequenceLoss(Loss): """ - Basic sequence-to-sequence loss. This one does not use one-hot encodings + Basic sampled sequence-to-sequence loss. This is used when the full softmax + is computational prohibitive. + This one does not use one-hot encodings. """ @staticmethod def get_required_params(): @@ -371,19 +373,29 @@ def _compute_loss(self, input_dict): tgt_lengths = input_dict['target_tensors'][1] if 'weights' in input_dict['decoder_output']: - print('DOING SAMPLED LOSS') + print("Because 'weights' is in the input_dict, we are using sampled softmax loss.") inputs = input_dict["decoder_output"]['inputs'] self._hid_dim = inputs.get_shape().as_list()[-1] inputs = tf.reshape(inputs, (-1, self._hid_dim)) targets = tf.reshape(target_sequence, (-1, 1)) - crossent = tf.nn.sampled_softmax_loss(input_dict["decoder_output"]['weights'], - input_dict[ - "decoder_output"]['bias'], - targets, - inputs, - input_dict['decoder_output'][ - 'num_sampled'], - self._tgt_vocab_size) + + weights = input_dict["decoder_output"]['weights'] + biases = input_dict["decoder_output"]['bias'] + + if inputs.dtype.base_dtype != tf.float32: + inputs = tf.cast(inputs, tf.float32) + if weights.dtype.base_dtype != tf.float32: + weights = tf.cast(weights, tf.float32) + if biases.dtype.base_dtype != tf.float32: + biases = tf.cast(biases, tf.float32) + crossent = tf.nn.sampled_softmax_loss(weights, + biases, + targets, + inputs, + input_dict['decoder_output']['num_sampled'], + self._tgt_vocab_size) + + if self._average_across_timestep: loss = tf.reduce_mean(crossent) else: @@ -391,6 +403,7 @@ def _compute_loss(self, input_dict): loss /= self._batch_size else: + print("Because 'weights' is not in the input_dict, we are using normal softmax loss.") logits = input_dict["decoder_output"]["logits"] if self._offset_target_by_one: diff --git a/open_seq2seq/models/encoder_decoder.py b/open_seq2seq/models/encoder_decoder.py index da8cb7c7f..08b258c35 100644 --- a/open_seq2seq/models/encoder_decoder.py +++ b/open_seq2seq/models/encoder_decoder.py @@ -7,7 +7,6 @@ from open_seq2seq.models.model import Model from open_seq2seq.utils.utils import deco_print - class EncoderDecoderModel(Model): """ Standard encoder-decoder class with one encoder and one decoder. 
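Aside on the mixed-precision handling in `BasicSampledSequenceLoss` above: the loss casts its inputs, weights, and bias to float32 before calling `tf.nn.sampled_softmax_loss`, so the sampled loss is always evaluated in full precision even when the rest of the model runs with the "mixed" dtype. A self-contained sketch of the same pattern (the wrapper function, tensor names, and shapes are illustrative assumptions, not the loss class itself):

    import tensorflow as tf

    def sampled_sequence_loss(weights, biases, inputs, targets,
                              num_sampled, vocab_size):
        # Force float32: the sampled softmax is computed in full precision
        # even if the encoder produced float16 activations.
        if inputs.dtype.base_dtype != tf.float32:
            inputs = tf.cast(inputs, tf.float32)
        if weights.dtype.base_dtype != tf.float32:
            weights = tf.cast(weights, tf.float32)
        if biases.dtype.base_dtype != tf.float32:
            biases = tf.cast(biases, tf.float32)
        crossent = tf.nn.sampled_softmax_loss(
            weights=weights,       # [vocab_size, hidden_dim]
            biases=biases,         # [vocab_size]
            labels=targets,        # [batch * time, 1]
            inputs=inputs,         # [batch * time, hidden_dim]
            num_sampled=num_sampled,
            num_classes=vocab_size)
        return tf.reduce_mean(crossent)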
@@ -64,8 +63,8 @@ def __init__(self, params, mode="train", hvd=None): * **loss_params** (dict) --- dictionary with loss configuration. For complete list of possible parameters see the corresponding class docs. """ - super(EncoderDecoderModel, self).__init__( - params=params, mode=mode, hvd=hvd) + super(EncoderDecoderModel, self).__init__(params=params, mode=mode, hvd=hvd) + if 'encoder_params' not in self.params: self.params['encoder_params'] = {} if 'decoder_params' not in self.params: diff --git a/open_seq2seq/models/lstm_lm.py b/open_seq2seq/models/lstm_lm.py index b01df9174..075d1102e 100644 --- a/open_seq2seq/models/lstm_lm.py +++ b/open_seq2seq/models/lstm_lm.py @@ -1,61 +1,113 @@ +import random +import numpy as np import tensorflow as tf from .encoder_decoder import EncoderDecoderModel +from open_seq2seq.data import WKTDataLayer from open_seq2seq.utils.utils import deco_print, array_to_string +from open_seq2seq.utils import metrics class LSTMLM(EncoderDecoderModel): """ - An example class implementing classical text-to-text model. + An example class implementing an LSTM language model. """ + def __init__(self, params, mode="train", hvd=None): + super(EncoderDecoderModel, self).__init__(params=params, mode=mode, hvd=hvd) + + if 'encoder_params' not in self.params: + self.params['encoder_params'] = {} + if 'decoder_params' not in self.params: + self.params['decoder_params'] = {} + if 'loss_params' not in self.params: + self.params['loss_params'] = {} + + self._lm_phase = isinstance(self.get_data_layer(), WKTDataLayer) + + self._encoder = self._create_encoder() + self._decoder = self._create_decoder() + if self.mode == 'train' or self.mode == 'eval': + self._loss_computator = self._create_loss() + else: + self._loss_computator = None + + self.delimiter = self.get_data_layer().delimiter def _create_encoder(self): + self._print_f1 = False self.params['encoder_params']['vocab_size'] = ( - self.get_data_layer().params['vocab_size'] - ) - self.params['encoder_params']['output_dim'] = ( - self.get_data_layer().params['vocab_size'] + self.get_data_layer().vocab_size ) self.params['encoder_params']['end_token'] = ( - self.get_data_layer().params['end_token'] - ) - self.params['encoder_params']['seed_tokens'] = ( - self.get_data_layer().params['seed_tokens'] + self.get_data_layer().end_token ) self.params['encoder_params']['batch_size'] = ( - self.get_data_layer().params['batch_size'] + self.get_data_layer().batch_size ) + if not self._lm_phase: + self.params['encoder_params']['fc_dim'] = ( + self.get_data_layer().num_classes + ) + if self.params['encoder_params']['fc_dim'] == 2: + self._print_f1 = True + if self._lm_phase: + self.params['encoder_params']['seed_tokens'] = ( + self.get_data_layer().params['seed_tokens'] + ) return super(LSTMLM, self)._create_encoder() def _create_loss(self): - # self.params['loss_params']['batch_size'] = self.params['batch_size_per_gpu'] - self.params['loss_params']['batch_size'] = self.get_data_layer().params['batch_size'] - - self.params['loss_params']['tgt_vocab_size'] = ( - self.get_data_layer().params['vocab_size'] - ) + if self._lm_phase: + self.params['loss_params']['batch_size'] = ( + self.get_data_layer().batch_size + ) + self.params['loss_params']['tgt_vocab_size'] = ( + self.get_data_layer().vocab_size + ) return super(LSTMLM, self)._create_loss() def infer(self, input_values, output_values): - vocab = self.get_data_layer().corp.dictionary.idx2word - seed_tokens = self.params['encoder_params']['seed_tokens'] - for i in range(len(seed_tokens)): - 
print(output_values[0][i].shape) - print('Seed:', vocab[seed_tokens[i]] + '\n') - deco_print( - "Output: " + array_to_string( - output_values[0][i], + if self._lm_phase: + vocab = self.get_data_layer().corp.dictionary.idx2word + seed_tokens = self.params['encoder_params']['seed_tokens'] + for i in range(len(seed_tokens)): + print('Seed:', vocab[seed_tokens[i]] + '\n') + deco_print( + "Output: " + array_to_string( + output_values[0][i], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + return [] + else: + ex, elen_x = input_values['source_tensors'] + ey, elen_y = None, None + if 'target_tensors' in input_values: + ey, elen_y = input_values['target_tensors'] + + n_samples = len(ex) + results = [] + for i in range(n_samples): + current_x = array_to_string( + ex[i][:elen_x[i]], vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], + delim=self.delimiter, ), - offset=4, - ) + current_pred = np.argmax(output_values[0][i]) + curret_y = None + if ey is not None: + current_y = np.argmax(ey[i]) + + results.append((current_x[0], current_pred, current_y)) + return results + def maybe_print_logs(self, input_values, output_values, training_step): x, len_x = input_values['source_tensors'] - y, len_y = input_values['target_tensors'] - # samples = output_values[0][0] + y, len_y = input_values['target_tensors'] x_sample = x[0] len_x_sample = len_x[0] @@ -66,19 +118,49 @@ def maybe_print_logs(self, input_values, output_values, training_step): "Train Source[0]: " + array_to_string( x_sample[:len_x_sample], vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], - ), - offset=4, - ) - deco_print( - "Train Target[0]: " + array_to_string( - y_sample[:len_y_sample], - vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], + delim=self.delimiter, ), offset=4, ) + if self._lm_phase: + deco_print( + "Train Target[0]: " + array_to_string( + y_sample[:len_y_sample], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + else: + deco_print( + "TRAIN Target[0]: " + str(np.argmax(y_sample)), + offset=4, + ) + samples = output_values[0][0] + deco_print( + "TRAIN Prediction[0]: " + str(samples), + offset=4, + ) + labels = np.argmax(y, 1) + preds = np.argmax(output_values[0], axis=-1) + print('Labels', labels) + print('Preds', preds) + + deco_print( + "Accuracy: {:.4f}".format(metrics.accuracy(labels, preds)), + offset = 4, + ) + + if self._print_f1: + deco_print( + "Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f}" + .format(metrics.precision(labels, preds), + metrics.recall(labels, preds), + metrics.f1(labels, preds)), + offset = 4, + ) + return {} def evaluate(self, input_values, output_values): @@ -89,33 +171,129 @@ def evaluate(self, input_values, output_values): len_x_sample = elen_x[0] y_sample = ey[0] len_y_sample = elen_y[0] + + return_values = {} + + if self._lm_phase: + flip = random.random() + if flip <= 0.9: + return return_values + + deco_print( + "*****EVAL Source[0]: " + array_to_string( + x_sample[:len_x_sample], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + samples = np.argmax(output_values[0][0], axis=-1) + deco_print( + "*****EVAL Target[0]: " + array_to_string( + y_sample[:len_y_sample], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + + deco_print( 
+ "*****EVAL Prediction[0]: " + array_to_string( + samples, + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + else: + deco_print( + "*****EVAL Source[0]: " + array_to_string( + x_sample[:len_x_sample], + vocab=self.get_data_layer().corp.dictionary.idx2word, + delim=self.delimiter, + ), + offset=4, + ) + samples = output_values[0][0] + deco_print( + "EVAL Target[0]: " + str(np.argmax(y_sample)), + offset=4, + ) + deco_print( + "EVAL Prediction[0]: " + str(samples), + offset=4, + ) + + labels = np.argmax(ey, 1) + preds = np.argmax(output_values[0], axis=-1) + print('Labels', labels) + print('Preds', preds) + + return_values['accuracy'] = metrics.accuracy(labels, preds) + + if self._print_f1: + return_values['true_pos'] = metrics.true_positives(labels, preds) + return_values['pred_pos'] = np.sum(preds) + return_values['actual_pos'] = np.sum(labels) + + return return_values + + def finalize_evaluation(self, results_per_batch, training_step=None): + accuracies = [] + true_pos, pred_pos, actual_pos = 0.0, 0.0, 0.0 + + for results in results_per_batch: + if not 'accuracy' in results: + return {} + accuracies.append(results['accuracy']) + if 'true_pos' in results: + true_pos += results['true_pos'] + pred_pos += results['pred_pos'] + actual_pos += results['actual_pos'] deco_print( - "*****EVAL Source[0]: " + array_to_string( - x_sample[:len_x_sample], - vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], - ), - offset=4, - ) - deco_print( - "*****EVAL Target[0]: " + array_to_string( - y_sample[:len_y_sample], - vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], - ), - offset=4, - ) - samples = output_values[0][0] - deco_print( - "*****EVAL Prediction[0]: " + array_to_string( - samples, - vocab=self.get_data_layer().corp.dictionary.idx2word, - delim=self.get_data_layer().params["delimiter"], - ), - offset=4, + "EVAL Accuracy: {:.4f}".format(np.mean(accuracies)), + offset = 4, ) + if true_pos > 0: + prec = true_pos / pred_pos + rec = true_pos / actual_pos + f1 = 2.0 * prec * rec / (rec + prec) + deco_print( + "EVAL Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f} | True pos: {}" + .format(prec, rec, f1, true_pos), + offset = 4, + ) + return {} + + def finalize_inference(self, results_per_batch, output_file): + out = open(output_file, 'w') + out.write('\t'.join(['Source', 'Pred', 'Label']) + '\n') + preds, labels = [], [] + + for results in results_per_batch: + for x, pred, y in results: + out.write('\t'.join([x, str(pred), str(y)]) + '\n') + preds.append(pred) + labels.append(y) + + if len(labels) > 0 and labels[0] is not None: + preds = np.asarray(preds) + labels = np.asarray(labels) + deco_print( + "TEST Accuracy: {:.4f}".format(metrics.accuracy(labels, preds)), + offset = 4, + ) + deco_print( + "TEST Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f}" + .format(metrics.precision(labels, preds), + metrics.recall(labels, preds), + metrics.f1(labels, preds)), + offset = 4, + ) + return {} + def _get_num_objects_per_step(self, worker_id=0): """Returns number of source tokens + number of target tokens in batch.""" data_layer = self.get_data_layer(worker_id) diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 489cc4410..81b9b2abd 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -57,14 +57,15 @@ class :meth:`__init__` method. 
'num_gpus': int, # cannot be used when gpu_ids is specified 'gpu_ids': list, # cannot be used when num_gpus is specified + 'load_model': str, + 'save_summaries_steps': None, # could be int or None 'print_loss_steps': None, # could be int or None 'print_samples_steps': None, # could be int or None 'print_bench_info_steps': None, # could be int or None 'save_checkpoint_steps': None, # could be int or None - 'restore_best_checkpoint': bool, # whether to restore best check point + 'restore_best_checkpoint': bool, # if True,restore best check point instead of latest checkpoint 'eval_steps': int, - 'base_logdir': str, 'finetune': bool, 'eval_batch_size_per_gpu': int, @@ -89,6 +90,8 @@ class :meth:`__init__` method. 'loss_scaling_params': dict, 'summaries': list, 'iter_size': int, + 'lm_vocab_file': str, #TODO: move this paramters to lstm_lm.py + 'processed_data_folder': str, } def __init__(self, params, mode="train", hvd=None): @@ -121,6 +124,20 @@ def __init__(self, params, mode="train", hvd=None): used if ``num_gpus`` is specified. When ``use_horovod`` is True this parameter is ignored. * **batch_size_per_gpu** (int) --- batch size to use for each GPU. + * **eval_batch_size_per_gpu** (int) --- batch size to use for each GPU during + inference. This is for when training and inference have different computation + and memory requirements, such as when training uses sampled softmax and + inference uses full softmax. If not specified, it's set + to ``batch_size_per_gpu``. + * **restore_best_checkpoint** (bool) --- if set to True, when doing evaluation + and inference, the model will load the best checkpoint instead of the latest + checkpoint. Best checkpoint is evaluated based on evaluation results, so + it's only available when the model is trained untder ``train_eval`` mode. + Default to False. + * **load_model** (str) --- points to the location of the pretrained model for + transfer learning. If specified, during training, the system will look + into the checkpoint in this folder and restore all variables whose names and + shapes match a variable in the new model. * **num_epochs** (int) --- number of epochs to run training for. This parameter cannot be used if ``max_steps`` is specified. * **max_steps** (int) --- number of steps to run training for. @@ -234,7 +251,9 @@ class docs. self._params['print_bench_info_steps'] = None self._params['finetune'] = self._params.get('finetune', False) - self._params['base_logdir'] = self._params.get('base_logdir', None) + # self._params['base_logdir'] = self._params.get('base_logdir', None) + self._params['load_model'] = self._params.get('load_model', None) + self._params['load_fc'] = self._params.get('load_fc', False) self._params['eval_batch_size_per_gpu'] = self._params.get( 'eval_batch_size_per_gpu', self._params['batch_size_per_gpu'] @@ -277,9 +296,14 @@ class docs. 
dl_params['batch_size'] = self._params['batch_size_per_gpu'] else: dl_params['batch_size'] = self._params['eval_batch_size_per_gpu'] + if 'lm_vocab_file' in self._params: + dl_params['lm_vocab_file'] = self._params['lm_vocab_file'] + if 'processed_data_folder' in self._params: + dl_params['processed_data_folder'] = self._params['processed_data_folder'] dl_params['mode'] = self._mode dl_params['interactive'] = self._interactive + if self.on_horovod: self._data_layer = self._params['data_layer']( params=dl_params, model=self, @@ -353,7 +377,6 @@ def compile(self, force_var_reuse=False, checkpoint=None, use_trt=False, precisi else: self.get_data_layer(gpu_cnt).build_graph() input_tensors = self.get_data_layer(gpu_cnt).input_tensors - loss, self._outputs[gpu_cnt] = self.build_forward_pass_graph( input_tensors, gpu_id=gpu_cnt, @@ -366,6 +389,7 @@ def compile(self, force_var_reuse=False, checkpoint=None, use_trt=False, precisi raise ValueError('Decoder outputs have to be either None or list') if self._mode == "train" or self._mode == "eval": losses.append(loss) + # end of for gpu_ind loop if self._mode == "train": self.loss = tf.reduce_mean(losses) @@ -386,8 +410,13 @@ def compile(self, force_var_reuse=False, checkpoint=None, use_trt=False, precisi self.get_data_layer().build_graph() input_tensors = self.get_data_layer().input_tensors - loss, self._output = self._build_forward_pass_graph(input_tensors, + all_loss, self._output = self._build_forward_pass_graph(input_tensors, gpu_id=0) + if isinstance(all_loss, (dict,)): + loss = all_loss['loss'] + else: + loss = all_loss + if self._output is not None and not isinstance(self._output, list): raise ValueError('Decoder outputs have to be either None or list') diff --git a/open_seq2seq/models/text2text.py b/open_seq2seq/models/text2text.py index bbe42042c..5a816dd1a 100644 --- a/open_seq2seq/models/text2text.py +++ b/open_seq2seq/models/text2text.py @@ -93,7 +93,7 @@ def infer(self, input_values, output_values): SpecialTextTokens.EOS_ID), PAD_ID=self.decoder.params.get('PAD_SYMBOL', SpecialTextTokens.PAD_ID), - ignore_special=True, delim=' ', + ignore_special=True, delim=' ', )) input_strings.append(text_ids_to_string( input_sample[i], @@ -101,7 +101,7 @@ def infer(self, input_values, output_values): S_ID=self.decoder.params.get('GO_SYMBOL', SpecialTextTokens.S_ID.value), EOS_ID=self.decoder.params.get('END_SYMBOL', - SpecialTextTokens.EOS_ID.value), + SpecialTextTokens.EOS_ID.value), PAD_ID=self.decoder.params.get('PAD_SYMBOL', SpecialTextTokens.PAD_ID), ignore_special=True, delim=' ', diff --git a/open_seq2seq/parts/rnns/utils.py b/open_seq2seq/parts/rnns/utils.py index 7d4a1d9b5..a2d167cba 100644 --- a/open_seq2seq/parts/rnns/utils.py +++ b/open_seq2seq/parts/rnns/utils.py @@ -52,8 +52,6 @@ def single_cell( if awd_initializer: val = 1.0/math.sqrt(cell_params['num_units']) cell_params['initializer'] = tf.random_uniform_initializer(minval=-val, maxval=val) - # else: - # cell_params['initializer'] = tf.contrib.layers.xavier_initializer() if 'WeightDropLayerNormBasicLSTMCell' in str(cell_class): if recurrent_keep_prob < 1.0: cell_params['recurrent_keep_prob'] = recurrent_keep_prob diff --git a/open_seq2seq/parts/rnns/weight_drop.py b/open_seq2seq/parts/rnns/weight_drop.py index 3280bc496..bc25b3198 100644 --- a/open_seq2seq/parts/rnns/weight_drop.py +++ b/open_seq2seq/parts/rnns/weight_drop.py @@ -2,6 +2,7 @@ class WeightDropLayerNormBasicLSTMCell(tf.contrib.rnn.RNNCell): """LSTM unit with layer normalization, weight dropout, and recurrent dropout. 
+ This is based on the standard implementation of LayerNormBasicLSTMCell. This class adds layer normalization and recurrent dropout to a basic LSTM unit. Layer normalization implementation is based on: https://arxiv.org/abs/1607.06450. @@ -32,8 +33,6 @@ def __init__(self, dtype=None): """Initializes the basic LSTM cell. Args: - input_weight is W - recurrent_weight is U num_units: int, The number of units in the LSTM cell. forget_bias: float, The bias added to forget gates (see above). input_size: Deprecated and unused. @@ -43,9 +42,14 @@ def __init__(self, `layer_norm` has been set to `False`, this argument will be ignored. norm_shift: float, The layer normalization shift initial value. If `layer_norm` has been set to `False`, this argument will be ignored. - dropout_keep_prob: unit Tensor or float between 0 and 1 representing the - recurrent dropout probability value. If float and 1.0, no dropout will - be applied. + input_weight_keep_prob: keep probability for dropout of W + (kernel used to multiply with the input tensor) + recurrent_weight_keep_prob: keep probability for dropout of U + (kernel used to multiply with last hidden state tensor) + recurrent_keep_prob: keep probability for dropout + when applying tanh for the input transform step + weight_variational: whether to keep the same weight dropout mask + at every timestep. This feature is not yet implemented. dropout_prob_seed: (optional) integer, the randomness seed. reuse: (optional) Python boolean describing whether to reuse variables in an existing scope. If not `True`, and the existing scope already has @@ -88,7 +92,7 @@ def _norm(self, inp, scope, dtype=tf.float32): shape = inp.get_shape()[-1:] gamma_init = tf.constant_initializer(self._norm_gain) beta_init = tf.constant_initializer(self._norm_shift) - with tf.variable_scope(scope): # replace vs with tf. vs stands for va + with tf.variable_scope(scope): # Initialize beta and gamma for use by layer_norm.
tf.get_variable("gamma", shape=shape, initializer=gamma_init, dtype=dtype) tf.get_variable("beta", shape=shape, initializer=beta_init, dtype=dtype) @@ -117,6 +121,9 @@ def _linear(self, args, inputs_shape, h_shape): return out def _variational_dropout(self, values, noise, keep_prob): + ''' + TODO: Implement variational dropout for weight dropout + ''' return tf.nn.dropout(values, keep_prob, seed=self._dropout_seed) def _dropout(self, values, dropout_noise, keep_prob): diff --git a/open_seq2seq/utils/funcs.py b/open_seq2seq/utils/funcs.py index a81e59309..cce788bd5 100644 --- a/open_seq2seq/utils/funcs.py +++ b/open_seq2seq/utils/funcs.py @@ -14,7 +14,8 @@ collect_if_horovod from .hooks import PrintSamplesHook, RunEvaluationHook, PrintLossAndTimeHook, \ BroadcastGlobalVariablesHook -from open_seq2seq.models import LSTMLM +from .helpers import TransferMonitoredTrainingSession, TransferScaffold +from open_seq2seq.data import WKTDataLayer def train(train_model, eval_model=None, debug_port=None): @@ -42,8 +43,10 @@ def train(train_model, eval_model=None, debug_port=None): if master_worker: checkpoint_dir = train_model.params['logdir'] + base_ckpt_dir = train_model.params['load_model'] else: checkpoint_dir = None + base_ckpt_dir = None if eval_model is not None: # noinspection PyTypeChecker @@ -52,7 +55,7 @@ def train(train_model, eval_model=None, debug_port=None): every_steps=eval_model.params['eval_steps'], model=eval_model, last_step=train_model.last_step, - print_ppl=isinstance(eval_model, LSTMLM), + print_ppl=isinstance(eval_model.get_data_layer(), WKTDataLayer), ), ) @@ -70,7 +73,7 @@ def train(train_model, eval_model=None, debug_port=None): hooks.append(PrintLossAndTimeHook( every_steps=train_model.params['print_loss_steps'], model=train_model, - print_ppl=isinstance(train_model, LSTMLM) + print_ppl=isinstance(train_model.get_data_layer(), WKTDataLayer), )) if train_model.params['print_samples_steps'] is not None: # noinspection PyTypeChecker @@ -94,10 +97,16 @@ def train(train_model, eval_model=None, debug_port=None): [train_model.get_data_layer(i).iterator.initializer for i in range(train_model.num_gpus)] ) - - scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer) - ) + + fine_tuning = (not base_ckpt_dir) or tf.train.latest_checkpoint(checkpoint_dir) + if fine_tuning: + scaffold = tf.train.Scaffold( + local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer) + ) + else: + scaffold = TransferScaffold( + local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer) + ) fetches = [train_model.train_op] try: total_objects = 0.0 @@ -109,7 +118,8 @@ def train(train_model, eval_model=None, debug_port=None): "train model does not define get_num_objects_per_step method.") # starting training - with tf.train.MonitoredTrainingSession( + if fine_tuning: + sess = TransferMonitoredTrainingSession( scaffold=scaffold, checkpoint_dir=checkpoint_dir, save_summaries_steps=train_model.params['save_summaries_steps'], @@ -118,43 +128,55 @@ def train(train_model, eval_model=None, debug_port=None): log_step_count_steps=train_model.params['save_summaries_steps'], stop_grace_period_secs=300, hooks=hooks, - ) as sess: - step = 0 - num_bench_updates = 0 - while True: - if sess.should_stop(): - break - tm = time.time() - try: - feed_dict = {} - iter_size = train_model.params.get('iter_size', 1) - if iter_size > 1: - feed_dict[train_model.skip_update_ph] = step % iter_size != 0 - if step % iter_size == 0: - if step >= bench_start: - 
num_bench_updates += 1 - fetches_vals = sess.run(fetches, feed_dict) - else: - # necessary to skip "no-update" steps when iter_size > 1 - def run_with_no_hooks(step_context): - return step_context.session.run(fetches, feed_dict) - fetches_vals = sess.run_step_fn(run_with_no_hooks) - except tf.errors.OutOfRangeError: - break - if step >= bench_start: - total_time += time.time() - tm - if len(fetches) > 1: - for i in range(train_model.num_gpus): - total_objects += np.sum(fetches_vals[i + 1]) - if train_model.params['print_bench_info_steps'] is not None: - if step % train_model.params['print_bench_info_steps'] == 0: - total_objects_cur = collect_if_horovod(total_objects, hvd, - mode="sum") - if master_worker: - avg_objects = 1.0 * total_objects_cur / total_time - deco_print("Avg objects per second: {:.3f}".format(avg_objects)) - - step += 1 + base_ckpt_dir=base_ckpt_dir, + load_fc=train_model.params['load_fc']) + else: + sess = tf.train.MonitoredTrainingSession( + scaffold=scaffold, + checkpoint_dir=checkpoint_dir, + save_summaries_steps=train_model.params['save_summaries_steps'], + config=sess_config, + save_checkpoint_secs=None, + log_step_count_steps=train_model.params['save_summaries_steps'], + stop_grace_period_secs=300, + hooks=hooks) + step = 0 + num_bench_updates = 0 + while True: + if sess.should_stop(): + break + tm = time.time() + try: + feed_dict = {} + iter_size = train_model.params.get('iter_size', 1) + if iter_size > 1: + feed_dict[train_model.skip_update_ph] = step % iter_size != 0 + if step % iter_size == 0: + if step >= bench_start: + num_bench_updates += 1 + fetches_vals = sess.run(fetches, feed_dict) + else: + # necessary to skip "no-update" steps when iter_size > 1 + def run_with_no_hooks(step_context): + return step_context.session.run(fetches, feed_dict) + fetches_vals = sess.run_step_fn(run_with_no_hooks) + except tf.errors.OutOfRangeError: + break + if step >= bench_start: + total_time += time.time() - tm + if len(fetches) > 1: + for i in range(train_model.num_gpus): + total_objects += np.sum(fetches_vals[i + 1]) + if train_model.params['print_bench_info_steps'] is not None: + if step % train_model.params['print_bench_info_steps'] == 0: + total_objects_cur = collect_if_horovod(total_objects, hvd, + mode="sum") + if master_worker: + avg_objects = 1.0 * total_objects_cur / total_time + deco_print("Avg objects per second: {:.3f}".format(avg_objects)) + + step += 1 + sess.close() if len(fetches) > 1: total_objects = collect_if_horovod(total_objects, hvd, mode="sum") diff --git a/open_seq2seq/utils/helpers.py b/open_seq2seq/utils/helpers.py new file mode 100644 index 000000000..476a330f1 --- /dev/null +++ b/open_seq2seq/utils/helpers.py @@ -0,0 +1,480 @@ +''' +This file modifies standard TensorFlow modules necessary for transfer learning, +such as MonitoredTrainingSession, ChiefSessionCreator, Scaffold, SessionManager +''' + +import time + +import tensorflow as tf +from tensorflow.python.tools import inspect_checkpoint as chkp +from tensorflow.python import pywrap_tensorflow +from tensorflow.python.ops import resources +from tensorflow.python.training import saver as training_saver + +# Value that indicates no value was provided. 
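The `load_model` flow that `helpers.py` supports amounts to restoring only the variables whose names and shapes exist in the base checkpoint, leaving the remaining variables (for example a new classification head) at their fresh initialization. The actual restore path goes through `restore_certain_variables` inside the `TransferSessionManager` machinery below; the standalone helper here is only an illustration of the idea, with an assumed name:

    import tensorflow as tf
    from tensorflow.python import pywrap_tensorflow

    def restore_matching_variables(sess, ckpt_file):
        reader = pywrap_tensorflow.NewCheckpointReader(ckpt_file)
        ckpt_shapes = reader.get_variable_to_shape_map()
        # Keep only graph variables that the checkpoint can actually fill.
        to_restore = [
            var for var in tf.global_variables()
            if var.op.name in ckpt_shapes
            and var.get_shape().as_list() == ckpt_shapes[var.op.name]
        ]
        tf.train.Saver(var_list=to_restore).restore(sess, ckpt_file)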
+USE_DEFAULT = object() + +def TransferMonitoredTrainingSession(master='', # pylint: disable=invalid-name + is_chief=True, + checkpoint_dir=None, + scaffold=None, + hooks=None, + chief_only_hooks=None, + save_checkpoint_secs=USE_DEFAULT, + save_summaries_steps=USE_DEFAULT, + save_summaries_secs=USE_DEFAULT, + config=None, + stop_grace_period_secs=120, + log_step_count_steps=100, + max_wait_secs=7200, + save_checkpoint_steps=USE_DEFAULT, + summary_dir=None, + base_ckpt_dir=None, + load_fc=False): + """Creates a `MonitoredSession` for training. + For a chief, this utility sets proper session initializer/restorer. It also + creates hooks related to checkpoint and summary saving. For workers, this + utility sets proper session creator which waits for the chief to + initialize/restore. Please check `tf.train.MonitoredSession` for more + information. + Args: + master: `String` the TensorFlow master to use. + is_chief: If `True`, it will take care of initialization and recovery the + underlying TensorFlow session. If `False`, it will wait on a chief to + initialize or recover the TensorFlow session. + checkpoint_dir: A string. Optional path to a directory where to restore + variables. + scaffold: A `Scaffold` used for gathering or building supportive ops. If + not specified, a default one is created. It's used to finalize the graph. + hooks: Optional list of `SessionRunHook` objects. + chief_only_hooks: list of `SessionRunHook` objects. Activate these hooks if + `is_chief==True`, ignore otherwise. + save_checkpoint_secs: The frequency, in seconds, that a checkpoint is saved + using a default checkpoint saver. If both `save_checkpoint_steps` and + `save_checkpoint_secs` are set to `None`, then the default checkpoint + saver isn't used. If both are provided, then only `save_checkpoint_secs` + is used. Default 600. + save_summaries_steps: The frequency, in number of global steps, that the + summaries are written to disk using a default summary saver. If both + `save_summaries_steps` and `save_summaries_secs` are set to `None`, then + the default summary saver isn't used. Default 100. + save_summaries_secs: The frequency, in secs, that the summaries are written + to disk using a default summary saver. If both `save_summaries_steps` and + `save_summaries_secs` are set to `None`, then the default summary saver + isn't used. Default not enabled. + config: an instance of `tf.ConfigProto` proto used to configure the session. + It's the `config` argument of constructor of `tf.Session`. + stop_grace_period_secs: Number of seconds given to threads to stop after + `close()` has been called. + log_step_count_steps: The frequency, in number of global steps, that the + global step/sec is logged. + max_wait_secs: Maximum time workers should wait for the session to + become available. This should be kept relatively short to help detect + incorrect code, but sometimes may need to be increased if the chief takes + a while to start up. + save_checkpoint_steps: The frequency, in number of global steps, that a + checkpoint is saved using a default checkpoint saver. If both + `save_checkpoint_steps` and `save_checkpoint_secs` are set to `None`, then + the default checkpoint saver isn't used. If both are provided, then only + `save_checkpoint_secs` is used. Default not enabled. + summary_dir: A string. Optional path to a directory where to + save summaries. If None, checkpoint_dir is used instead. + Returns: + A `MonitoredSession` object. 
+ """ + if save_summaries_steps == USE_DEFAULT and save_summaries_secs == USE_DEFAULT: + save_summaries_steps = 100 + save_summaries_secs = None + elif save_summaries_secs == USE_DEFAULT: + save_summaries_secs = None + elif save_summaries_steps == USE_DEFAULT: + save_summaries_steps = None + + if (save_checkpoint_steps == USE_DEFAULT and + save_checkpoint_secs == USE_DEFAULT): + save_checkpoint_steps = None + save_checkpoint_secs = 600 + elif save_checkpoint_secs == USE_DEFAULT: + save_checkpoint_secs = None + elif save_checkpoint_steps == USE_DEFAULT: + save_checkpoint_steps = None + + if not is_chief: + session_creator = tf.train.WorkerSessionCreator( + scaffold=scaffold, + master=master, + config=config, + max_wait_secs=max_wait_secs) + return tf.train.MonitoredSession(session_creator=session_creator, hooks=hooks or [], + stop_grace_period_secs=stop_grace_period_secs) + + all_hooks = [] + if chief_only_hooks: + all_hooks.extend(chief_only_hooks) + + if not base_ckpt_dir or tf.train.latest_checkpoint(checkpoint_dir): + # if no base checkpoint or if checkpoint for the current model already exists + session_creator = tf.train.ChiefSessionCreator( + scaffold=scaffold, + checkpoint_dir=checkpoint_dir, + master=master, + config=config) + + else: # load variables from the base model's checkpoint + print("Loading the base model") + session_creator = TransferChiefSessionCreator( + scaffold=scaffold, + checkpoint_dir=base_ckpt_dir, + master=master, + config=config, + load_fc=load_fc) + + summary_dir = summary_dir or checkpoint_dir + if summary_dir: + if log_step_count_steps and log_step_count_steps > 0: + all_hooks.append( + tf.train.StepCounterHook( + output_dir=summary_dir, every_n_steps=log_step_count_steps)) + + if (save_summaries_steps and save_summaries_steps > 0) or ( + save_summaries_secs and save_summaries_secs > 0): + all_hooks.append(tf.train.SummarySaverHook( + scaffold=scaffold, + save_steps=save_summaries_steps, + save_secs=save_summaries_secs, + output_dir=summary_dir)) + + if checkpoint_dir: + if (save_checkpoint_secs and save_checkpoint_secs > 0) or ( + save_checkpoint_steps and save_checkpoint_steps > 0): + all_hooks.append(tf.train.CheckpointSaverHook( + checkpoint_dir, + save_steps=save_checkpoint_steps, + save_secs=save_checkpoint_secs, + scaffold=scaffold)) + + if hooks: + all_hooks.extend(hooks) + return tf.train.MonitoredSession(session_creator=session_creator, hooks=all_hooks, + stop_grace_period_secs=stop_grace_period_secs) + +class TransferChiefSessionCreator(tf.train.SessionCreator): + def __init__(self, + scaffold=None, + master='', + config=None, + checkpoint_dir=None, + checkpoint_filename_with_path=None, + load_fc=False): + """Initializes a chief session creator. + Args: + scaffold: A `Scaffold` used for gathering or building supportive ops. If + not specified a default one is created. It's used to finalize the graph. + master: `String` representation of the TensorFlow master to use. + config: `ConfigProto` proto used to configure the session. + checkpoint_dir: A string. Optional path to a directory where to restore + variables. + checkpoint_filename_with_path: Full file name path to the checkpoint file. 
+ """ + self._checkpoint_dir = checkpoint_dir + self._checkpoint_filename_with_path = checkpoint_filename_with_path + self._scaffold = scaffold or TransferScaffold() + self._session_manager = None + self._master = master + self._config = config + self._load_fc = load_fc + + def _get_session_manager(self): + if self._session_manager: + return self._session_manager + + self._session_manager = TransferSessionManager( + local_init_op=self._scaffold.local_init_op, + ready_op=self._scaffold.ready_op, + ready_for_local_init_op=self._scaffold.ready_for_local_init_op, + graph=tf.get_default_graph()) + return self._session_manager + + def create_session(self): + print('SCAFFOLD TYPE:', type(self._scaffold)) + self._scaffold.finalize() + # tf.get_default_graph()._unsafe_unfinalize() + + return self._get_session_manager().prepare_session( + self._master, + saver=self._scaffold.saver, + checkpoint_dir=self._checkpoint_dir, + checkpoint_filename_with_path=self._checkpoint_filename_with_path, + config=self._config, + init_op=self._scaffold.init_op, + init_feed_dict=self._scaffold.init_feed_dict, + init_fn=self._scaffold.init_fn, + load_fc=self._load_fc) + +class TransferScaffold(tf.train.Scaffold): + def finalize(self): + """Creates operations if needed and finalizes the graph.""" + if self._init_op is None: + def default_init_op(): + return tf.group( + tf.global_variables_initializer(), + resources.initialize_resources(resources.shared_resources())) + self._init_op = TransferScaffold.get_or_default( + 'init_op', + tf.GraphKeys.INIT_OP, + default_init_op) + if self._ready_op is None: + def default_ready_op(): + return tf.concat([ + tf.report_uninitialized_variables(), + resources.report_uninitialized_resources() + ], 0) + self._ready_op = TransferScaffold.get_or_default( + 'ready_op', tf.GraphKeys.READY_OP, + default_ready_op) + if self._ready_for_local_init_op is None: + def default_ready_for_local_init_op(): + return tf.report_uninitialized_variables( + tf.global_variables()) + self._ready_for_local_init_op = TransferScaffold.get_or_default( + 'ready_for_local_init_op', tf.GraphKeys.READY_FOR_LOCAL_INIT_OP, + default_ready_for_local_init_op) + if self._local_init_op is None: + self._local_init_op = TransferScaffold.get_or_default( + 'local_init_op', tf.GraphKeys.LOCAL_INIT_OP, + TransferScaffold.default_local_init_op) + if self._summary_op is None: + self._summary_op = TransferScaffold.get_or_default('summary_op', + tf.GraphKeys.SUMMARY_OP, + tf.summary.merge_all) + # pylint: disable=g-long-lambda + if self._saver is None: + self._saver = training_saver._get_saver_or_default() # pylint: disable=protected-access + # pylint: enable=g-long-lambda + self._saver.build() + + # ops.get_default_graph().finalize() + # logging.info('Graph was finalized.') + return self + +class TransferSessionManager(tf.train.SessionManager): + def _restore_checkpoint(self, + master, + sess, + saver=None, + checkpoint_dir=None, + checkpoint_filename_with_path=None, + wait_for_checkpoint=False, + max_wait_secs=7200, + config=None, + load_fc=False): + """Creates a `Session`, and tries to restore a checkpoint. + Args: + master: `String` representation of the TensorFlow master to use. + saver: A `Saver` object used to restore a model. + checkpoint_dir: Path to the checkpoint files. The latest checkpoint in the + dir will be used to restore. + checkpoint_filename_with_path: Full file name path to the checkpoint file. + wait_for_checkpoint: Whether to wait for checkpoint to become available. 
+ max_wait_secs: Maximum time to wait for checkpoints to become available. + config: Optional `ConfigProto` proto used to configure the session. + Returns: + A pair (sess, is_restored) where 'is_restored' is `True` if + the session could be restored, `False` otherwise. + Raises: + ValueError: If both checkpoint_dir and checkpoint_filename_with_path are + set. + """ + self._target = master + # sess = tf.Session(self._target, graph=self._graph, config=config) + + if checkpoint_dir and checkpoint_filename_with_path: + raise ValueError("Can not provide both checkpoint_dir and " + "checkpoint_filename_with_path.") + # If either saver or checkpoint_* is not specified, cannot restore. Just + # return. + print('checkpoint_dir', checkpoint_dir) + print('checkpoint_filename_with_path', checkpoint_filename_with_path) + if not saver or not (checkpoint_dir or checkpoint_filename_with_path): + return sess, False + + if checkpoint_filename_with_path: + # saver.restore(sess, checkpoint_filename_with_path) + restore_certain_variables(sess, checkpoint_filename_with_path) + return sess, True + + # Waits up until max_wait_secs for checkpoint to become available. + wait_time = 0 + ckpt = tf.train.get_checkpoint_state(checkpoint_dir) + while not ckpt or not ckpt.model_checkpoint_path: + if wait_for_checkpoint and wait_time < max_wait_secs: + tf.logging.info("Waiting for checkpoint to be available.") + time.sleep(self._recovery_wait_secs) + wait_time += self._recovery_wait_secs + ckpt = tf.train.get_checkpoint_state(checkpoint_dir) + else: + return sess, False + + # Loads the checkpoint. + ckpt_file = ckpt.model_checkpoint_path + restore_certain_variables(sess, ckpt_file) + saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths) + return sess, True + + def prepare_session(self, + master, + init_op=None, + saver=None, + checkpoint_dir=None, + checkpoint_filename_with_path=None, + wait_for_checkpoint=False, + max_wait_secs=7200, + config=None, + init_feed_dict=None, + init_fn=None, + load_fc=False): + """Creates a `Session`. Makes sure the model is ready to be used. + Creates a `Session` on 'master'. If a `saver` object is passed in, and + `checkpoint_dir` points to a directory containing valid checkpoint + files, then it will try to recover the model from checkpoint. If + no checkpoint files are available, and `wait_for_checkpoint` is + `True`, then the process would check every `recovery_wait_secs`, + up to `max_wait_secs`, for recovery to succeed. + If the model cannot be recovered successfully then it is initialized by + running the `init_op` and calling `init_fn` if they are provided. + The `local_init_op` is also run after init_op and init_fn, regardless of + whether the model was recovered successfully, but only if + `ready_for_local_init_op` passes. + If the model is recovered from a checkpoint it is assumed that all + global variables have been initialized, in particular neither `init_op` + nor `init_fn` will be executed. + It is an error if the model cannot be recovered and no `init_op` + or `init_fn` or `local_init_op` are passed. + Args: + master: `String` representation of the TensorFlow master to use. + init_op: Optional `Operation` used to initialize the model. + saver: A `Saver` object used to restore a model. + checkpoint_dir: Path to the checkpoint files. The latest checkpoint in the + dir will be used to restore. + checkpoint_filename_with_path: Full file name path to the checkpoint file. + wait_for_checkpoint: Whether to wait for checkpoint to become available. 
+ max_wait_secs: Maximum time to wait for checkpoints to become available.
+ config: Optional `ConfigProto` proto used to configure the session.
+ init_feed_dict: Optional dictionary that maps `Tensor` objects to feed
+ values. This feed dictionary is passed to the session `run()` call when
+ running the init op.
+ init_fn: Optional callable used to initialize the model. Called after the
+ optional `init_op` is called. The callable must accept one argument,
+ the session being initialized.
+ Returns:
+ A `Session` object that can be used to drive the model.
+ Raises:
+ RuntimeError: If the model cannot be initialized or recovered.
+ ValueError: If both checkpoint_dir and checkpoint_filename_with_path are
+ set.
+ """
+ sess = tf.Session(master, graph=self._graph, config=config)
+ if init_op is None and not init_fn and self._local_init_op is None:
+ raise RuntimeError("Model is not initialized and no init_op or "
+ "init_fn or local_init_op was given")
+ if init_op is not None:
+ sess.run(init_op, feed_dict=init_feed_dict)
+ if init_fn:
+ init_fn(sess)
+ sess.run(tf.local_variables_initializer())  # local variables have to exist before the partial restore below
+ print("LOCAL INIT OP", self._local_init_op)
+ sess, is_loaded_from_checkpoint = self._restore_checkpoint(
+ master,
+ sess,
+ saver,
+ checkpoint_dir=checkpoint_dir,
+ checkpoint_filename_with_path=checkpoint_filename_with_path,
+ wait_for_checkpoint=wait_for_checkpoint,
+ max_wait_secs=max_wait_secs,
+ config=config,
+ load_fc=load_fc)
+
+
+ local_init_success, msg = self._try_run_local_init_op(sess)
+ if not local_init_success:
+ raise RuntimeError(
+ "Init operations did not make model ready for local_init. "
+ "Init op: %s, init fn: %s, error: %s" % (_maybe_name(init_op),
+ init_fn,
+ msg))
+
+ is_ready, msg = self._model_ready(sess)
+ if not is_ready:
+ raise RuntimeError(
+ "Init operations did not make model ready. "
+ "Init op: %s, init fn: %s, local_init_op: %s, error: %s" %
+ (_maybe_name(init_op), init_fn, self._local_init_op, msg))
+ return sess
+
+def _restore_embed(embed_var, var_to_shape_map, reader):
+ has_embed = len([var for var in var_to_shape_map if 'EmbeddingMatrix' in var]) > 0
+ if has_embed:
+ return None, False # assume the embedding is stored under the same name
+ for var in var_to_shape_map:
+ if var.endswith('dense/kernel') and var_to_shape_map[var] == tf.transpose(embed_var).shape:
+ print('Assigning', var, 'to', embed_var.name)
+ return embed_var.assign(reader.get_tensor(var).T), True
+ return None, False
+
+def restore_certain_variables(sess, filename):
+ print('Restoring only the variables found in the checkpoint')
+ trainables = {v.name: v for v in tf.trainable_variables()}
+ assign_ops = []
+ vars_to_initialize = []
+
+ try:
+ reader = tf.train.NewCheckpointReader(filename)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ non_loss_var = {var: var_to_shape_map[var] for var in var_to_shape_map if 'Loss_Optimization' not in var}
+ for var in var_to_shape_map:
+ if 'global_step' in var:
+ print('Restoring from the step', reader.get_tensor(var))
+ for name in trainables:
+ idx = name.find(":")
+ if idx != -1:
+ true_name = name[:idx]
+ # if name.endswith(':0'):
+ # true_name = name[:-2]
+ if true_name in var_to_shape_map and trainables[name].shape == var_to_shape_map[true_name]:
+ print('Restoring value to', true_name)
+ assign_ops.append(trainables[name].assign(reader.get_tensor(true_name)))
+ if 'EmbeddingMatrix' in true_name:
+ embed_op, has_embed_op = _restore_embed(trainables[name], var_to_shape_map, reader)
+ if has_embed_op:
+ assign_ops.append(embed_op)
+
+ print('assign_ops', assign_ops)
+ except Exception as e: # pylint: disable=broad-except
+ print(str(e))
+ if "corrupted compressed block contents" in str(e):
+ print("It's likely that your checkpoint file has been compressed "
+ "with SNAPPY.")
+ if ("Data loss" in str(e) and
+ (any([ext in filename for ext in [".index", ".meta", ".data"]]))):
+ proposed_file = ".".join(filename.split(".")[0:-1])
+ v2_file_error_template = """
+ It's likely that this is a V2 checkpoint and you need to provide the filename
+ *prefix*. Try removing the '.' and extension. Try:
+ inspect checkpoint --file_name = {}"""
+ print(v2_file_error_template.format(proposed_file))
+ sess.run(assign_ops)
+
+def _maybe_name(obj):
+ """Returns object name if it has one, or a message otherwise.
+ This is useful for names that appear in error messages.
+ Args:
+ obj: Object to get the name of.
+ Returns:
+ name, "None", or a "no name" message.
+ """ + if obj is None: + return "None" + elif hasattr(obj, "name"): + return obj.name + else: + return "" % type(obj) diff --git a/open_seq2seq/utils/hooks.py b/open_seq2seq/utils/hooks.py index f8222bd17..c8a65bb12 100644 --- a/open_seq2seq/utils/hooks.py +++ b/open_seq2seq/utils/hooks.py @@ -143,12 +143,14 @@ def after_run(self, run_context, run_values): loss = results[0] if not self._model.on_horovod or self._model.hvd.rank() == 0: if self._print_ppl: - deco_print("loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}" + deco_print("Train loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}" .format(loss, math.exp(loss), loss/math.log(2)), start="", end=", ") else: - deco_print("loss: {:.4f} ".format(loss), start="", end=", ") + deco_print( + "Train loss: {:.4f} ".format(loss), + offset=4) tm = (time.time() - self._last_time) / self._every_steps m, s = divmod(tm, 60) @@ -199,6 +201,7 @@ def after_run(self, run_context, run_values): self._model, run_context.session, mode="eval", compute_loss=True, ) + if not self._model.on_horovod or self._model.hvd.rank() == 0: if self._print_ppl: deco_print("Validation loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}" diff --git a/open_seq2seq/utils/metrics.py b/open_seq2seq/utils/metrics.py new file mode 100644 index 000000000..63861ed6b --- /dev/null +++ b/open_seq2seq/utils/metrics.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 NVIDIA Corporation +''' +This file implements function to calcuate basic metrics. +''' +import numpy as np +import tensorflow as tf + +def true_positives(labels, preds): + return np.sum(np.logical_and(labels, preds)) + +def accuracy(labels, preds): + return np.sum(np.equal(labels, preds)) / len(preds) + +def recall(labels, preds): + return true_positives(labels, preds) / np.sum(labels) + +def precision(labels, preds): + return true_positives(labels, preds) / np.sum(preds) + +def f1(labels, preds): + rec = recall(labels, preds) + pre = precision(labels, preds) + if rec == 0 or pre == 0: + return 0 + return 2 * rec * pre / (rec + pre) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index 2d9a0bde9..18038b275 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -197,8 +197,7 @@ def iterate_data(model, sess, compute_loss, mode, verbose, num_steps=None): if verbose: if size_defined: - data_size = int(np.sum(np.ceil(np.array(dl_sizes) / - model.params['batch_size_per_gpu']))) + data_size = int(np.sum(np.ceil(np.array(dl_sizes) / batch_size))) if step == 0 or len(fetches_vals) == 0 or \ (data_size > 10 and processed_batches % (data_size // 10) == 0): deco_print("Processed {}/{} batches{}".format( @@ -229,6 +228,7 @@ def iterate_data(model, sess, compute_loss, mode, verbose, num_steps=None): else: deco_print("Not enough steps for benchmarking{}".format(ending)) + if compute_loss: return results_per_batch, total_loss, np.sum(total_samples) else: @@ -486,7 +486,7 @@ def get_base_config(args): parser.add_argument("--mode", default='train', help="Could be \"train\", \"eval\", " "\"train_eval\" or \"infer\"") - parser.add_argument("--infer_output_file", + parser.add_argument("--infer_output_file", default='infer-out.txt', help="Path to the output of inference") parser.add_argument('--continue_learning', dest='continue_learning', action='store_true', help="whether to continue learning") @@ -568,6 +568,7 @@ def check_logdir(args, base_config, restore_best_checkpoint=False): ckpt_dir = os.path.join(logdir, 'logs') else: ckpt_dir = logdir + if args.mode == 'train' or args.mode == 'train_eval': if os.path.isfile(logdir): raise 
IOError("There is a file with the same name as \"logdir\" " @@ -595,11 +596,16 @@ def check_logdir(args, base_config, restore_best_checkpoint=False): elif (args.mode == 'infer' or args.mode == 'eval' or args.mode == 'interactive_infer'): if os.path.isdir(logdir) and os.listdir(logdir) != []: - if restore_best_checkpoint: - deco_print("Restoring from best checkpoint") - checkpoint = tf.train.latest_checkpoint(ckpt_dir + '/best_models') + # if os.path.isdir(logdir) and 'checkpoint' in os.listdir(logdir): + best_ckpt_dir = os.path.join(ckpt_dir, 'best_models') + if restore_best_checkpoint and os.path.isdir(best_ckpt_dir): + deco_print("Restoring from the best checkpoint") + checkpoint = tf.train.latest_checkpoint(best_ckpt_dir) + ckpt_dir = best_ckpt_dir else: + deco_print("Restoring from the latest checkpoint") checkpoint = tf.train.latest_checkpoint(ckpt_dir) + if checkpoint is None: raise IOError( "There is no valid TensorFlow checkpoint in the " @@ -620,6 +626,40 @@ def check_logdir(args, base_config, restore_best_checkpoint=False): return checkpoint +def check_base_model_logdir(base_logdir, restore_best_checkpoint=False): + """A helper function that ensures the logdir is setup correctly + + Args: + args (dict): Dictionary as returned from get_base_config() + base_config (dict): Dictionary as returned from get_base_config() + restore_best_checkpoint (bool): If True, will look for ckpt_dir + /best_models + Returns: + checkpoint: Either None if continue-learning is not set and training, or + the name of the checkpoint used to restore the model + """ + # checking that everything is correct with log directory + if not base_logdir: + return '' + + if (not os.path.isdir(base_logdir)) or len(os.listdir(base_logdir)) == 0: + raise IOError("The log directory for the base model is empty or does not exist.") + + ckpt_dir = os.path.join(base_logdir, 'logs') + if not os.path.isdir(ckpt_dir): + raise IOError("There's no folder 'logs' in the base model logdir. \ + If checkpoints exist, put them in the 'logs' folder.") + + if restore_best_checkpoint and os.path.isdir(os.path.join(ckpt_dir, 'best_models')): + ckpt_dir = os.path.join(ckpt_dir, 'best_models') + + checkpoint = tf.train.latest_checkpoint(ckpt_dir) + if checkpoint is None: + raise IOError( + "There is no valid TensorFlow checkpoint in the \ + {} directory. Can't load model".format(ckpt_dir)) + + return ckpt_dir + def create_logdir(args, base_config): """A helper function that ensures the logdir and log files are setup corretly. Only called in --enable_logs is set. @@ -663,7 +703,7 @@ def create_logdir(args, base_config): return old_stdout, old_stderr, stdout_log, stderr_log -def create_model(args, base_config, config_module, base_model, hvd): +def create_model(args, base_config, config_module, base_model, hvd, restore_best_checkpoint=False): """A helpful function that creates the train, eval, and infer models as needed. 
@@ -749,7 +789,9 @@ def create_model(args, base_config, config_module, base_model, hvd): model.compile(force_var_reuse=False) else: model = base_model(params=infer_config, mode=args.mode, hvd=hvd) - checkpoint = check_logdir(args, base_config) + if base_config['logdir'].endswith('logs'): + base_config['logdir'] = base_config['logdir'][:-5] + checkpoint = check_logdir(args, base_config, restore_best_checkpoint) model.compile(checkpoint=checkpoint, use_trt=args.use_trt, precision=args.precision) return model diff --git a/requirements.txt b/requirements.txt index 7848adc17..7a2fad808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ librosa==0.6.1 matplotlib joblib==0.11 sentencepiece -sacrebleu +sacrebleu \ No newline at end of file diff --git a/run.py b/run.py index 804ea3353..cba148e2d 100644 --- a/run.py +++ b/run.py @@ -8,7 +8,7 @@ import sys import tensorflow as tf from open_seq2seq.utils.utils import deco_print, get_base_config, check_logdir,\ - create_logdir, create_model + create_logdir, create_model, check_base_model_logdir from open_seq2seq.utils import train, infer, evaluate def main(): @@ -34,6 +34,15 @@ def main(): else: hvd = None + load_model = base_config.get('load_model', None) + restore_best_checkpoint = base_config.get('restore_best_checkpoint', False) + + + base_ckpt_dir = check_base_model_logdir(load_model, restore_best_checkpoint) + base_config['load_model'] = base_ckpt_dir + + # Check logdir and create it if necessary + checkpoint = check_logdir(args, base_config, restore_best_checkpoint) if args.enable_logs: if hvd is None or hvd.rank() == 0: @@ -46,7 +55,10 @@ def main(): if args.mode == 'train' or args.mode == 'train_eval' or args.benchmark: if hvd is None or hvd.rank() == 0: if checkpoint is None or args.benchmark: - deco_print("Starting training from scratch") + if base_ckpt_dir: + deco_print("Starting training from the base model") + else: + deco_print("Starting training from scratch") else: deco_print( "Restored checkpoint from {}. 
Resuming training".format(checkpoint), @@ -57,7 +69,7 @@ def main(): # Create model and train/eval/infer with tf.Graph().as_default(): - model = create_model(args, base_config, config_module, base_model, hvd) + model = create_model(args, base_config, config_module, base_model, hvd, restore_best_checkpoint) if args.mode == "train_eval": train(model[0], model[1], debug_port=args.debug_port) elif args.mode == "train": diff --git a/scripts/get_best_accuracy.py b/scripts/get_best_accuracy.py new file mode 100644 index 000000000..87d040aaa --- /dev/null +++ b/scripts/get_best_accuracy.py @@ -0,0 +1,46 @@ +''' +Return the best evaluation accuracy from a file +output-ed by the sentiment analysis model +''' +import sys + +def get_best_accuracy(output_file): + output = open(output_file, 'r') + keyword = "*** EVAL Accuracy: " + best_acc = 0.0 + loss, stat, step = '', '', '' + get_stat = False + n = len(keyword) + m = len("*** Validation loss: ") + last = '' + get_step = False + for line in output.readlines(): + line = line.strip() + if get_stat: + stat = line + get_stat = False + get_step = True + elif get_step: + step = line + get_step = False + else: + idx = line.find(keyword) + if idx != -1: + acc = float(line[n:]) + if acc > best_acc: + best_acc = acc + loss = last + get_stat = True + last = line + + + print("*** Best accuracy:", str(best_acc)) + print(loss) + print(stat) + print(step) + +if __name__ == '__main__': + if len(sys.argv) < 2: + raise ValueError('No output file provided to analyze') + output_file = sys.argv[1] + get_best_accuracy(output_file) \ No newline at end of file diff --git a/scripts/multi-bleu.perl b/scripts/multi-bleu.perl old mode 100644 new mode 100755