From 7c381e393b1fc2ed8874a5e8f726ea9683149680 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Tue, 5 Feb 2019 16:24:10 -0800
Subject: [PATCH] add flake8 config from pytorch and format the code

---
 .flake8                                    |  4 +
 accuracy/model_accuracy.py                 |  3 +-
 plot/main.py                               | 47 ++++++-----
 rnns/benchmarks/benchmark_common.py        |  5 +-
 rnns/benchmarks/bnlstm.py                  | 15 ++--
 rnns/benchmarks/common.py                  |  2 +-
 rnns/benchmarks/cudnn_lstm.py              | 28 +++----
 rnns/benchmarks/lstm.py                    | 53 +++++++-----
 rnns/benchmarks/lstm_variants/container.py |  3 +-
 rnns/benchmarks/lstm_variants/lstm.py      | 25 +++---
 rnns/benchmarks/lstm_variants/normalize.py |  2 +-
 rnns/benchmarks/memnn.py                   | 10 +--
 rnns/benchmarks/mlstm.py                   | 39 ++++-----
 rnns/benchmarks/models/bnlstm.py           | 10 +--
 rnns/benchmarks/models/memnn.py            | 14 ++--
 rnns/benchmarks/models/mlstm.py            |  3 +
 rnns/benchmarks/nlp.py                     | 26 +++---
 rnns/benchmarks/qrnn.py                    | 20 ++---
 rnns/benchmarks/rnn.py                     |  6 +-
 rnns/benchmarks/sequence_labeler.py        | 18 ++--
 rnns/benchmarks/sru.py                     | 98 +++++++++++-----------
 rnns/benchmarks/sru_test.py                | 32 +++----
 rnns/benchmarks/torchqrnn/forget_mult.py   | 18 ++--
 rnns/benchmarks/torchqrnn/qrnn.py          | 47 +++++++----
 rnns/fastrnns/bench.py                     |  2 +-
 rnns/fastrnns/factory.py                   |  5 +-
 rnns/fastrnns/scratch.py                   |  4 +-
 rnns/runner.py                             | 10 +--
 run.py                                     |  9 +-
 setup/bench_conf.py                        |  8 +-
 timing/python/benchmarks/__init__.py       |  1 -
 timing/python/benchmarks/misc/mobilenet.py |  4 +-
 timing/python/framework/maybe_garbage.py   | 55 ++++++------
 33 files changed, 344 insertions(+), 282 deletions(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..1c753d1568
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 120
+ignore = E203,E305,E402,E721,E741,F401,F403,F405,F821,F841,F999,W503,W504
+exclude = third_party
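
The config above mirrors pytorch's flake8 setup. A rough sketch of how it gets exercised locally (the invocation below is an assumption, not something this patch adds):

    # Lint the tree; flake8 discovers the .flake8 file at the repository root.
    # Assumes flake8 is installed in the environment.
    import subprocess

    proc = subprocess.run(['flake8', '.'], capture_output=True, text=True)
    print(proc.stdout or 'clean')
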
diff --git a/accuracy/model_accuracy.py b/accuracy/model_accuracy.py
index c415c3a1ce..232182c107 100755
--- a/accuracy/model_accuracy.py
+++ b/accuracy/model_accuracy.py
@@ -6,7 +6,6 @@
 from datetime import datetime
 import logging
 from tqdm import tqdm
-import os
 import gc
 import itertools
 import sys
@@ -15,7 +14,6 @@
 import torchvision.models as models
 
-
 parser = argparse.ArgumentParser(description="PyTorch model accuracy benchmark.")
 parser.add_argument('--repeat', type=int, default=5, help="Number of Runs")
@@ -90,6 +88,7 @@ def cmd_string(examples_home, model, data_path):
     cmd = ' '.join(['python3', examples_home, '-a', model, '--lr', str(lr), data_path])
     return cmd
 
+
 def log_init():
     if not os.path.exists(temp_dir):
         os.makedirs(temp_dir)
diff --git a/plot/main.py b/plot/main.py
index 8d363e6956..85f6d4f5e0 100644
--- a/plot/main.py
+++ b/plot/main.py
@@ -14,13 +14,14 @@
 HERE = os.path.dirname(os.path.abspath(__file__))
 MAX_BENCHES = 160
 BENCH_TIMES = 4
-BENCH_EVERY = 10 # th commit
+BENCH_EVERY = 10  # th commit
 
 run = partial(subprocess.check_call, cwd=REPO_DIR)
 run_with_output = partial(subprocess.check_output, cwd=REPO_DIR)
 run_toplevel = subprocess.check_call
 silent = dict(stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+
 def fetch_repo():
     if os.path.exists(REPO_DIR):
         print('> Pulling new changes...')
@@ -30,6 +31,7 @@ def fetch_repo():
         print('> Cloning repository...')
         run_toplevel(['git', 'clone', '--recursive', REPO_URL, REPO_DIR], **silent)
 
+
 def get_history():
     # git log --format='%H %an %ae %at' -n
     fields = [
@@ -56,11 +58,11 @@ def build(commit_hash):
     start = time.time()
     cname = container_name(commit_hash)
     run(['docker', 'run',
-         '--runtime=nvidia',
-         '-v', os.path.join(HERE, '..') + ':/mnt/localdrive',
-         '--name', cname,
-         '-t', 'pytorch_bench',
-         '/bin/bash', '/mnt/localdrive/timing/python/install_pytorch.sh', commit_hash], **silent)
+         '--runtime=nvidia',
+         '-v', os.path.join(HERE, '..') + ':/mnt/localdrive',
+         '--name', cname,
+         '-t', 'pytorch_bench',
+         '/bin/bash', '/mnt/localdrive/timing/python/install_pytorch.sh', commit_hash], **silent)
     run(['docker', 'commit', cname, cname], **silent)
     end = time.time()
     diff = int(end - start)
@@ -78,16 +80,16 @@ def run_benchmark(commit_hash, args, **kwargs):
     BENCH_CPUS = '0-11'
     BENCH_MEMS = '0'
     return run_with_output(['docker', 'run',
-                            '--cap-add=SYS_PTRACE',
-                            '--runtime=nvidia',
-                            '--security-opt',
-                            'seccomp=unconfined',
-                            '-v', os.path.join(HERE, '..') + ':/mnt/localdrive',
-                            '-w', '/mnt/localdrive',
-                            '--cpuset-cpus=' + BENCH_CPUS,
-                            '--cpuset-mems=' + BENCH_MEMS,
-                            '-t', container_name(commit_hash),
-                            *args], **kwargs).decode('utf8')
+                            '--cap-add=SYS_PTRACE',
+                            '--runtime=nvidia',
+                            '--security-opt',
+                            'seccomp=unconfined',
+                            '-v', os.path.join(HERE, '..') + ':/mnt/localdrive',
+                            '-w', '/mnt/localdrive',
+                            '--cpuset-cpus=' + BENCH_CPUS,
+                            '--cpuset-mems=' + BENCH_MEMS,
+                            '-t', container_name(commit_hash),
+                            *args], **kwargs).decode('utf8')
 
 
 def load_results():
@@ -137,12 +139,14 @@ def merge_into(original, new):
         else:
             original[key] = new[key]
 
+
 def print_plan(to_bench):
     if not to_bench:
         print('> Nothing to do!')
         return
     print('> Building {} commits:'.format(len(to_bench)))
-    print('\n'.join(' - {} from {}'.format(result['hash'], datetime.fromtimestamp(result['commit_time'])) for result in to_bench))
+    print('\n'.join(' - {} from {}'.format(result['hash'],
+                                           datetime.fromtimestamp(result['commit_time'])) for result in to_bench))
 
 
 BENCHMARKS = [
 ]
 
 # List[Dict[Dict[Int]]] -> Dict[Dict[List[Int]]]
+
+
 def transpose_results(results):
     def get_keys(result):
         return sorted([(outer, inner) for outer in result for inner in result[outer]])
     keys = get_keys(results[0])
     assert all(get_keys(result) == keys for result in results)
     any_result = results[0]
-    return {outer: {inner: [result[outer][inner] for result in results] for inner in any_result[outer]} for outer in any_result}
+    return {outer: {inner: [result[outer][inner] for result in results] for inner in any_result[outer]}
+            for outer in any_result}
 
 
 def result_stats(result):
     def mean(l):
         return sum(l) / len(l)
+
     def std(l):
         m = mean(l)
         return math.sqrt(sum([(v - m) ** 2 for v in l]) / len(l))
-    return {outer: {inner: (mean(innerv), std(innerv)) for inner, innerv in outerv.items()} for outer, outerv in result.items()}
+    return {outer: {inner: (mean(innerv), std(innerv)) for inner, innerv in outerv.items()}
+            for outer, outerv in result.items()}
 
 
 if __name__ == '__main__':
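
For orientation, a tiny sketch of what the two helpers above compute on toy data (names taken from plot/main.py; std() is read as a population standard deviation):

    # transpose_results: List[Dict[Dict[Int]]] -> Dict[Dict[List[Int]]]
    runs = [{'lstm': {'gpu_msecs': 10}}, {'lstm': {'gpu_msecs': 12}}]
    by_key = transpose_results(runs)   # {'lstm': {'gpu_msecs': [10, 12]}}
    stats = result_stats(by_key)       # {'lstm': {'gpu_msecs': (11.0, 1.0)}}, i.e. (mean, std)
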
diff --git a/rnns/benchmarks/benchmark_common.py b/rnns/benchmarks/benchmark_common.py
index be9cfef956..34b412d44c 100644
--- a/rnns/benchmarks/benchmark_common.py
+++ b/rnns/benchmarks/benchmark_common.py
@@ -4,6 +4,7 @@
 
 # Copied and pasted from benchmark_common under benchmark/scripts
 
+
 def benchmark_init(cpu, gpu, skip_cpu_governor_check=False):
     cpu_pin(cpu)
     if not skip_cpu_governor_check:
@@ -32,5 +33,7 @@ def check_cpu_governor(cpu):
                        "The file '{}' is not readable.\n"
                        "More information:\n\n{}".format(fp, e))
 
+
 def print_results_usecs(name, i, gpu_usecs, cpu_usecs, divide_by):
-    print("{}({:2d}): {:8.3f} usecs ({:8.3f} usecs cpu)".format(name, i, gpu_usecs/divide_by, cpu_usecs/divide_by, file=sys.stderr))
+    print("{}({:2d}): {:8.3f} usecs ({:8.3f} usecs cpu)".format(
+        name, i, gpu_usecs / divide_by, cpu_usecs / divide_by), file=sys.stderr)
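
check_cpu_governor above reads the standard Linux cpufreq sysfs entry; a minimal sketch of the check it performs (the path is the usual cpufreq location, and the remediation command is an assumption):

    # Benchmarks expect the `performance` governor so CPU frequency scaling does not add noise.
    fp = '/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'
    with open(fp) as f:
        assert f.read().strip() == 'performance', 'try: sudo cpupower frequency-set -g performance'
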
diff --git a/rnns/benchmarks/bnlstm.py b/rnns/benchmarks/bnlstm.py
index d0268b40dc..4b84dc7d3c 100644
--- a/rnns/benchmarks/bnlstm.py
+++ b/rnns/benchmarks/bnlstm.py
@@ -56,7 +56,6 @@ def cast(tensor):
         model.cuda()
         criterion.cuda()
 
-
     total_loss = 0
     for data, targets in zip(data_batches, target_batches):
         gc.collect()
@@ -71,13 +70,13 @@ def cast(tensor):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="PyTorch BNLSTM benchmark.")
-    parser.add_argument('--num_batches', type=int, default=1, help="num batches")
-    parser.add_argument('--hidden-size', type=int, default=100, help="Hidden size")
-    parser.add_argument('--max-length', type=int, default=784, help="max seq len")
-    parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations")
-    parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations")
-    parser.add_argument('--jit', action='store_true', help="Use JIT")
-    parser.add_argument('--cuda', action='store_true', help="Use cuda")
+    parser.add_argument('--num_batches', type=int, default=1, help="num batches")
+    parser.add_argument('--hidden-size', type=int, default=100, help="Hidden size")
+    parser.add_argument('--max-length', type=int, default=784, help="max seq len")
+    parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations")
+    parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations")
+    parser.add_argument('--jit', action='store_true', help="Use JIT")
+    parser.add_argument('--cuda', action='store_true', help="Use cuda")
 
     args = parser.parse_args()
     pprint.pprint(vars(args))
diff --git a/rnns/benchmarks/common.py b/rnns/benchmarks/common.py
index 7a2579ce11..ade1bc7850 100644
--- a/rnns/benchmarks/common.py
+++ b/rnns/benchmarks/common.py
@@ -84,7 +84,7 @@ def summary(self):
         assert not self.timing
 
         def mean_min_max(lst):
-            return SummaryStats(sum(lst)/len(lst), min(lst), max(lst))
+            return SummaryStats(sum(lst) / len(lst), min(lst), max(lst))
 
         gpu_msecs, cpu_msecs = zip(*self.results)
         warmup = self.warmup_iters
diff --git a/rnns/benchmarks/cudnn_lstm.py b/rnns/benchmarks/cudnn_lstm.py
index 05a01ae734..032719c030 100644
--- a/rnns/benchmarks/cudnn_lstm.py
+++ b/rnns/benchmarks/cudnn_lstm.py
@@ -23,13 +23,12 @@ def run_cudnn_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
 
     benchmark_init(cpu, gpu, skip_cpu_governor_check)
 
-
     def V(x):
         return Variable(x)  # mandatory
 
     input = V(torch.randn(seq_len, batch_size, input_size).cuda(gpu))
-    hx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
-    cx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
+    hx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
+    cx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
 
     lstm = torch.nn.LSTM(input_size, hidden_size, layers).cuda(gpu)
     lstm.flatten_parameters()
@@ -48,17 +47,18 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="PyTorch CuDNN LSTM benchmark.")
-    parser.add_argument('--cpu', type=int, default=0, help="CPU to run on")
-    parser.add_argument('--gpu', type=int, default=0, help="GPU to run on")
-    parser.add_argument('--batch-size', type=int, default=1, help="Batch size")
-    parser.add_argument('--input-size', type=int, default=256, help="Input size")
-    parser.add_argument('--hidden-size', type=int, default=512, help="Hidden size")
-    parser.add_argument('--layers', type=int, default=1, help="Layers")
-    parser.add_argument('--seq-len', type=int, default=512, help="Sequence length")
-    parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations")
-    parser.add_argument('--benchmark', type=int, default=30, help="Benchmark iterations")
-    parser.add_argument('--skip-cpu-governor-check', action='store_true', help="Skip checking whether CPU governor is set to `performance`")
-    parser.add_argument('--backward', action='store_true', help="time backward")
+    parser.add_argument('--cpu', type=int, default=0, help="CPU to run on")
+    parser.add_argument('--gpu', type=int, default=0, help="GPU to run on")
+    parser.add_argument('--batch-size', type=int, default=1, help="Batch size")
+    parser.add_argument('--input-size', type=int, default=256, help="Input size")
+    parser.add_argument('--hidden-size', type=int, default=512, help="Hidden size")
+    parser.add_argument('--layers', type=int, default=1, help="Layers")
+    parser.add_argument('--seq-len', type=int, default=512, help="Sequence length")
+    parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations")
+    parser.add_argument('--benchmark', type=int, default=30, help="Benchmark iterations")
+    parser.add_argument('--skip-cpu-governor-check', action='store_true',
+                        help="Skip checking whether CPU governor is set to `performance`")
+    parser.add_argument('--backward', action='store_true', help="time backward")
 
     args = parser.parse_args()
     pprint.pprint(vars(args))
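
A hedged usage sketch for the benchmark above, mirroring how rnns/runner.py imports it (the keyword values are just the argparse defaults from this hunk):

    # Drive the cuDNN LSTM benchmark directly instead of via the CLI.
    from benchmarks.cudnn_lstm import run_cudnn_lstm  # import path as used by rnns/runner.py

    run_cudnn_lstm(batch_size=1, input_size=256, hidden_size=512, layers=1,
                   seq_len=512, warmup=10, benchmark=30, skip_cpu_governor_check=True)
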
"__main__": parser = argparse.ArgumentParser(description="PyTorch LSTM benchmark.") - parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") - parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") - parser.add_argument('--batch-size', type=int, default=1, help="Batch size") - parser.add_argument('--input-size', type=int, default=256, help="Input size") - parser.add_argument('--hidden-size', type=int, default=512, help="Hidden size") - parser.add_argument('--seq-len', type=int, default=None, help="Sequence length") - parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") - parser.add_argument('--autograd', action='store_true', help="Use autograd") - parser.add_argument('--variable', action='store_true', help="Use Variable, but not autograd (measure baseline overhead)") - parser.add_argument('--fused', action='store_true', help="Use fused cell") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler (implies --autograd)") - parser.add_argument('--backward', action='store_true', help="Run backwards computation") - parser.add_argument('--skip-cpu-governor-check', action='store_true', help="Skip checking whether CPU governor is set to `performance`") + parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") + parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") + parser.add_argument('--batch-size', type=int, default=1, help="Batch size") + parser.add_argument('--input-size', type=int, default=256, help="Input size") + parser.add_argument('--hidden-size', type=int, default=512, help="Hidden size") + parser.add_argument('--seq-len', type=int, default=None, help="Sequence length") + parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") + parser.add_argument('--autograd', action='store_true', help="Use autograd") + parser.add_argument('--variable', action='store_true', + help="Use Variable, but not autograd (measure baseline overhead)") + parser.add_argument('--fused', action='store_true', help="Use fused cell") + parser.add_argument('--jit', action='store_true', help="Use JIT compiler (implies --autograd)") + parser.add_argument('--backward', action='store_true', help="Run backwards computation") + parser.add_argument('--skip-cpu-governor-check', action='store_true', + help="Skip checking whether CPU governor is set to `performance`") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/lstm_variants/container.py b/rnns/benchmarks/lstm_variants/container.py index e94138453a..7aa4bc6658 100644 --- a/rnns/benchmarks/lstm_variants/container.py +++ b/rnns/benchmarks/lstm_variants/container.py @@ -17,7 +17,7 @@ class MultiLayerLSTM(nn.Module): """ MultiLayer LSTM of any type. - + Note: Dropout is deactivated on the last layer. 
""" @@ -64,4 +64,3 @@ def forward(self, x, hiddens): x, new_h = l(x, h) new_hiddens.append(new_h) return x, new_hiddens - diff --git a/rnns/benchmarks/lstm_variants/lstm.py b/rnns/benchmarks/lstm_variants/lstm.py index 118746d911..6b3ef7adcc 100644 --- a/rnns/benchmarks/lstm_variants/lstm.py +++ b/rnns/benchmarks/lstm_variants/lstm.py @@ -198,8 +198,8 @@ def forward(self, x, hidden): c_t = th.mul(c, f_t) + th.mul(i_t, g_t) if do_dropout and self.dropout_method == 'moon': - c_t.data.set_(th.mul(c_t, self.mask).data) - c_t.data *= 1.0/(1.0 - self.dropout) + c_t.data.set_(th.mul(c_t, self.mask).data) + c_t.data *= 1.0 / (1.0 - self.dropout) h_t = th.mul(o_t, c_t.tanh()) @@ -208,8 +208,8 @@ def forward(self, x, hidden): if self.dropout_method == 'pytorch': F.dropout(h_t, p=self.dropout, training=self.training, inplace=True) if self.dropout_method == 'gal': - h_t.data.set_(th.mul(h_t, self.mask).data) - h_t.data *= 1.0/(1.0 - self.dropout) + h_t.data.set_(th.mul(h_t, self.mask).data) + h_t.data *= 1.0 / (1.0 - self.dropout) h_t = h_t.view(1, h_t.size(0), -1) c_t = c_t.view(1, c_t.size(0), -1) @@ -237,6 +237,7 @@ class MoonLSTM(LSTM): 'RNNDrop: A Novel Dropout for RNNs in ASR' https://www.stat.berkeley.edu/~tsmoon/files/Conference/asru2015.pdf """ + def __init__(self, *args, **kwargs): super(MoonLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'moon' @@ -249,6 +250,7 @@ class SemeniutaLSTM(LSTM): 'Recurrent Dropout without Memory Loss' https://arxiv.org/pdf/1603.05118.pdf """ + def __init__(self, *args, **kwargs): super(SemeniutaLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'semeniuta' @@ -275,8 +277,8 @@ def __init__(self, input_size, hidden_size, bias=True, dropout=0.0, dropout=dropout, dropout_method=dropout_method) if ln_preact: - self.ln_i2h = LayerNorm(4*hidden_size, learnable=learnable) - self.ln_h2h = LayerNorm(4*hidden_size, learnable=learnable) + self.ln_i2h = LayerNorm(4 * hidden_size, learnable=learnable) + self.ln_h2h = LayerNorm(4 * hidden_size, learnable=learnable) self.ln_preact = ln_preact self.ln_cell = LayerNorm(hidden_size, learnable=learnable) @@ -309,8 +311,8 @@ def forward(self, x, hidden): c_t = th.mul(c, f_t) + th.mul(i_t, g_t) if do_dropout and self.dropout_method == 'moon': - c_t.data.set_(th.mul(c_t, self.mask).data) - c_t.data *= 1.0/(1.0 - self.dropout) + c_t.data.set_(th.mul(c_t, self.mask).data) + c_t.data *= 1.0 / (1.0 - self.dropout) c_t = self.ln_cell(c_t) h_t = th.mul(o_t, c_t.tanh()) @@ -320,8 +322,8 @@ def forward(self, x, hidden): if self.dropout_method == 'pytorch': F.dropout(h_t, p=self.dropout, training=self.training, inplace=True) if self.dropout_method == 'gal': - h_t.data.set_(th.mul(h_t, self.mask).data) - h_t.data *= 1.0/(1.0 - self.dropout) + h_t.data.set_(th.mul(h_t, self.mask).data) + h_t.data *= 1.0 / (1.0 - self.dropout) h_t = h_t.view(1, h_t.size(0), -1) c_t = c_t.view(1, c_t.size(0), -1) @@ -333,6 +335,7 @@ class LayerNormGalLSTM(LayerNormLSTM): """ Mixes GalLSTM's Dropout with Layer Normalization """ + def __init__(self, *args, **kwargs): super(LayerNormGalLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'gal' @@ -344,6 +347,7 @@ class LayerNormMoonLSTM(LayerNormLSTM): """ Mixes MoonLSTM's Dropout with Layer Normalization """ + def __init__(self, *args, **kwargs): super(LayerNormMoonLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'moon' @@ -355,6 +359,7 @@ class LayerNormSemeniutaLSTM(LayerNormLSTM): """ Mixes SemeniutaLSTM's Dropout with Layer Normalization """ + def __init__(self, *args, 
**kwargs): super(LayerNormSemeniutaLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'semeniuta' diff --git a/rnns/benchmarks/lstm_variants/normalize.py b/rnns/benchmarks/lstm_variants/normalize.py index 161d204e2a..b92afcebc9 100644 --- a/rnns/benchmarks/lstm_variants/normalize.py +++ b/rnns/benchmarks/lstm_variants/normalize.py @@ -48,5 +48,5 @@ def forward(self, x): x = x.view(x.size(0), -1) x = (x - th.mean(x, 1).unsqueeze(1)) / th.sqrt(th.var(x, 1).unsqueeze(1) + self.epsilon) if self.learnable: - x = self.alpha.expand_as(x) * x + self.beta.expand_as(x) + x = self.alpha.expand_as(x) * x + self.beta.expand_as(x) return x.view(size) diff --git a/rnns/benchmarks/memnn.py b/rnns/benchmarks/memnn.py index 040489bcbc..dd4a0bad41 100644 --- a/rnns/benchmarks/memnn.py +++ b/rnns/benchmarks/memnn.py @@ -44,7 +44,7 @@ def run_memnn(warmup=2, benchmark=18, jit=False, cuda=False): [ # memories, queries, memory_lengths, query_lengths torch.zeros(params.batch_size * params.mem_size, dtype=torch.long, device=device), torch.zeros(params.batch_size * 28 , dtype=torch.long, device=device), - torch.ones (params.batch_size, params.mem_size , dtype=torch.long, device=device), + torch.ones(params.batch_size, params.mem_size , dtype=torch.long, device=device), torch.full((params.batch_size,), 28 , dtype=torch.long, device=device), ] for _ in range(params.num_batches) @@ -87,10 +87,10 @@ def run_memnn(warmup=2, benchmark=18, jit=False, cuda=False): if __name__ == '__main__': parser = argparse.ArgumentParser(description="PyTorch memnn bench") - parser.add_argument('--warmup', type=int, default=2, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=10, help="Benchmark iterations") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler") - parser.add_argument('--cuda', action='store_true', help="use cuda") + parser.add_argument('--warmup', type=int, default=2, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=10, help="Benchmark iterations") + parser.add_argument('--jit', action='store_true', help="Use JIT compiler") + parser.add_argument('--cuda', action='store_true', help="use cuda") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/mlstm.py b/rnns/benchmarks/mlstm.py index 2900afb8cc..e17acc0413 100644 --- a/rnns/benchmarks/mlstm.py +++ b/rnns/benchmarks/mlstm.py @@ -52,12 +52,12 @@ def run_mlstm(cpu=0, gpu=0, batch_size=1, input_size=205, hidden_size=1900, embe device = torch.device(gpu) input = torch.randn(seq_len, batch_size, input_size, requires_grad=requires_grad, device=device) - hx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device) - cx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device) - w_xm = torch.randn(embed_size, input_size, requires_grad=requires_grad, device=device) - w_hm = torch.randn(embed_size, hidden_size, requires_grad=requires_grad, device=device) - w_ih = torch.randn(4 * hidden_size, input_size, requires_grad=requires_grad, device=device) - w_mh = torch.randn(4 * hidden_size, embed_size, requires_grad=requires_grad, device=device) + hx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device) + cx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device) + w_xm = torch.randn(embed_size, input_size, requires_grad=requires_grad, device=device) + w_hm = torch.randn(embed_size, hidden_size, requires_grad=requires_grad, device=device) + w_ih = torch.randn(4 * 
hidden_size, input_size, requires_grad=requires_grad, device=device) + w_mh = torch.randn(4 * hidden_size, embed_size, requires_grad=requires_grad, device=device) params = [input, hx, cx, w_xm, w_hm, w_ih, w_mh] if jit: @@ -81,19 +81,20 @@ def run_mlstm(cpu=0, gpu=0, batch_size=1, input_size=205, hidden_size=1900, embe if __name__ == "__main__": parser = argparse.ArgumentParser(description="PyTorch mLSTM benchmark.") - parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") - parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") - parser.add_argument('--batch-size', type=int, default=1, help="Batch size") - parser.add_argument('--input-size', type=int, default=205, help="Input size") - parser.add_argument('--hidden-size', type=int, default=1900, help="Hidden size") - parser.add_argument('--embed-size', type=int, default=None, help="Embed size") - parser.add_argument('--seq-len', type=int, default=20, help="Sequence length") - parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") - parser.add_argument('--autograd', action='store_true', help="Use autograd") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler (implies --autograd)") - parser.add_argument('--backward', action='store_true', help="benchmark forward + backward (implies --autograd)") - parser.add_argument('--skip-cpu-governor-check', action='store_true', help="Skip checking whether CPU governor is set to `performance`") + parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") + parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") + parser.add_argument('--batch-size', type=int, default=1, help="Batch size") + parser.add_argument('--input-size', type=int, default=205, help="Input size") + parser.add_argument('--hidden-size', type=int, default=1900, help="Hidden size") + parser.add_argument('--embed-size', type=int, default=None, help="Embed size") + parser.add_argument('--seq-len', type=int, default=20, help="Sequence length") + parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") + parser.add_argument('--autograd', action='store_true', help="Use autograd") + parser.add_argument('--jit', action='store_true', help="Use JIT compiler (implies --autograd)") + parser.add_argument('--backward', action='store_true', help="benchmark forward + backward (implies --autograd)") + parser.add_argument('--skip-cpu-governor-check', action='store_true', + help="Skip checking whether CPU governor is set to `performance`") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/models/bnlstm.py b/rnns/benchmarks/models/bnlstm.py index f577fd4d3f..6ffb650223 100644 --- a/rnns/benchmarks/models/bnlstm.py +++ b/rnns/benchmarks/models/bnlstm.py @@ -127,7 +127,7 @@ def forward(self, input_, hx): wi = torch.mm(input_, self.weight_ih) f, i, o, g = torch.split(wh_b + wi, self.hidden_size, dim=1) - c_1 = torch.sigmoid(f)*c_0 + torch.sigmoid(i)*torch.tanh(g) + c_1 = torch.sigmoid(f) * c_0 + torch.sigmoid(i) * torch.tanh(g) h_1 = torch.sigmoid(o) * torch.tanh(c_1) return h_1, c_1 @@ -140,7 +140,7 @@ def __repr__(self): def bnlstm_helper(c_0, bn_wh, bn_wi, bias_batch): f, i, o, g = torch.chunk(bn_wh + bn_wi + bias_batch, chunks=4, dim=1) - c_1 = torch.sigmoid(f)*c_0 + torch.sigmoid(i)*torch.tanh(g) + c_1 = torch.sigmoid(f) 
* c_0 + torch.sigmoid(i) * torch.tanh(g) return c_1, o @@ -230,7 +230,7 @@ def forward(self, input_, hx, time): f, i, o, g = torch.split(bn_wh + bn_wi + bias_batch, self.hidden_size, dim=1) - c_1 = torch.sigmoid(f)*c_0 + torch.sigmoid(i)*torch.tanh(g) + c_1 = torch.sigmoid(f) * c_0 + torch.sigmoid(i) * torch.tanh(g) h_1 = torch.sigmoid(o) * torch.tanh(self.bn_c(c_1, time=time)) return h_1, c_1 @@ -277,8 +277,8 @@ def _forward_rnn(cell, input_, length, hx): else: h_next, c_next = cell(input_=input_[time], hx=hx) mask = (time < length).float().unsqueeze(1).expand_as(h_next) - h_next = h_next*mask + hx[0]*(1 - mask) - c_next = c_next*mask + hx[1]*(1 - mask) + h_next = h_next * mask + hx[0] * (1 - mask) + c_next = c_next * mask + hx[1] * (1 - mask) hx_next = (h_next, c_next) output.append(h_next) hx = hx_next diff --git a/rnns/benchmarks/models/memnn.py b/rnns/benchmarks/models/memnn.py index a2d5e28055..c8ab1afe66 100644 --- a/rnns/benchmarks/models/memnn.py +++ b/rnns/benchmarks/models/memnn.py @@ -24,13 +24,13 @@ def __init__(self, opt, num_features): self.extra_features_slots = 0 if opt['time_features']: self.time_features = torch.LongTensor(range(num_features, - num_features + self.num_time_features)) + num_features + self.num_time_features)) num_features += self.num_time_features self.extra_features_slots += 1 def embedding(): return Embed(num_features, opt['embedding_size'], - position_encoding=opt['position_encoding'], padding_idx=0) + position_encoding=opt['position_encoding'], padding_idx=0) self.query_embedder = embedding() self.answer_embedder = embedding() @@ -83,7 +83,7 @@ def forward(self, memories, queries, memory_lengths, query_lengths): for _ in range(self.opt['hops']): query_embeddings = self.memory_hop(query_embeddings, - in_memory_embeddings, out_memory_embeddings, attention_mask) + in_memory_embeddings, out_memory_embeddings, attention_mask) return query_embeddings @@ -106,7 +106,7 @@ def forward(self, lengths, indices): for i, row in enumerate(lengths_mat): for j, length in enumerate(row): if length > 0: - input[i, j, :length] = indices[offset:offset+length] + input[i, j, :length] = indices[offset:offset + length] offset += length for i, row in enumerate(lengths_mat): @@ -137,11 +137,11 @@ def position_matrix(J, d): # for k in range(1, d+1): # for j in range(1, J+1): # m[j-1, k-1] = (1 - j/J) - (k/d) * (1 - 2 * j/J) - k = torch.arange(d+1, dtype=torch.float)[1:].unsqueeze(0).expand(J, d) - j = torch.arange(J+1, dtype=torch.float)[1:].unsqueeze(1).expand(J, d) + k = torch.arange(d + 1, dtype=torch.float)[1:].unsqueeze(0).expand(J, d) + j = torch.arange(J + 1, dtype=torch.float)[1:].unsqueeze(1).expand(J, d) J = float(J) d = float(d) - out = (1. - j/J) - (k/d) * (1. - 2. * j/J) + out = (1. - j / J) - (k / d) * (1. - 2. * j / J) return out @staticmethod diff --git a/rnns/benchmarks/models/mlstm.py b/rnns/benchmarks/models/mlstm.py index 927a2844a6..989425cea4 100644 --- a/rnns/benchmarks/models/mlstm.py +++ b/rnns/benchmarks/models/mlstm.py @@ -7,6 +7,8 @@ # This is slightly different to the most commonly used LSTM variant, where the output gate is # applied after the hyperbolic tangent. 
+ + def KrauseLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): # Terminology matchup: # - This implementation uses the trick of having all gates concatenated @@ -29,6 +31,7 @@ def KrauseLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): return hy, cy + def MultiplicativeLSTMCell(input, hidden, w_xm, w_hm, w_ih, w_mh, b_xm=None, b_hm=None, b_ih=None, b_mh=None): # w_ih holds W_hx, W_ix, W_ox, W_fx # w_mh holds W_hm, W_im, W_om, W_fm diff --git a/rnns/benchmarks/nlp.py b/rnns/benchmarks/nlp.py index 96069163f1..6b7da1fbd9 100644 --- a/rnns/benchmarks/nlp.py +++ b/rnns/benchmarks/nlp.py @@ -16,15 +16,15 @@ class POSTagger(Benchmark): 18, 18, 31, 26, 5, 29, 16, 19] default_params = dict( - embedding_size = 50, - rnn_size = 51, - hidden_size = 52, - action_embedding_size = 5, - num_input_tokens = 32, - num_labels = 32, - minibatch_size = 5, - preprocess_minibatch = True, - cuda = False) + embedding_size=50, + rnn_size=51, + hidden_size=52, + action_embedding_size=5, + num_input_tokens=32, + num_labels=32, + minibatch_size=5, + preprocess_minibatch=True, + cuda=False) params = make_params(preprocess_minibatch=over(True, False)) def prepare(self, p): @@ -40,10 +40,14 @@ def prepare(self, p): for module in [self.embed_word, self.gru, self.embed_action, self.combine_arh, self.policy, self.loss_fn]: module.cuda() self.LongTensor = torch.cuda.LongTensor - cast = lambda t: t.cuda() + + def cast(t): + return t.cuda() else: self.LongTensor = torch.LongTensor - cast = lambda t: t + + def cast(t): + return t self.cast = cast self.initial_h = Variable(cast(torch.zeros(1, p.hidden_size)), requires_grad=True) diff --git a/rnns/benchmarks/qrnn.py b/rnns/benchmarks/qrnn.py index a079e5d997..1f984df1bb 100644 --- a/rnns/benchmarks/qrnn.py +++ b/rnns/benchmarks/qrnn.py @@ -49,16 +49,16 @@ def run_qrnn(batch_size=20, input_size=128, seq_len=20, if __name__ == "__main__": parser = argparse.ArgumentParser(description="PyTorch qrnn benchmark.") - parser.add_argument('--batch-size', type=int, default=20, help="Batch size") - parser.add_argument('--input-size', type=int, default=128, help="Input size") - parser.add_argument('--hidden-size', type=int, default=256, help="Hidden size") - parser.add_argument('--num-layers', type=int, default=10, help="Hidden size") - parser.add_argument('--seq-len', type=int, default=20, help="Sequence length") - parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") - parser.add_argument('--cuda', action='store_true', help="Use cuda") - parser.add_argument('--use-kernel', action='store_true', help="Use fused cell") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler") + parser.add_argument('--batch-size', type=int, default=20, help="Batch size") + parser.add_argument('--input-size', type=int, default=128, help="Input size") + parser.add_argument('--hidden-size', type=int, default=256, help="Hidden size") + parser.add_argument('--num-layers', type=int, default=10, help="Hidden size") + parser.add_argument('--seq-len', type=int, default=20, help="Sequence length") + parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") + parser.add_argument('--cuda', action='store_true', help="Use cuda") + parser.add_argument('--use-kernel', action='store_true', help="Use fused cell") + parser.add_argument('--jit', action='store_true', 
help="Use JIT compiler") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/rnn.py b/rnns/benchmarks/rnn.py index 4e3c7995c4..82e81a4d15 100644 --- a/rnns/benchmarks/rnn.py +++ b/rnns/benchmarks/rnn.py @@ -5,6 +5,7 @@ # This file is not in use + class WLM(Benchmark): default_params = dict(rnn_type='LSTM', num_tokens=10000, embedding_size=200, hidden_size=200, num_layers=2, batch_size=20, bptt=35, @@ -17,7 +18,8 @@ def get_rnn(): return getattr(nn, p.rnn_type)(p.embedding_size, p.hidden_size, p.num_layers, dropout=p.dropout) else: nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[p.rnn_type] - return nn.RNN(p.embedding_size, p.hidden_size, p.num_layers, nonlinearity=nonlinearity, dropout=p.dropout) + return nn.RNN(p.embedding_size, p.hidden_size, p.num_layers, nonlinearity=nonlinearity, + dropout=p.dropout) class Model(nn.Module): def __init__(self): @@ -54,5 +56,3 @@ def time_word_language_model_example(self, p): total_loss += loss.data # CUDA sync point if p.cuda: torch.cuda.synchronize() - - diff --git a/rnns/benchmarks/sequence_labeler.py b/rnns/benchmarks/sequence_labeler.py index 0e81ae1015..912bd8f76b 100644 --- a/rnns/benchmarks/sequence_labeler.py +++ b/rnns/benchmarks/sequence_labeler.py @@ -10,7 +10,7 @@ - most recent word - the previous action (aka predicted label). - the previous hidden state - + Can it be faster?!?!?!?!?!? (Adapted from https://gist.github.com/hal3/8c170c4400576eb8d0a8bd94ab231232.) @@ -35,12 +35,14 @@ # Assuming this script is being called from the benchmark/rnns dir wsj_default_path = './wsj.pkl' + def reseed(seed=90210): random.seed(seed) torch.manual_seed(seed) reseed() + @torch.jit.script def gru_cell(input_, hidden, w_hh, b_hh): gi = input_ @@ -119,11 +121,13 @@ def __init__(self, tokens, labels, n_labels): self.labels = labels self.n_labels = n_labels + def minibatch(data, minibatch_size, reshuffle): if reshuffle: random.shuffle(data) for n in range(0, len(data), minibatch_size): - yield data[n:n+minibatch_size] + yield data[n:n + minibatch_size] + def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): jit_tag = '_jit' if jit else '' @@ -155,7 +159,7 @@ def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): initial_h_tensor = torch.Tensor(1, d_hid) initial_h_tensor.zero_() initial_h = Parameter(initial_h_tensor) - + initial_actemb_tensor = torch.Tensor(1, d_actemb) initial_actemb_tensor.zero_() initial_actemb = Parameter(initial_actemb_tensor) @@ -178,7 +182,7 @@ def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): total_loss = 0 prof = None with iter_timer: - #with torch.autograd.profiler.profile() as prof: + # with torch.autograd.profiler.profile() as prof: for batch in minibatch(data, minibatch_size, True): optimizer.zero_grad() loss = 0 @@ -197,11 +201,11 @@ def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): all_rnn_out = all_rnn_out.cpu() else: all_rnn_out, _ = gru(all_e) - + for ex in batch: N = len(ex.tokens) if preprocess_minibatch: - rnn_out = all_rnn_out[0,:,:].view(-1, 1, 2 * d_rnn) + rnn_out = all_rnn_out[0, :, :].view(-1, 1, 2 * d_rnn) else: e = embed_word(Variable(torch.LongTensor(ex.tokens), requires_grad=False)).view(N, 1, -1) [rnn_out, _] = gru(e) @@ -238,7 +242,7 @@ def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): print(prof.key_averages()) print(total_loss) return iter_timer - + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--epochs', type=int, default=10, 
diff --git a/rnns/benchmarks/sru.py b/rnns/benchmarks/sru.py index a5147e51df..cd7dc6566c 100644 --- a/rnns/benchmarks/sru.py +++ b/rnns/benchmarks/sru.py @@ -1,4 +1,4 @@ -#from builtins import bytes +# from builtins import bytes import time import numpy as np @@ -10,7 +10,7 @@ from collections import namedtuple -tmp_ = torch.rand(1,1).cuda() +tmp_ = torch.rand(1, 1).cuda() SRU_CODE = """ extern "C" { @@ -336,6 +336,7 @@ Stream = namedtuple('Stream', ['ptr']) SRU_STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream) + class SRU_Compute(Function): def __init__(self, activation_type, d_out, bidirectional=False): @@ -350,13 +351,13 @@ def forward(self, u, x, bias, init=None, mask_h=None): batch = x.size(-2) d = self.d_out k = u.size(-1) // d - k_ = k//2 if self.bidirectional else k - ncols = batch*d*bidir + k_ = k // 2 if self.bidirectional else k + ncols = batch * d * bidir thread_per_block = min(512, ncols) - num_block = (ncols-1)//thread_per_block+1 + num_block = (ncols - 1) // thread_per_block + 1 init_ = x.new(ncols).zero_() if init is None else init - size = (length, batch, d*bidir) if x.dim() == 3 else (batch, d*bidir) + size = (length, batch, d * bidir) if x.dim() == 3 else (batch, d * bidir) c = x.new(*size) h = x.new(*size) FUNC = SRU_FWD_FUNC if not self.bidirectional else SRU_BiFWD_FUNC @@ -373,7 +374,7 @@ def forward(self, u, x, bias, init=None, mask_h=None): h.data_ptr(), c.data_ptr(), self.activation_type], - block = (thread_per_block,1,1), grid = (num_block,1,1), + block=(thread_per_block, 1, 1), grid=(num_block, 1, 1), stream=SRU_STREAM ) @@ -382,7 +383,7 @@ def forward(self, u, x, bias, init=None, mask_h=None): if x.dim() == 2: last_hidden = c elif self.bidirectional: - last_hidden = torch.cat((c[-1,:,:d], c[0,:,d:]), dim=1) + last_hidden = torch.cat((c[-1, :, :d], c[0, :, d:]), dim=1) else: last_hidden = c[-1] return h, last_hidden @@ -395,19 +396,19 @@ def backward(self, grad_h, grad_last): batch = x.size(-2) d = self.d_out k = u.size(-1) // d - k_ = k//2 if self.bidirectional else k - ncols = batch*d*bidir + k_ = k // 2 if self.bidirectional else k + ncols = batch * d * bidir thread_per_block = min(512, ncols) - num_block = (ncols-1)//thread_per_block+1 + num_block = (ncols - 1) // thread_per_block + 1 init_ = x.new(ncols).zero_() if init is None else init grad_u = u.new(*u.size()) - grad_bias = x.new(2, batch, d*bidir) - grad_init = x.new(batch, d*bidir) + grad_bias = x.new(2, batch, d * bidir) + grad_init = x.new(batch, d * bidir) # For DEBUG - #size = (length, batch, x.size(-1)) if x.dim() == 3 else (batch, x.size(-1)) - #grad_x = x.new(*x.size()) if k_ == 3 else x.new(*size).zero_() + # size = (length, batch, x.size(-1)) if x.dim() == 3 else (batch, x.size(-1)) + # grad_x = x.new(*x.size()) if k_ == 3 else x.new(*size).zero_() # Normal use grad_x = x.new(*x.size()) if k_ == 3 else None @@ -431,7 +432,7 @@ def backward(self, grad_h, grad_last): grad_bias.data_ptr(), grad_init.data_ptr(), self.activation_type], - block = (thread_per_block,1,1), grid = (num_block,1,1), + block=(thread_per_block, 1, 1), grid=(num_block, 1, 1), stream=SRU_STREAM ) return grad_u, grad_x, grad_bias.sum(1).view(-1), grad_init, None @@ -444,7 +445,7 @@ def forward(u, x, bias, init=None, mask_h=None): batch = x.size(-2) d = d_out k = u.size(-1) // d - k_ = k//2 if bidirectional else k + k_ = k // 2 if bidirectional else k u = u.view(length, batch, d, k_) @@ -458,15 +459,15 @@ def forward(u, x, bias, init=None, mask_h=None): u0i, u1i, u2i = u_[0][i], u_[1][i], u_[2][i] g1 = 
torch.sigmoid(u1i + bias1) g2 = torch.sigmoid(u2i + bias2) - cur = (cur - u0i)*g1 + u0i + cur = (cur - u0i) * g1 + u0i if activation_type == 1: val = torch.tanh(cur) elif activation_type == 2: val = torch.relu(cur) if mask_h is not None: - val = val*mask_h + val = val * mask_h xi = x_[i] - h.append((val - xi)*g2 + xi) + h.append((val - xi) * g2 + xi) if bidirectional: assert False @@ -490,15 +491,15 @@ def __init__(self, n_in, n_out, dropout=0, rnn_dropout=0, self.bidirectional = bidirectional self.activation_type = 2 if use_relu else (1 if use_tanh else 0) self.use_kernel = use_kernel - out_size = n_out*2 if bidirectional else n_out + out_size = n_out * 2 if bidirectional else n_out k = 4 if n_in != out_size else 3 - self.size_per_dir = n_out*k + self.size_per_dir = n_out * k self.weight = nn.Parameter(torch.Tensor( n_in, - self.size_per_dir*2 if bidirectional else self.size_per_dir + self.size_per_dir * 2 if bidirectional else self.size_per_dir )) self.bias = nn.Parameter(torch.Tensor( - n_out*4 if bidirectional else n_out*2 + n_out * 4 if bidirectional else n_out * 2 )) self.init_weight() self.jit = jit @@ -510,14 +511,14 @@ def __init__(self, n_in, n_out, dropout=0, rnn_dropout=0, self.sru_jit_traced = False def init_weight(self): - val_range = (3.0/self.n_in)**0.5 + val_range = (3.0 / self.n_in)**0.5 self.weight.data.uniform_(-val_range, val_range) self.bias.data.zero_() def set_bias(self, bias_val=0): n_out = self.n_out if self.bidirectional: - self.bias.data[n_out*2:].zero_().add_(bias_val) + self.bias.data[n_out * 2:].zero_().add_(bias_val) else: self.bias.data[n_out:].zero_().add_(bias_val) @@ -527,10 +528,10 @@ def forward(self, input, c0=None): batch = input.size(-2) if c0 is None: c0 = Variable(input.data.new( - batch, n_out if not self.bidirectional else n_out*2 + batch, n_out if not self.bidirectional else n_out * 2 ).zero_()) - if self.training and (self.rnn_dropout>0): + if self.training and (self.rnn_dropout > 0): mask = self.get_dropout_mask_((batch, n_in), self.rnn_dropout) x = input * mask.expand_as(input) else: @@ -539,9 +540,9 @@ def forward(self, input, c0=None): x_2d = x if x.dim() == 2 else x.contiguous().view(-1, n_in) u = x_2d.mm(self.weight) - if self.training and (self.dropout>0): + if self.training and (self.dropout > 0): bidir = 2 if self.bidirectional else 1 - mask_h = self.get_dropout_mask_((batch, n_out*bidir), self.dropout) + mask_h = self.get_dropout_mask_((batch, n_out * bidir), self.dropout) if self.use_kernel: h, c = SRU_Compute(self.activation_type, n_out, self.bidirectional)(u, input, self.bias, c0, mask_h) return h, c @@ -556,12 +557,13 @@ def forward(self, input, c0=None): h, c = self.sru_jit(u, input, self.bias, c0, mask_h) return h, c - h, c = SRU_Compute_No_Kernel(self.activation_type, n_out, self.bidirectional)(u, input, self.bias, c0, mask_h) + h, c = SRU_Compute_No_Kernel(self.activation_type, n_out, self.bidirectional)( + u, input, self.bias, c0, mask_h) else: if self.use_kernel: h, c = SRU_Compute(self.activation_type, n_out, self.bidirectional)(u, input, self.bias, c0) return h, c - + if self.jit: if not self.sru_jit_traced: print("Tracing sru cell without dropout") @@ -577,13 +579,13 @@ def forward(self, input, c0=None): def get_dropout_mask_(self, size, p): w = self.weight.data - return Variable(w.new(*size).bernoulli_(1-p).div_(1-p)) + return Variable(w.new(*size).bernoulli_(1 - p).div_(1 - p)) class SRU(nn.Module): def __init__(self, input_size, hidden_size, num_layers=2, dropout=0, rnn_dropout=0, - bidirectional=False, use_tanh=1, 
use_relu=0, use_kernel=True, - jit=False): + bidirectional=False, use_tanh=1, use_relu=0, use_kernel=True, + jit=False): super(SRU, self).__init__() self.n_in = input_size self.n_out = hidden_size @@ -593,18 +595,18 @@ def __init__(self, input_size, hidden_size, num_layers=2, dropout=0, rnn_dropout self.rnn_lst = nn.ModuleList() self.bidirectional = bidirectional self.use_kernel = use_kernel - self.out_size = hidden_size*2 if bidirectional else hidden_size + self.out_size = hidden_size * 2 if bidirectional else hidden_size for i in range(num_layers): l = SRUCell( - n_in = self.n_in if i==0 else self.out_size, - n_out = self.n_out, - dropout = dropout if i+1 != num_layers else 0, - rnn_dropout = rnn_dropout, - bidirectional = bidirectional, - use_tanh = use_tanh, - use_relu = use_relu, - use_kernel = use_kernel, + n_in=self.n_in if i == 0 else self.out_size, + n_out=self.n_out, + dropout=dropout if i + 1 != num_layers else 0, + rnn_dropout=rnn_dropout, + bidirectional=bidirectional, + use_tanh=use_tanh, + use_relu=use_relu, + use_kernel=use_kernel, jit=jit, ) self.rnn_lst.append(l) @@ -614,16 +616,16 @@ def set_bias(self, bias_val=0): l.set_bias(bias_val) def forward(self, input, c0=None, return_hidden=True): - assert input.dim() == 3 # (len, batch, n_in) + assert input.dim() == 3 # (len, batch, n_in) dir_ = 2 if self.bidirectional else 1 if c0 is None: zeros = Variable(input.data.new( - input.size(1), self.n_out*dir_ + input.size(1), self.n_out * dir_ ).zero_()) - c0 = [ zeros for i in range(self.depth) ] + c0 = [zeros for i in range(self.depth)] else: assert c0.dim() == 3 # (depth, batch, n_out*dir_) - c0 = [ x.squeeze(0) for x in c0.chunk(self.depth, 0) ] + c0 = [x.squeeze(0) for x in c0.chunk(self.depth, 0)] prevx = input lstc = [] diff --git a/rnns/benchmarks/sru_test.py b/rnns/benchmarks/sru_test.py index 6ef78c71b4..a8efd5a003 100644 --- a/rnns/benchmarks/sru_test.py +++ b/rnns/benchmarks/sru_test.py @@ -25,15 +25,15 @@ def run_sru(cpu=0, gpu=0, jit=False, use_kernel=False, backward=False, input_size, hidden_size = 128, 128 rnn = SRU(input_size, hidden_size, - num_layers = 2, # number of stacking RNN layers - dropout = 0.00001, # dropout applied between RNN layers - rnn_dropout = 0.0001, # variational dropout applied on linear transformation - use_tanh = 1, # use tanh? - use_relu = 0, # use ReLU? - bidirectional = False, # bidirectional RNN ? - use_kernel=use_kernel, - jit=jit, - ) + num_layers=2, # number of stacking RNN layers + dropout=0.00001, # dropout applied between RNN layers + rnn_dropout=0.0001, # variational dropout applied on linear transformation + use_tanh=1, # use tanh? + use_relu=0, # use ReLU? + bidirectional=False, # bidirectional RNN ? 
+ use_kernel=use_kernel, + jit=jit, + ) rnn.cuda() kernel_tag = '_kernel' if use_kernel else '' @@ -55,13 +55,13 @@ def run_sru(cpu=0, gpu=0, jit=False, use_kernel=False, backward=False, if __name__ == '__main__': parser = argparse.ArgumentParser(description="PyTorch mLSTM benchmark.") - parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") - parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") - parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler") - parser.add_argument('--use-kernel', action='store_true', help="Use specialized kernel") - parser.add_argument('--backward', action='store_true', help="benchmark forward + backward") + parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") + parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") + parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") + parser.add_argument('--jit', action='store_true', help="Use JIT compiler") + parser.add_argument('--use-kernel', action='store_true', help="Use specialized kernel") + parser.add_argument('--backward', action='store_true', help="benchmark forward + backward") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/torchqrnn/forget_mult.py b/rnns/benchmarks/torchqrnn/forget_mult.py index 0b03e87c5a..79d1b6a2d1 100644 --- a/rnns/benchmarks/torchqrnn/forget_mult.py +++ b/rnns/benchmarks/torchqrnn/forget_mult.py @@ -39,7 +39,8 @@ } } extern "C" -__global__ void bwd_recurrent_forget_mult(const float *h, const float *f, const float *x, const float *gh, float *gf, float *gx, float *ghinit, int SEQ, int BATCH, int HIDDEN) +__global__ void bwd_recurrent_forget_mult(const float *h, const float *f, const float *x, const float *gh, float *gf, + float *gx, float *ghinit, int SEQ, int BATCH, int HIDDEN) { /* Note: h is assumed to be one timestep longer than f, x, gf, gx, or gh where dst[0] = h_{-1} @@ -172,7 +173,8 @@ class ForgetMult(torch.nn.Module): - X (seq_len, batch, input_size): tensor containing the features of the input sequence. - F (seq_len, batch, input_size): tensor containing the forget gate values, assumed in range [0, 1]. - hidden_init (batch, input_size): tensor containing the initial hidden state for the recurrence (h_{t-1}). - - use_kernel: If True, use the fast element-wise CUDA kernel for recurrence. If False, uses naive for loop. Default: True. + - use_kernel: If True, use the fast element-wise CUDA kernel for recurrence. + If False, uses naive for loop. Default: True. 
""" def __init__(self, use_kernel=False, jit=False): @@ -217,15 +219,15 @@ def test_accuracy(): seq, batch, hidden = 35, 20, 650 # Larger input (batch * seq * hidden) results in excessive memory for gradient check seq, batch, hidden = 3, 7, 19 - a = Variable(torch.rand(seq, batch, hidden).cuda(), requires_grad=True) + a = Variable(torch.rand(seq, batch, hidden).cuda(), requires_grad=True) forget = Variable(torch.rand(seq, batch, hidden).cuda(), requires_grad=True) last_h = Variable(torch.rand(batch, hidden).cuda(), requires_grad=True) - #seq, batch, hidden = 4, 1, 1 - #a = Variable(torch.Tensor([0.75, 0.5, 0.9, 0.8]).view(seq, batch, hidden).cuda(), requires_grad=True) - #forget = Variable(torch.Tensor([0.25, 0.25, 0.5, 0.4]).view(seq, batch, hidden).cuda(), requires_grad=True) - #last_h = Variable(torch.Tensor([0]).view(batch, hidden).cuda(), requires_grad=True) - #print(forget, a, last_h) + # seq, batch, hidden = 4, 1, 1 + # a = Variable(torch.Tensor([0.75, 0.5, 0.9, 0.8]).view(seq, batch, hidden).cuda(), requires_grad=True) + # forget = Variable(torch.Tensor([0.25, 0.25, 0.5, 0.4]).view(seq, batch, hidden).cuda(), requires_grad=True) + # last_h = Variable(torch.Tensor([0]).view(batch, hidden).cuda(), requires_grad=True) + # print(forget, a, last_h) print('CUDA forget mult') print('=-=-' * 5) diff --git a/rnns/benchmarks/torchqrnn/qrnn.py b/rnns/benchmarks/torchqrnn/qrnn.py index 0f61a6d289..779327ed2a 100644 --- a/rnns/benchmarks/torchqrnn/qrnn.py +++ b/rnns/benchmarks/torchqrnn/qrnn.py @@ -9,15 +9,21 @@ # code copied from https://github.com/salesforce/pytorch-qrnn/tree/master/torchqrnn + class QRNNLayer(nn.Module): r"""Applies a single layer Quasi-Recurrent Neural Network (QRNN) to an input sequence. Args: input_size: The number of expected features in the input x. hidden_size: The number of features in the hidden state h. If not specified, the input size is used. - save_prev_x: Whether to store previous inputs for use in future convolutional windows (i.e. for a continuing sequence such as in language modeling). If true, you must call reset to remove cached previous values of x. Default: False. - window: Defines the size of the convolutional window (how many previous tokens to look when computing the QRNN values). Supports 1 and 2. Default: 1. - zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) to the hidden state updates. Default: 0. - output_gate: If True, performs QRNN-fo (applying an output gate to the output). If False, performs QRNN-f. Default: True. + save_prev_x: Whether to store previous inputs for use in future convolutional windows + (i.e. for a continuing sequence such as in language modeling). If true, you must call + reset to remove cached previous values of x. Default: False. + window: Defines the size of the convolutional window (how many previous tokens to look when computing + the QRNN values). Supports 1 and 2. Default: 1. + zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) + to the hidden state updates. Default: 0. + output_gate: If True, performs QRNN-fo (applying an output gate to the output). + If False, performs QRNN-f. Default: True. use_kernel: If True, uses fast custom CUDA kernel. If False, uses naive for loop. Default: True. Inputs: X, hidden - X (seq_len, batch, input_size): tensor containing the features of the input sequence. 
@@ -27,10 +33,12 @@ class QRNNLayer(nn.Module): - h_n (batch, hidden_size): tensor containing the hidden state for t=seq_len """ - def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, window=1, output_gate=True, use_kernel=True, jit=False): + def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, window=1, + output_gate=True, use_kernel=True, jit=False): super(QRNNLayer, self).__init__() - assert window in [1, 2], "This QRNN implementation currently only handles convolutional window of size 1 or size 2" + assert window in [ + 1, 2], "This QRNN implementation currently only handles convolutional window of size 1 or size 2" self.window = window self.input_size = input_size self.hidden_size = hidden_size if hidden_size else input_size @@ -43,7 +51,8 @@ def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, w assert not (use_kernel and jit) # One large matmul with concat is faster than N small matmuls and no concat - self.linear = nn.Linear(self.window * self.input_size, 3 * self.hidden_size if self.output_gate else 2 * self.hidden_size) + self.linear = nn.Linear(self.window * self.input_size, 3 * + self.hidden_size if self.output_gate else 2 * self.hidden_size) self.forget_mult = ForgetMult(use_kernel, jit) @@ -120,10 +129,15 @@ class QRNN(torch.nn.Module): hidden_size: The number of features in the hidden state h. If not specified, the input size is used. num_layers: The number of QRNN layers to produce. layers: List of preconstructed QRNN layers to use for the QRNN module (optional). - save_prev_x: Whether to store previous inputs for use in future convolutional windows (i.e. for a continuing sequence such as in language modeling). If true, you must call reset to remove cached previous values of x. Default: False. - window: Defines the size of the convolutional window (how many previous tokens to look when computing the QRNN values). Supports 1 and 2. Default: 1. - zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) to the hidden state updates. Default: 0. - output_gate: If True, performs QRNN-fo (applying an output gate to the output). If False, performs QRNN-f. Default: True. + save_prev_x: Whether to store previous inputs for use in future convolutional windows + (i.e. for a continuing sequence such as in language modeling). If true, you must call + reset to remove cached previous values of x. Default: False. + window: Defines the size of the convolutional window (how many previous tokens to look when computing + the QRNN values). Supports 1 and 2. Default: 1. + zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) + to the hidden state updates. Default: 0. + output_gate: If True, performs QRNN-fo (applying an output gate to the output). + If False, performs QRNN-f. Default: True. use_kernel: If True, uses fast custom CUDA kernel. If False, uses naive for loop. Default: True. Inputs: X, hidden - X (seq_len, batch, input_size): tensor containing the features of the input sequence. 
@@ -136,13 +150,14 @@ class QRNN(torch.nn.Module):

     def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=False,
                  dropout=0, bidirectional=False, layers=None, **kwargs):
-        assert bidirectional == False, 'Bidirectional QRNN is not yet supported'
-        assert batch_first == False, 'Batch first mode is not yet supported'
-        assert bias == True, 'Removing underlying bias is not yet supported'
+        assert bidirectional is False, 'Bidirectional QRNN is not yet supported'
+        assert batch_first is False, 'Batch first mode is not yet supported'
+        assert bias is True, 'Removing underlying bias is not yet supported'
         super(QRNN, self).__init__()

-        self.layers = torch.nn.ModuleList(layers if layers else [QRNNLayer(input_size if l == 0 else hidden_size, hidden_size, **kwargs) for l in range(num_layers)])
+        self.layers = torch.nn.ModuleList(layers if layers else [QRNNLayer(
+            input_size if l == 0 else hidden_size, hidden_size, **kwargs) for l in range(num_layers)])

         self.input_size = input_size
         self.hidden_size = hidden_size

@@ -201,6 +216,6 @@ def forward(self, input, hidden=None):
     assert diff < 1e-5, 'CUDA and non-CUDA QRNN layers return different results'

     from torch.autograd import gradcheck
-    inputs = [X,]
+    inputs = [X, ]
     test = gradcheck(QRNNLayer(hidden_size, hidden_size).cuda(), inputs)
     print(test)
diff --git a/rnns/fastrnns/bench.py b/rnns/fastrnns/bench.py
index 9af6c14aa0..d691906124 100644
--- a/rnns/fastrnns/bench.py
+++ b/rnns/fastrnns/bench.py
@@ -154,7 +154,7 @@ def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
     vlrnns = ['vl_cudnn', 'vl_jit', 'vl_py']
     cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
     if args.print_json:
-        print_stderr = lambda *args, **kwargs: None
+        print_stderr = lambda *args, **kwargs: None  # noqa
     print_stderr(args)

     bench_args = vars(args)
diff --git a/rnns/fastrnns/factory.py b/rnns/fastrnns/factory.py
index b7d6f4fe5c..044eaf513c 100644
--- a/rnns/fastrnns/factory.py
+++ b/rnns/fastrnns/factory.py
@@ -202,7 +202,7 @@ def forward(sequences, hidden):

 def varlen_lstm_factory(cell, script):
     def dynamic_rnn(sequences, hiddens, wih, whh, bih, bhh):
-        # type: (List[Tensor], Tuple[Tensor, Tensor], Tensor, Tensor, Tensor, Tensor) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]
+        # type: (List[Tensor], Tuple[Tensor, Tensor], Tensor, Tensor, Tensor, Tensor) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]  # noqa
         hx, cx = hiddens
         hxs = hx.unbind(1)
         cxs = cx.unbind(1)
@@ -234,7 +234,7 @@ def dynamic_rnn(sequences, hiddens, wih, whh, bih, bhh):


 def varlen_lstm_creator(script=False, **kwargs):
-    sequences, _, hidden, params, _ = varlen_lstm_inputs(
+    sequences, _, hidden, params, _ = varlen_lstm_inputs(
         return_module=False, **kwargs)
     inputs = [sequences, hidden] + params[0]
     return ModelDef(
@@ -307,7 +307,6 @@ def dynamic_rnn(input, hidden, wih, whh, bih, bhh):

     return dynamic_rnn

-
 # premul: we're going to premultiply the inputs & weights
 def lstm_factory_premul(premul_cell, script):
     def dynamic_rnn(input, hidden, wih, whh, bih, bhh):
diff --git a/rnns/fastrnns/scratch.py b/rnns/fastrnns/scratch.py
index 07cc803989..c51d71625f 100644
--- a/rnns/fastrnns/scratch.py
+++ b/rnns/fastrnns/scratch.py
@@ -1,5 +1,6 @@
 import torch

+
 @torch.jit.script
 def fn(x, scale, shift):
     return scale * x / shift
@@ -25,6 +26,7 @@ def recurrent(x, scale, shift):

 import torch

+
 @torch.jit.script
 def recurrent_scaleshift(x, scale, shift):
     y = x
@@ -44,6 +46,6 @@ def recurrent_scaleshift(x, scale, shift):
 import torch
 x = torch.tensor([])
 x.requires_grad = True
-x.mean().backward() # no error triggered
+x.mean().backward()  # no error triggered
 x = x.cuda()
 x.mean().backward()
diff --git a/rnns/runner.py b/rnns/runner.py
index e0aee903c2..2b589e4e71 100644
--- a/rnns/runner.py
+++ b/rnns/runner.py
@@ -6,13 +6,13 @@
 from benchmarks.mlstm import run_mlstm
 from benchmarks.lstm import run_lstm
 from benchmarks.cudnn_lstm import run_cudnn_lstm
-from benchmarks.tensor import run_tensor
+from benchmarks.tensor import run_tensor

 from benchmarks.lstm_variants_test import run_lstm_variant
 from benchmarks.bnlstm import run_bnlstm
 from benchmarks.sru_test import run_sru
 from benchmarks.qrnn import run_qrnn

-from benchmarks.sequence_labeler import test_wsj
+from benchmarks.sequence_labeler import test_wsj
 from benchmarks.sequence_labeler import Example
 from benchmarks.common import AttrDict
@@ -124,7 +124,7 @@ def discover_benchmarks():

 def title(text='title', width=80):
     reserve = len(text) + 2
-    num_lines = int((width - reserve)/2)
+    num_lines = int((width - reserve) / 2)
     lines = '-' * num_lines
     return '{} {} {}'.format(lines, text, lines)

@@ -138,11 +138,11 @@ def summarize(result):
     if gpu_summary.max == 0 and gpu_summary.min == 0:
         use_summary = cpu_summary

-    range_middle = (use_summary.max + use_summary.min)/2
+    range_middle = (use_summary.max + use_summary.min) / 2
     deviation = use_summary.max - range_middle

     return '{2:10.4f} ± {3:8.4f} msec (average {1:10.4f} msec, {4} samples) [{0}]'.format(
-            result.name, use_summary.mean, range_middle, deviation, samples)
+        result.name, use_summary.mean, range_middle, deviation, samples)


 def main():
diff --git a/run.py b/run.py
index df34a3ff4e..bfbeb78549 100644
--- a/run.py
+++ b/run.py
@@ -3,10 +3,9 @@


 def get_docker_run_cmd(image_name):
-    return ["sudo", "docker", "run", "--rm", "--cap-add=SYS_PTRACE",
-           "--security-opt", "seccomp=unconfined", "-v", os.getcwd() + ":/mnt/localdrive",
-           "--cpuset-cpus=0-3", "-t", "--user=jenkins", image_name]
+    return ["sudo", "docker", "run", "--rm", "--cap-add=SYS_PTRACE",
+            "--security-opt", "seccomp=unconfined", "-v", os.getcwd() + ":/mnt/localdrive",
+            "--cpuset-cpus=0-3", "-t", "--user=jenkins", image_name]

 if __name__ == "__main__":
-    call(get_docker_run_cmd("tmp-utcnpjpdbsorhktnnnttixmkvlyxirwl") + ["/bin/bash", "/mnt/localdrive/python/run.sh"])
-
+    call(get_docker_run_cmd("tmp-utcnpjpdbsorhktnnnttixmkvlyxirwl") + ["/bin/bash", "/mnt/localdrive/python/run.sh"])
diff --git a/setup/bench_conf.py b/setup/bench_conf.py
index 58f5ac9c59..48b2b7c1ef 100644
--- a/setup/bench_conf.py
+++ b/setup/bench_conf.py
@@ -8,11 +8,12 @@

 CPUInfo = namedtuple('CPUInfo', ['processor', 'physical_id', 'core_id'])

+
 def get_cpus():
     with open('/proc/cpuinfo', 'r') as f:
         raw_out = f.read()
     relevant_lines = [l for l in raw_out.split('\n')
-                          if 'processor' in l or 'physical id' in l or 'core id' in l]
+                      if 'processor' in l or 'physical id' in l or 'core id' in l]
     assert len(relevant_lines) % 3 == 0

     line_data = [int(l[l.index(':') + 1:].strip()) for l in relevant_lines]
@@ -84,6 +85,7 @@ def remove_shield():
 # CPU Turbo Mode
 ################################################################################

+
 def set_turbo(value):
     with open('/sys/devices/system/cpu/intel_pstate/no_turbo', 'w') as f:
         f.write('0' if value else '1')
@@ -92,6 +94,7 @@
 # Helpers
 ################################################################################

+
 def isolate_bench_subset(cpus):
     bench_cpus = [cpu for cpu in cpus if cpu.physical_id == 0]
     bg_cpus = [cpu for cpu in cpus if cpu.physical_id != 0]
@@ -103,6 +106,7 @@ def isolate_bench_subset(cpus):
 # Setup/Teardown
 ################################################################################

+
 def setup_benchmark_env():
     set_turbo(False)
     all_active_cpus = disable_ht()
@@ -111,11 +115,13 @@ def setup_benchmark_env():
     with open('bench_cpus', 'w') as f:
         f.write(','.join(str(cpu.processor) for cpu in bench_cpus))

+
 def teardown_benchmark_env():
     remove_shield()
     enable_ht()
     set_turbo(True)

+
 def main():
     parser = argparse.ArgumentParser(description='Configure benchmarking environment')
     parser.add_argument('--setup', action='store_true')
diff --git a/timing/python/benchmarks/__init__.py b/timing/python/benchmarks/__init__.py
index 9534424d74..2de923f41b 100644
--- a/timing/python/benchmarks/__init__.py
+++ b/timing/python/benchmarks/__init__.py
@@ -5,4 +5,3 @@
 from benchmarks.cpu_unary_benchmark import CPUUnaryBench
 from benchmarks.cpu_unary_benchmark import NumpyUnaryComparison
 from benchmarks.cuda_lstm_benchmark import CUDALSTMBench
-
diff --git a/timing/python/benchmarks/misc/mobilenet.py b/timing/python/benchmarks/misc/mobilenet.py
index a7d446aa15..cbbafab54e 100644
--- a/timing/python/benchmarks/misc/mobilenet.py
+++ b/timing/python/benchmarks/misc/mobilenet.py
@@ -52,6 +52,7 @@ class MobileNetV2(nn.Module):
         `"Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation" `_paper.
     """

+
     def __init__(self, n_class=1000, input_size=224, width_mult=1.):
         super(MobileNetV2, self).__init__()
         # setting of inverted residual blocks
@@ -82,7 +83,7 @@ def __init__(self, n_class=1000, input_size=224, width_mult=1.):
             input_channel = output_channel
         # building last several layers
         self.features.append(conv_1x1_bn(input_channel, self.last_channel))
-        self.features.append(nn.AvgPool2d(int(input_size/32)))
+        self.features.append(nn.AvgPool2d(int(input_size / 32)))
         # make it nn.Sequential
         self.features = nn.Sequential(*self.features)
@@ -114,4 +115,3 @@ def _initialize_weights(self):
                 n = m.weight.size(1)
                 m.weight.data.normal_(0, 0.01)
                 m.bias.data.zero_()
-
diff --git a/timing/python/framework/maybe_garbage.py b/timing/python/framework/maybe_garbage.py
index 8dfb605f57..72d2973dde 100644
--- a/timing/python/framework/maybe_garbage.py
+++ b/timing/python/framework/maybe_garbage.py
@@ -2,13 +2,13 @@
 # cpu_pin(cpu)

-#class Benchmark(object):
+# class Benchmark(object):
 # #    default_params = []
 # #    params = make_params()
 # #    param_names = ['config']
-#
-#
+#
+#
 # def start_stats(common_name, framework_name, fname, mag, count, tv):
 #     status = ""
 #     status += "tag: {:<15}".format(common_name)
@@ -20,14 +20,14 @@
 #     status += " stride: {:<60}".format(list(map(lambda x: "{:>7}".format(x), list(tv.stride()))))
 #     status += " numel: {:<9}".format(tv.numel())
 #     return status
-#
+#
 # def finish_stats(dtype, dim, elapsed):
 #     status = ""
 #     status += " type: {:<18}".format(dtype)
 #     status += " dim: {:<5}".format(dim)
 #     status += " elapsed: {:8.4f}".format(elapsed)
 #     return status
-#
+#
 # def lambda_benchmark(common_name, types, fun, name, framework_name, cast):
 #     goal_size = 1000
 #     onek = 1000
@@ -57,10 +57,10 @@
 # class over(object):
 #     def __init__(self, *args):
 #         self.values = args
-#
-#
-#
-#
+#
+#
+#
+#
 # def make_params(**kwargs):
 #     keys = list(kwargs.keys())
 #     iterables = [kwargs[k].values if isinstance(kwargs[k], over) else (kwargs[k],) for k in keys]
@@ -73,61 +73,60 @@
 #         for k, v in self.default_params.items():
 #             params.setdefault(k, v)
 #         self.prepare(params)
-#
-#
+#
+#
 # def get_env_pytorch_examples():
 #     pytorch_examples_home = os.environ.get('EXAMPLES_HOME')
 #     if pytorch_examples_home is None:
 #         print('EXAMPLES_HOME not found')
 #         sys.exit()
-#
+#
 #     return pytorch_examples_home
-#
-#
+#
+#
 # def execution(cmd, log_path):
 #     gc.collect()
-#
+#
 #     # logging
 #     log_file = open(log_path, "w+")
 #     log_file.write(cmd)
 #     log_file.write('\n')
-#
+#
 #     exec_command = shlex.split(cmd)
 #     proc = subprocess.Popen(exec_command, stdout=log_file, stderr=subprocess.STDOUT)
 #     proc.wait()
 #     return_code = proc.returncode
 #     log_file.close()
-#
+#
 #     log_file = open(log_path, 'r+')
-#
+#
 #     if return_code == 0:
 #         acc = parse_accuracy(log_file)
 #     else:
 #         acc = ('NA', 'NA')
-#
+#
 #     return acc
-#
-#
+#
+#
 # def parse_accuracy(log_file):
 #     output_data = log_file.readlines()
 #     _, _, prec1, _, prec2 = output_data[-2].split()
 #     return (prec1, prec2)
-#
-#
+#
+#
 # def config_runs(model, no_iter):
 #     iters = [i for i in range(no_iter)]
 #     if model == 'all':
 #         model = model_names
-#
+#
 #     return list(itertools.product(model, iters))
-#
-#
+#
+#
 # def cmd_string(examples_home, model, data_path):
 #     lr = 0.1
 #     if model in ['alexnet', 'vgg11', 'vgg11_bn', 'vgg13_bn',
 #                  'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']:
 #         lr = 0.01
-#
+#
 #     cmd = ' '.join(['python3', examples_home, '-a', model, '--lr', str(lr), data_path])
 #     return cmd
-
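
For readers unfamiliar with the torchqrnn code reformatted above, here is a minimal usage sketch. It is illustrative only and not part of the patch; it assumes the in-repo import path benchmarks.torchqrnn.qrnn is on sys.path and that a CUDA device is available, and the tensor shapes follow the docstrings above.

import torch
from benchmarks.torchqrnn.qrnn import QRNN

seq_len, batch, input_size, hidden_size = 7, 20, 128, 256
X = torch.randn(seq_len, batch, input_size).cuda()

# use_kernel=False selects the naive for-loop ForgetMult path, so no
# custom CUDA kernel build is needed; the reformatted constructor above
# shows that extra kwargs such as use_kernel are forwarded from QRNN to
# each QRNNLayer.
rnn = QRNN(input_size, hidden_size, num_layers=2, use_kernel=False).cuda()

output, h_n = rnn(X)
# output: (seq_len, batch, hidden_size) per the docstring;
# h_n stacks each layer's final hidden state.
print(output.shape, h_n.shape)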