From 7c381e393b1fc2ed8874a5e8f726ea9683149680 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Tue, 5 Feb 2019 16:24:10 -0800
Subject: [PATCH] add flake8 config from pytorch and format the code

---
 .flake8                                    |  4 +
 accuracy/model_accuracy.py                 |  3 +-
 plot/main.py                               | 47 ++++++-----
 rnns/benchmarks/benchmark_common.py        |  5 +-
 rnns/benchmarks/bnlstm.py                  | 15 ++--
 rnns/benchmarks/common.py                  |  2 +-
 rnns/benchmarks/cudnn_lstm.py              | 28 +++----
 rnns/benchmarks/lstm.py                    | 53 +++++++-----
 rnns/benchmarks/lstm_variants/container.py |  3 +-
 rnns/benchmarks/lstm_variants/lstm.py      | 25 +++---
 rnns/benchmarks/lstm_variants/normalize.py |  2 +-
 rnns/benchmarks/memnn.py                   | 10 +--
 rnns/benchmarks/mlstm.py                   | 39 ++++-----
 rnns/benchmarks/models/bnlstm.py           | 10 +--
 rnns/benchmarks/models/memnn.py            | 14 ++--
 rnns/benchmarks/models/mlstm.py            |  3 +
 rnns/benchmarks/nlp.py                     | 26 +++---
 rnns/benchmarks/qrnn.py                    | 20 ++---
 rnns/benchmarks/rnn.py                     |  6 +-
 rnns/benchmarks/sequence_labeler.py        | 18 ++--
 rnns/benchmarks/sru.py                     | 98 +++++++++++-----------
 rnns/benchmarks/sru_test.py                | 32 +++----
 rnns/benchmarks/torchqrnn/forget_mult.py   | 18 ++--
 rnns/benchmarks/torchqrnn/qrnn.py          | 47 +++++++----
 rnns/fastrnns/bench.py                     |  2 +-
 rnns/fastrnns/factory.py                   |  5 +-
 rnns/fastrnns/scratch.py                   |  4 +-
 rnns/runner.py                             | 10 +--
 run.py                                     |  9 +-
 setup/bench_conf.py                        |  8 +-
 timing/python/benchmarks/__init__.py       |  1 -
 timing/python/benchmarks/misc/mobilenet.py |  4 +-
 timing/python/framework/maybe_garbage.py   | 55 ++++++------
 33 files changed, 344 insertions(+), 282 deletions(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..1c753d1568
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 120
+ignore = E203,E305,E402,E721,E741,F401,F403,F405,F821,F841,F999,W503,W504
+exclude = third_party
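
The config above mirrors pytorch's flake8 setup. A rough sketch of how it gets exercised locally (the invocation below is an assumption, not something this patch adds):

    # Lint the tree; flake8 discovers the .flake8 file at the repository root.
    # Assumes flake8 is installed in the environment.
    import subprocess

    proc = subprocess.run(['flake8', '.'], capture_output=True, text=True)
    print(proc.stdout or 'clean')
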
diff --git a/accuracy/model_accuracy.py b/accuracy/model_accuracy.py
index c415c3a1ce..232182c107 100755
--- a/accuracy/model_accuracy.py
+++ b/accuracy/model_accuracy.py
@@ -6,7 +6,6 @@
 from datetime import datetime
 import logging
 from tqdm import tqdm
-import os
 import gc
 import itertools
 import sys
@@ -15,7 +14,6 @@
 import torchvision.models as models
 
-
 parser = argparse.ArgumentParser(description="PyTorch model accuracy benchmark.")
 parser.add_argument('--repeat', type=int, default=5, help="Number of Runs")
@@ -90,6 +88,7 @@ def cmd_string(examples_home, model, data_path):
     cmd = ' '.join(['python3', examples_home, '-a', model, '--lr', str(lr), data_path])
     return cmd
 
+
 def log_init():
     if not os.path.exists(temp_dir):
         os.makedirs(temp_dir)
diff --git a/plot/main.py b/plot/main.py
index 8d363e6956..85f6d4f5e0 100644
--- a/plot/main.py
+++ b/plot/main.py
@@ -14,13 +14,14 @@
 HERE = os.path.dirname(os.path.abspath(__file__))
 MAX_BENCHES = 160
 BENCH_TIMES = 4
-BENCH_EVERY = 10 # th commit
+BENCH_EVERY = 10  # th commit
 
 run = partial(subprocess.check_call, cwd=REPO_DIR)
 run_with_output = partial(subprocess.check_output, cwd=REPO_DIR)
 run_toplevel = subprocess.check_call
 silent = dict(stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+
 def fetch_repo():
     if os.path.exists(REPO_DIR):
         print('> Pulling new changes...')
@@ -30,6 +31,7 @@ def fetch_repo():
         print('> Cloning repository...')
         run_toplevel(['git', 'clone', '--recursive', REPO_URL, REPO_DIR], **silent)
 
+
 def get_history():
     # git log --format='%H %an %ae %at' -n
     fields = [
@@ -56,11 +58,11 @@ def build(commit_hash):
     start = time.time()
     cname = container_name(commit_hash)
     run(['docker', 'run',
-         '--runtime=nvidia',
-         '-v', os.path.join(HERE, '..') + ':/mnt/localdrive',
-         '--name', cname,
-         '-t', 'pytorch_bench',
-         '/bin/bash', '/mnt/localdrive/timing/python/install_pytorch.sh', commit_hash], **silent)
+         '--runtime=nvidia',
+         '-v', os.path.join(HERE, '..') + ':/mnt/localdrive',
+         '--name', cname,
+         '-t', 'pytorch_bench',
+         '/bin/bash', '/mnt/localdrive/timing/python/install_pytorch.sh', commit_hash], **silent)
     run(['docker', 'commit', cname, cname], **silent)
     end = time.time()
     diff = int(end - start)
@@ -78,16 +80,16 @@ def run_benchmark(commit_hash, args, **kwargs):
     BENCH_CPUS = '0-11'
     BENCH_MEMS = '0'
     return run_with_output(['docker', 'run',
-                            '--cap-add=SYS_PTRACE',
-                            '--runtime=nvidia',
-                            '--security-opt',
-                            'seccomp=unconfined',
-                            '-v', os.path.join(HERE, '..') + ':/mnt/localdrive',
-                            '-w', '/mnt/localdrive',
-                            '--cpuset-cpus=' + BENCH_CPUS,
-                            '--cpuset-mems=' + BENCH_MEMS,
-                            '-t', container_name(commit_hash),
-                            *args], **kwargs).decode('utf8')
+                            '--cap-add=SYS_PTRACE',
+                            '--runtime=nvidia',
+                            '--security-opt',
+                            'seccomp=unconfined',
+                            '-v', os.path.join(HERE, '..') + ':/mnt/localdrive',
+                            '-w', '/mnt/localdrive',
+                            '--cpuset-cpus=' + BENCH_CPUS,
+                            '--cpuset-mems=' + BENCH_MEMS,
+                            '-t', container_name(commit_hash),
+                            *args], **kwargs).decode('utf8')
 
 
 def load_results():
@@ -137,12 +139,14 @@ def merge_into(original, new):
         else:
             original[key] = new[key]
 
+
 def print_plan(to_bench):
     if not to_bench:
         print('> Nothing to do!')
         return
     print('> Building {} commits:'.format(len(to_bench)))
-    print('\n'.join(' - {} from {}'.format(result['hash'], datetime.fromtimestamp(result['commit_time'])) for result in to_bench))
+    print('\n'.join(' - {} from {}'.format(result['hash'],
+                                           datetime.fromtimestamp(result['commit_time'])) for result in to_bench))
 
 
 BENCHMARKS = [
 ]
 
 # List[Dict[Dict[Int]]] -> Dict[Dict[List[Int]]]
+
+
 def transpose_results(results):
     def get_keys(result):
         return sorted([(outer, inner) for outer in result for inner in result[outer]])
     keys = get_keys(results[0])
     assert all(get_keys(result) == keys for result in results)
     any_result = results[0]
-    return {outer: {inner: [result[outer][inner] for result in results] for inner in any_result[outer]} for outer in any_result}
+    return {outer: {inner: [result[outer][inner] for result in results] for inner in any_result[outer]}
+            for outer in any_result}
 
 
 def result_stats(result):
     def mean(l):
         return sum(l) / len(l)
+
     def std(l):
         m = mean(l)
         return math.sqrt(sum([(v - m) ** 2 for v in l]) / len(l))
-    return {outer: {inner: (mean(innerv), std(innerv)) for inner, innerv in outerv.items()} for outer, outerv in result.items()}
+    return {outer: {inner: (mean(innerv), std(innerv)) for inner, innerv in outerv.items()}
+            for outer, outerv in result.items()}
 
 
 if __name__ == '__main__':
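
For orientation, a tiny sketch of what the two helpers above compute on toy data (names taken from plot/main.py; std() is read as a population standard deviation):

    # transpose_results: List[Dict[Dict[Int]]] -> Dict[Dict[List[Int]]]
    runs = [{'lstm': {'gpu_msecs': 10}}, {'lstm': {'gpu_msecs': 12}}]
    by_key = transpose_results(runs)   # {'lstm': {'gpu_msecs': [10, 12]}}
    stats = result_stats(by_key)       # {'lstm': {'gpu_msecs': (11.0, 1.0)}}, i.e. (mean, std)
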
diff --git a/rnns/benchmarks/benchmark_common.py b/rnns/benchmarks/benchmark_common.py
index be9cfef956..34b412d44c 100644
--- a/rnns/benchmarks/benchmark_common.py
+++ b/rnns/benchmarks/benchmark_common.py
@@ -4,6 +4,7 @@
 
 # Copied and pasted from benchmark_common under benchmark/scripts
 
+
 def benchmark_init(cpu, gpu, skip_cpu_governor_check=False):
     cpu_pin(cpu)
     if not skip_cpu_governor_check:
@@ -32,5 +33,7 @@ def check_cpu_governor(cpu):
                        "The file '{}' is not readable.\n"
                        "More information:\n\n{}".format(fp, e))
 
+
 def print_results_usecs(name, i, gpu_usecs, cpu_usecs, divide_by):
-    print("{}({:2d}): {:8.3f} usecs ({:8.3f} usecs cpu)".format(name, i, gpu_usecs/divide_by, cpu_usecs/divide_by, file=sys.stderr))
+    print("{}({:2d}): {:8.3f} usecs ({:8.3f} usecs cpu)".format(
+        name, i, gpu_usecs / divide_by, cpu_usecs / divide_by), file=sys.stderr)
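
check_cpu_governor above reads the standard Linux cpufreq sysfs entry; a minimal sketch of the check it performs (the path is the usual cpufreq location, and the remediation command is an assumption):

    # Benchmarks expect the `performance` governor so CPU frequency scaling does not add noise.
    fp = '/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'
    with open(fp) as f:
        assert f.read().strip() == 'performance', 'try: sudo cpupower frequency-set -g performance'
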
diff --git a/rnns/benchmarks/bnlstm.py b/rnns/benchmarks/bnlstm.py
index d0268b40dc..4b84dc7d3c 100644
--- a/rnns/benchmarks/bnlstm.py
+++ b/rnns/benchmarks/bnlstm.py
@@ -56,7 +56,6 @@ def cast(tensor):
         model.cuda()
         criterion.cuda()
 
-
     total_loss = 0
     for data, targets in zip(data_batches, target_batches):
         gc.collect()
@@ -71,13 +70,13 @@ def cast(tensor):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="PyTorch BNLSTM benchmark.")
-    parser.add_argument('--num_batches', type=int, default=1, help="num batches")
-    parser.add_argument('--hidden-size', type=int, default=100, help="Hidden size")
-    parser.add_argument('--max-length', type=int, default=784, help="max seq len")
-    parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations")
-    parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations")
-    parser.add_argument('--jit', action='store_true', help="Use JIT")
-    parser.add_argument('--cuda', action='store_true', help="Use cuda")
+    parser.add_argument('--num_batches', type=int, default=1, help="num batches")
+    parser.add_argument('--hidden-size', type=int, default=100, help="Hidden size")
+    parser.add_argument('--max-length', type=int, default=784, help="max seq len")
+    parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations")
+    parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations")
+    parser.add_argument('--jit', action='store_true', help="Use JIT")
+    parser.add_argument('--cuda', action='store_true', help="Use cuda")
 
     args = parser.parse_args()
     pprint.pprint(vars(args))
diff --git a/rnns/benchmarks/common.py b/rnns/benchmarks/common.py
index 7a2579ce11..ade1bc7850 100644
--- a/rnns/benchmarks/common.py
+++ b/rnns/benchmarks/common.py
@@ -84,7 +84,7 @@ def summary(self):
         assert not self.timing
 
         def mean_min_max(lst):
-            return SummaryStats(sum(lst)/len(lst), min(lst), max(lst))
+            return SummaryStats(sum(lst) / len(lst), min(lst), max(lst))
 
         gpu_msecs, cpu_msecs = zip(*self.results)
         warmup = self.warmup_iters
diff --git a/rnns/benchmarks/cudnn_lstm.py b/rnns/benchmarks/cudnn_lstm.py
index 05a01ae734..032719c030 100644
--- a/rnns/benchmarks/cudnn_lstm.py
+++ b/rnns/benchmarks/cudnn_lstm.py
@@ -23,13 +23,12 @@ def run_cudnn_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
 
     benchmark_init(cpu, gpu, skip_cpu_governor_check)
 
-
     def V(x):
         return Variable(x)  # mandatory
 
     input = V(torch.randn(seq_len, batch_size, input_size).cuda(gpu))
-    hx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
-    cx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
+    hx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
+    cx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
 
     lstm = torch.nn.LSTM(input_size, hidden_size, layers).cuda(gpu)
     lstm.flatten_parameters()
@@ -48,17 +47,18 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="PyTorch CuDNN LSTM benchmark.")
-    parser.add_argument('--cpu', type=int, default=0, help="CPU to run on")
-    parser.add_argument('--gpu', type=int, default=0, help="GPU to run on")
-    parser.add_argument('--batch-size', type=int, default=1, help="Batch size")
-    parser.add_argument('--input-size', type=int, default=256, help="Input size")
-    parser.add_argument('--hidden-size', type=int, default=512, help="Hidden size")
-    parser.add_argument('--layers', type=int, default=1, help="Layers")
-    parser.add_argument('--seq-len', type=int, default=512, help="Sequence length")
-    parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations")
-    parser.add_argument('--benchmark', type=int, default=30, help="Benchmark iterations")
-    parser.add_argument('--skip-cpu-governor-check', action='store_true', help="Skip checking whether CPU governor is set to `performance`")
-    parser.add_argument('--backward', action='store_true', help="time backward")
+    parser.add_argument('--cpu', type=int, default=0, help="CPU to run on")
+    parser.add_argument('--gpu', type=int, default=0, help="GPU to run on")
+    parser.add_argument('--batch-size', type=int, default=1, help="Batch size")
+    parser.add_argument('--input-size', type=int, default=256, help="Input size")
+    parser.add_argument('--hidden-size', type=int, default=512, help="Hidden size")
+    parser.add_argument('--layers', type=int, default=1, help="Layers")
+    parser.add_argument('--seq-len', type=int, default=512, help="Sequence length")
+    parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations")
+    parser.add_argument('--benchmark', type=int, default=30, help="Benchmark iterations")
+    parser.add_argument('--skip-cpu-governor-check', action='store_true',
+                        help="Skip checking whether CPU governor is set to `performance`")
+    parser.add_argument('--backward', action='store_true', help="time backward")
 
     args = parser.parse_args()
     pprint.pprint(vars(args))
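
A hedged usage sketch for the benchmark above, mirroring how rnns/runner.py imports it (the keyword values are just the argparse defaults from this hunk):

    # Drive the cuDNN LSTM benchmark directly instead of via the CLI.
    from benchmarks.cudnn_lstm import run_cudnn_lstm  # import path as used by rnns/runner.py

    run_cudnn_lstm(batch_size=1, input_size=256, hidden_size=512, layers=1,
                   seq_len=512, warmup=10, benchmark=30, skip_cpu_governor_check=True)
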
"__main__": parser = argparse.ArgumentParser(description="PyTorch LSTM benchmark.") - parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") - parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") - parser.add_argument('--batch-size', type=int, default=1, help="Batch size") - parser.add_argument('--input-size', type=int, default=256, help="Input size") - parser.add_argument('--hidden-size', type=int, default=512, help="Hidden size") - parser.add_argument('--seq-len', type=int, default=None, help="Sequence length") - parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") - parser.add_argument('--autograd', action='store_true', help="Use autograd") - parser.add_argument('--variable', action='store_true', help="Use Variable, but not autograd (measure baseline overhead)") - parser.add_argument('--fused', action='store_true', help="Use fused cell") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler (implies --autograd)") - parser.add_argument('--backward', action='store_true', help="Run backwards computation") - parser.add_argument('--skip-cpu-governor-check', action='store_true', help="Skip checking whether CPU governor is set to `performance`") + parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") + parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") + parser.add_argument('--batch-size', type=int, default=1, help="Batch size") + parser.add_argument('--input-size', type=int, default=256, help="Input size") + parser.add_argument('--hidden-size', type=int, default=512, help="Hidden size") + parser.add_argument('--seq-len', type=int, default=None, help="Sequence length") + parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") + parser.add_argument('--autograd', action='store_true', help="Use autograd") + parser.add_argument('--variable', action='store_true', + help="Use Variable, but not autograd (measure baseline overhead)") + parser.add_argument('--fused', action='store_true', help="Use fused cell") + parser.add_argument('--jit', action='store_true', help="Use JIT compiler (implies --autograd)") + parser.add_argument('--backward', action='store_true', help="Run backwards computation") + parser.add_argument('--skip-cpu-governor-check', action='store_true', + help="Skip checking whether CPU governor is set to `performance`") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/lstm_variants/container.py b/rnns/benchmarks/lstm_variants/container.py index e94138453a..7aa4bc6658 100644 --- a/rnns/benchmarks/lstm_variants/container.py +++ b/rnns/benchmarks/lstm_variants/container.py @@ -17,7 +17,7 @@ class MultiLayerLSTM(nn.Module): """ MultiLayer LSTM of any type. - + Note: Dropout is deactivated on the last layer. 
""" @@ -64,4 +64,3 @@ def forward(self, x, hiddens): x, new_h = l(x, h) new_hiddens.append(new_h) return x, new_hiddens - diff --git a/rnns/benchmarks/lstm_variants/lstm.py b/rnns/benchmarks/lstm_variants/lstm.py index 118746d911..6b3ef7adcc 100644 --- a/rnns/benchmarks/lstm_variants/lstm.py +++ b/rnns/benchmarks/lstm_variants/lstm.py @@ -198,8 +198,8 @@ def forward(self, x, hidden): c_t = th.mul(c, f_t) + th.mul(i_t, g_t) if do_dropout and self.dropout_method == 'moon': - c_t.data.set_(th.mul(c_t, self.mask).data) - c_t.data *= 1.0/(1.0 - self.dropout) + c_t.data.set_(th.mul(c_t, self.mask).data) + c_t.data *= 1.0 / (1.0 - self.dropout) h_t = th.mul(o_t, c_t.tanh()) @@ -208,8 +208,8 @@ def forward(self, x, hidden): if self.dropout_method == 'pytorch': F.dropout(h_t, p=self.dropout, training=self.training, inplace=True) if self.dropout_method == 'gal': - h_t.data.set_(th.mul(h_t, self.mask).data) - h_t.data *= 1.0/(1.0 - self.dropout) + h_t.data.set_(th.mul(h_t, self.mask).data) + h_t.data *= 1.0 / (1.0 - self.dropout) h_t = h_t.view(1, h_t.size(0), -1) c_t = c_t.view(1, c_t.size(0), -1) @@ -237,6 +237,7 @@ class MoonLSTM(LSTM): 'RNNDrop: A Novel Dropout for RNNs in ASR' https://www.stat.berkeley.edu/~tsmoon/files/Conference/asru2015.pdf """ + def __init__(self, *args, **kwargs): super(MoonLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'moon' @@ -249,6 +250,7 @@ class SemeniutaLSTM(LSTM): 'Recurrent Dropout without Memory Loss' https://arxiv.org/pdf/1603.05118.pdf """ + def __init__(self, *args, **kwargs): super(SemeniutaLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'semeniuta' @@ -275,8 +277,8 @@ def __init__(self, input_size, hidden_size, bias=True, dropout=0.0, dropout=dropout, dropout_method=dropout_method) if ln_preact: - self.ln_i2h = LayerNorm(4*hidden_size, learnable=learnable) - self.ln_h2h = LayerNorm(4*hidden_size, learnable=learnable) + self.ln_i2h = LayerNorm(4 * hidden_size, learnable=learnable) + self.ln_h2h = LayerNorm(4 * hidden_size, learnable=learnable) self.ln_preact = ln_preact self.ln_cell = LayerNorm(hidden_size, learnable=learnable) @@ -309,8 +311,8 @@ def forward(self, x, hidden): c_t = th.mul(c, f_t) + th.mul(i_t, g_t) if do_dropout and self.dropout_method == 'moon': - c_t.data.set_(th.mul(c_t, self.mask).data) - c_t.data *= 1.0/(1.0 - self.dropout) + c_t.data.set_(th.mul(c_t, self.mask).data) + c_t.data *= 1.0 / (1.0 - self.dropout) c_t = self.ln_cell(c_t) h_t = th.mul(o_t, c_t.tanh()) @@ -320,8 +322,8 @@ def forward(self, x, hidden): if self.dropout_method == 'pytorch': F.dropout(h_t, p=self.dropout, training=self.training, inplace=True) if self.dropout_method == 'gal': - h_t.data.set_(th.mul(h_t, self.mask).data) - h_t.data *= 1.0/(1.0 - self.dropout) + h_t.data.set_(th.mul(h_t, self.mask).data) + h_t.data *= 1.0 / (1.0 - self.dropout) h_t = h_t.view(1, h_t.size(0), -1) c_t = c_t.view(1, c_t.size(0), -1) @@ -333,6 +335,7 @@ class LayerNormGalLSTM(LayerNormLSTM): """ Mixes GalLSTM's Dropout with Layer Normalization """ + def __init__(self, *args, **kwargs): super(LayerNormGalLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'gal' @@ -344,6 +347,7 @@ class LayerNormMoonLSTM(LayerNormLSTM): """ Mixes MoonLSTM's Dropout with Layer Normalization """ + def __init__(self, *args, **kwargs): super(LayerNormMoonLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'moon' @@ -355,6 +359,7 @@ class LayerNormSemeniutaLSTM(LayerNormLSTM): """ Mixes SemeniutaLSTM's Dropout with Layer Normalization """ + def __init__(self, *args, 
**kwargs): super(LayerNormSemeniutaLSTM, self).__init__(*args, **kwargs) self.dropout_method = 'semeniuta' diff --git a/rnns/benchmarks/lstm_variants/normalize.py b/rnns/benchmarks/lstm_variants/normalize.py index 161d204e2a..b92afcebc9 100644 --- a/rnns/benchmarks/lstm_variants/normalize.py +++ b/rnns/benchmarks/lstm_variants/normalize.py @@ -48,5 +48,5 @@ def forward(self, x): x = x.view(x.size(0), -1) x = (x - th.mean(x, 1).unsqueeze(1)) / th.sqrt(th.var(x, 1).unsqueeze(1) + self.epsilon) if self.learnable: - x = self.alpha.expand_as(x) * x + self.beta.expand_as(x) + x = self.alpha.expand_as(x) * x + self.beta.expand_as(x) return x.view(size) diff --git a/rnns/benchmarks/memnn.py b/rnns/benchmarks/memnn.py index 040489bcbc..dd4a0bad41 100644 --- a/rnns/benchmarks/memnn.py +++ b/rnns/benchmarks/memnn.py @@ -44,7 +44,7 @@ def run_memnn(warmup=2, benchmark=18, jit=False, cuda=False): [ # memories, queries, memory_lengths, query_lengths torch.zeros(params.batch_size * params.mem_size, dtype=torch.long, device=device), torch.zeros(params.batch_size * 28 , dtype=torch.long, device=device), - torch.ones (params.batch_size, params.mem_size , dtype=torch.long, device=device), + torch.ones(params.batch_size, params.mem_size , dtype=torch.long, device=device), torch.full((params.batch_size,), 28 , dtype=torch.long, device=device), ] for _ in range(params.num_batches) @@ -87,10 +87,10 @@ def run_memnn(warmup=2, benchmark=18, jit=False, cuda=False): if __name__ == '__main__': parser = argparse.ArgumentParser(description="PyTorch memnn bench") - parser.add_argument('--warmup', type=int, default=2, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=10, help="Benchmark iterations") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler") - parser.add_argument('--cuda', action='store_true', help="use cuda") + parser.add_argument('--warmup', type=int, default=2, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=10, help="Benchmark iterations") + parser.add_argument('--jit', action='store_true', help="Use JIT compiler") + parser.add_argument('--cuda', action='store_true', help="use cuda") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/mlstm.py b/rnns/benchmarks/mlstm.py index 2900afb8cc..e17acc0413 100644 --- a/rnns/benchmarks/mlstm.py +++ b/rnns/benchmarks/mlstm.py @@ -52,12 +52,12 @@ def run_mlstm(cpu=0, gpu=0, batch_size=1, input_size=205, hidden_size=1900, embe device = torch.device(gpu) input = torch.randn(seq_len, batch_size, input_size, requires_grad=requires_grad, device=device) - hx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device) - cx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device) - w_xm = torch.randn(embed_size, input_size, requires_grad=requires_grad, device=device) - w_hm = torch.randn(embed_size, hidden_size, requires_grad=requires_grad, device=device) - w_ih = torch.randn(4 * hidden_size, input_size, requires_grad=requires_grad, device=device) - w_mh = torch.randn(4 * hidden_size, embed_size, requires_grad=requires_grad, device=device) + hx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device) + cx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device) + w_xm = torch.randn(embed_size, input_size, requires_grad=requires_grad, device=device) + w_hm = torch.randn(embed_size, hidden_size, requires_grad=requires_grad, device=device) + w_ih = torch.randn(4 * 
hidden_size, input_size, requires_grad=requires_grad, device=device) + w_mh = torch.randn(4 * hidden_size, embed_size, requires_grad=requires_grad, device=device) params = [input, hx, cx, w_xm, w_hm, w_ih, w_mh] if jit: @@ -81,19 +81,20 @@ def run_mlstm(cpu=0, gpu=0, batch_size=1, input_size=205, hidden_size=1900, embe if __name__ == "__main__": parser = argparse.ArgumentParser(description="PyTorch mLSTM benchmark.") - parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") - parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") - parser.add_argument('--batch-size', type=int, default=1, help="Batch size") - parser.add_argument('--input-size', type=int, default=205, help="Input size") - parser.add_argument('--hidden-size', type=int, default=1900, help="Hidden size") - parser.add_argument('--embed-size', type=int, default=None, help="Embed size") - parser.add_argument('--seq-len', type=int, default=20, help="Sequence length") - parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") - parser.add_argument('--autograd', action='store_true', help="Use autograd") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler (implies --autograd)") - parser.add_argument('--backward', action='store_true', help="benchmark forward + backward (implies --autograd)") - parser.add_argument('--skip-cpu-governor-check', action='store_true', help="Skip checking whether CPU governor is set to `performance`") + parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") + parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") + parser.add_argument('--batch-size', type=int, default=1, help="Batch size") + parser.add_argument('--input-size', type=int, default=205, help="Input size") + parser.add_argument('--hidden-size', type=int, default=1900, help="Hidden size") + parser.add_argument('--embed-size', type=int, default=None, help="Embed size") + parser.add_argument('--seq-len', type=int, default=20, help="Sequence length") + parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") + parser.add_argument('--autograd', action='store_true', help="Use autograd") + parser.add_argument('--jit', action='store_true', help="Use JIT compiler (implies --autograd)") + parser.add_argument('--backward', action='store_true', help="benchmark forward + backward (implies --autograd)") + parser.add_argument('--skip-cpu-governor-check', action='store_true', + help="Skip checking whether CPU governor is set to `performance`") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/models/bnlstm.py b/rnns/benchmarks/models/bnlstm.py index f577fd4d3f..6ffb650223 100644 --- a/rnns/benchmarks/models/bnlstm.py +++ b/rnns/benchmarks/models/bnlstm.py @@ -127,7 +127,7 @@ def forward(self, input_, hx): wi = torch.mm(input_, self.weight_ih) f, i, o, g = torch.split(wh_b + wi, self.hidden_size, dim=1) - c_1 = torch.sigmoid(f)*c_0 + torch.sigmoid(i)*torch.tanh(g) + c_1 = torch.sigmoid(f) * c_0 + torch.sigmoid(i) * torch.tanh(g) h_1 = torch.sigmoid(o) * torch.tanh(c_1) return h_1, c_1 @@ -140,7 +140,7 @@ def __repr__(self): def bnlstm_helper(c_0, bn_wh, bn_wi, bias_batch): f, i, o, g = torch.chunk(bn_wh + bn_wi + bias_batch, chunks=4, dim=1) - c_1 = torch.sigmoid(f)*c_0 + torch.sigmoid(i)*torch.tanh(g) + c_1 = torch.sigmoid(f) 
* c_0 + torch.sigmoid(i) * torch.tanh(g) return c_1, o @@ -230,7 +230,7 @@ def forward(self, input_, hx, time): f, i, o, g = torch.split(bn_wh + bn_wi + bias_batch, self.hidden_size, dim=1) - c_1 = torch.sigmoid(f)*c_0 + torch.sigmoid(i)*torch.tanh(g) + c_1 = torch.sigmoid(f) * c_0 + torch.sigmoid(i) * torch.tanh(g) h_1 = torch.sigmoid(o) * torch.tanh(self.bn_c(c_1, time=time)) return h_1, c_1 @@ -277,8 +277,8 @@ def _forward_rnn(cell, input_, length, hx): else: h_next, c_next = cell(input_=input_[time], hx=hx) mask = (time < length).float().unsqueeze(1).expand_as(h_next) - h_next = h_next*mask + hx[0]*(1 - mask) - c_next = c_next*mask + hx[1]*(1 - mask) + h_next = h_next * mask + hx[0] * (1 - mask) + c_next = c_next * mask + hx[1] * (1 - mask) hx_next = (h_next, c_next) output.append(h_next) hx = hx_next diff --git a/rnns/benchmarks/models/memnn.py b/rnns/benchmarks/models/memnn.py index a2d5e28055..c8ab1afe66 100644 --- a/rnns/benchmarks/models/memnn.py +++ b/rnns/benchmarks/models/memnn.py @@ -24,13 +24,13 @@ def __init__(self, opt, num_features): self.extra_features_slots = 0 if opt['time_features']: self.time_features = torch.LongTensor(range(num_features, - num_features + self.num_time_features)) + num_features + self.num_time_features)) num_features += self.num_time_features self.extra_features_slots += 1 def embedding(): return Embed(num_features, opt['embedding_size'], - position_encoding=opt['position_encoding'], padding_idx=0) + position_encoding=opt['position_encoding'], padding_idx=0) self.query_embedder = embedding() self.answer_embedder = embedding() @@ -83,7 +83,7 @@ def forward(self, memories, queries, memory_lengths, query_lengths): for _ in range(self.opt['hops']): query_embeddings = self.memory_hop(query_embeddings, - in_memory_embeddings, out_memory_embeddings, attention_mask) + in_memory_embeddings, out_memory_embeddings, attention_mask) return query_embeddings @@ -106,7 +106,7 @@ def forward(self, lengths, indices): for i, row in enumerate(lengths_mat): for j, length in enumerate(row): if length > 0: - input[i, j, :length] = indices[offset:offset+length] + input[i, j, :length] = indices[offset:offset + length] offset += length for i, row in enumerate(lengths_mat): @@ -137,11 +137,11 @@ def position_matrix(J, d): # for k in range(1, d+1): # for j in range(1, J+1): # m[j-1, k-1] = (1 - j/J) - (k/d) * (1 - 2 * j/J) - k = torch.arange(d+1, dtype=torch.float)[1:].unsqueeze(0).expand(J, d) - j = torch.arange(J+1, dtype=torch.float)[1:].unsqueeze(1).expand(J, d) + k = torch.arange(d + 1, dtype=torch.float)[1:].unsqueeze(0).expand(J, d) + j = torch.arange(J + 1, dtype=torch.float)[1:].unsqueeze(1).expand(J, d) J = float(J) d = float(d) - out = (1. - j/J) - (k/d) * (1. - 2. * j/J) + out = (1. - j / J) - (k / d) * (1. - 2. * j / J) return out @staticmethod diff --git a/rnns/benchmarks/models/mlstm.py b/rnns/benchmarks/models/mlstm.py index 927a2844a6..989425cea4 100644 --- a/rnns/benchmarks/models/mlstm.py +++ b/rnns/benchmarks/models/mlstm.py @@ -7,6 +7,8 @@ # This is slightly different to the most commonly used LSTM variant, where the output gate is # applied after the hyperbolic tangent. 
+ + def KrauseLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): # Terminology matchup: # - This implementation uses the trick of having all gates concatenated @@ -29,6 +31,7 @@ def KrauseLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): return hy, cy + def MultiplicativeLSTMCell(input, hidden, w_xm, w_hm, w_ih, w_mh, b_xm=None, b_hm=None, b_ih=None, b_mh=None): # w_ih holds W_hx, W_ix, W_ox, W_fx # w_mh holds W_hm, W_im, W_om, W_fm diff --git a/rnns/benchmarks/nlp.py b/rnns/benchmarks/nlp.py index 96069163f1..6b7da1fbd9 100644 --- a/rnns/benchmarks/nlp.py +++ b/rnns/benchmarks/nlp.py @@ -16,15 +16,15 @@ class POSTagger(Benchmark): 18, 18, 31, 26, 5, 29, 16, 19] default_params = dict( - embedding_size = 50, - rnn_size = 51, - hidden_size = 52, - action_embedding_size = 5, - num_input_tokens = 32, - num_labels = 32, - minibatch_size = 5, - preprocess_minibatch = True, - cuda = False) + embedding_size=50, + rnn_size=51, + hidden_size=52, + action_embedding_size=5, + num_input_tokens=32, + num_labels=32, + minibatch_size=5, + preprocess_minibatch=True, + cuda=False) params = make_params(preprocess_minibatch=over(True, False)) def prepare(self, p): @@ -40,10 +40,14 @@ def prepare(self, p): for module in [self.embed_word, self.gru, self.embed_action, self.combine_arh, self.policy, self.loss_fn]: module.cuda() self.LongTensor = torch.cuda.LongTensor - cast = lambda t: t.cuda() + + def cast(t): + return t.cuda() else: self.LongTensor = torch.LongTensor - cast = lambda t: t + + def cast(t): + return t self.cast = cast self.initial_h = Variable(cast(torch.zeros(1, p.hidden_size)), requires_grad=True) diff --git a/rnns/benchmarks/qrnn.py b/rnns/benchmarks/qrnn.py index a079e5d997..1f984df1bb 100644 --- a/rnns/benchmarks/qrnn.py +++ b/rnns/benchmarks/qrnn.py @@ -49,16 +49,16 @@ def run_qrnn(batch_size=20, input_size=128, seq_len=20, if __name__ == "__main__": parser = argparse.ArgumentParser(description="PyTorch qrnn benchmark.") - parser.add_argument('--batch-size', type=int, default=20, help="Batch size") - parser.add_argument('--input-size', type=int, default=128, help="Input size") - parser.add_argument('--hidden-size', type=int, default=256, help="Hidden size") - parser.add_argument('--num-layers', type=int, default=10, help="Hidden size") - parser.add_argument('--seq-len', type=int, default=20, help="Sequence length") - parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") - parser.add_argument('--cuda', action='store_true', help="Use cuda") - parser.add_argument('--use-kernel', action='store_true', help="Use fused cell") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler") + parser.add_argument('--batch-size', type=int, default=20, help="Batch size") + parser.add_argument('--input-size', type=int, default=128, help="Input size") + parser.add_argument('--hidden-size', type=int, default=256, help="Hidden size") + parser.add_argument('--num-layers', type=int, default=10, help="Hidden size") + parser.add_argument('--seq-len', type=int, default=20, help="Sequence length") + parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") + parser.add_argument('--cuda', action='store_true', help="Use cuda") + parser.add_argument('--use-kernel', action='store_true', help="Use fused cell") + parser.add_argument('--jit', action='store_true', 
help="Use JIT compiler") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/rnn.py b/rnns/benchmarks/rnn.py index 4e3c7995c4..82e81a4d15 100644 --- a/rnns/benchmarks/rnn.py +++ b/rnns/benchmarks/rnn.py @@ -5,6 +5,7 @@ # This file is not in use + class WLM(Benchmark): default_params = dict(rnn_type='LSTM', num_tokens=10000, embedding_size=200, hidden_size=200, num_layers=2, batch_size=20, bptt=35, @@ -17,7 +18,8 @@ def get_rnn(): return getattr(nn, p.rnn_type)(p.embedding_size, p.hidden_size, p.num_layers, dropout=p.dropout) else: nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[p.rnn_type] - return nn.RNN(p.embedding_size, p.hidden_size, p.num_layers, nonlinearity=nonlinearity, dropout=p.dropout) + return nn.RNN(p.embedding_size, p.hidden_size, p.num_layers, nonlinearity=nonlinearity, + dropout=p.dropout) class Model(nn.Module): def __init__(self): @@ -54,5 +56,3 @@ def time_word_language_model_example(self, p): total_loss += loss.data # CUDA sync point if p.cuda: torch.cuda.synchronize() - - diff --git a/rnns/benchmarks/sequence_labeler.py b/rnns/benchmarks/sequence_labeler.py index 0e81ae1015..912bd8f76b 100644 --- a/rnns/benchmarks/sequence_labeler.py +++ b/rnns/benchmarks/sequence_labeler.py @@ -10,7 +10,7 @@ - most recent word - the previous action (aka predicted label). - the previous hidden state - + Can it be faster?!?!?!?!?!? (Adapted from https://gist.github.com/hal3/8c170c4400576eb8d0a8bd94ab231232.) @@ -35,12 +35,14 @@ # Assuming this script is being called from the benchmark/rnns dir wsj_default_path = './wsj.pkl' + def reseed(seed=90210): random.seed(seed) torch.manual_seed(seed) reseed() + @torch.jit.script def gru_cell(input_, hidden, w_hh, b_hh): gi = input_ @@ -119,11 +121,13 @@ def __init__(self, tokens, labels, n_labels): self.labels = labels self.n_labels = n_labels + def minibatch(data, minibatch_size, reshuffle): if reshuffle: random.shuffle(data) for n in range(0, len(data), minibatch_size): - yield data[n:n+minibatch_size] + yield data[n:n + minibatch_size] + def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): jit_tag = '_jit' if jit else '' @@ -155,7 +159,7 @@ def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): initial_h_tensor = torch.Tensor(1, d_hid) initial_h_tensor.zero_() initial_h = Parameter(initial_h_tensor) - + initial_actemb_tensor = torch.Tensor(1, d_actemb) initial_actemb_tensor.zero_() initial_actemb = Parameter(initial_actemb_tensor) @@ -178,7 +182,7 @@ def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): total_loss = 0 prof = None with iter_timer: - #with torch.autograd.profiler.profile() as prof: + # with torch.autograd.profiler.profile() as prof: for batch in minibatch(data, minibatch_size, True): optimizer.zero_grad() loss = 0 @@ -197,11 +201,11 @@ def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): all_rnn_out = all_rnn_out.cpu() else: all_rnn_out, _ = gru(all_e) - + for ex in batch: N = len(ex.tokens) if preprocess_minibatch: - rnn_out = all_rnn_out[0,:,:].view(-1, 1, 2 * d_rnn) + rnn_out = all_rnn_out[0, :, :].view(-1, 1, 2 * d_rnn) else: e = embed_word(Variable(torch.LongTensor(ex.tokens), requires_grad=False)).view(N, 1, -1) [rnn_out, _] = gru(e) @@ -238,7 +242,7 @@ def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False): print(prof.key_averages()) print(total_loss) return iter_timer - + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--epochs', type=int, default=10, 
diff --git a/rnns/benchmarks/sru.py b/rnns/benchmarks/sru.py index a5147e51df..cd7dc6566c 100644 --- a/rnns/benchmarks/sru.py +++ b/rnns/benchmarks/sru.py @@ -1,4 +1,4 @@ -#from builtins import bytes +# from builtins import bytes import time import numpy as np @@ -10,7 +10,7 @@ from collections import namedtuple -tmp_ = torch.rand(1,1).cuda() +tmp_ = torch.rand(1, 1).cuda() SRU_CODE = """ extern "C" { @@ -336,6 +336,7 @@ Stream = namedtuple('Stream', ['ptr']) SRU_STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream) + class SRU_Compute(Function): def __init__(self, activation_type, d_out, bidirectional=False): @@ -350,13 +351,13 @@ def forward(self, u, x, bias, init=None, mask_h=None): batch = x.size(-2) d = self.d_out k = u.size(-1) // d - k_ = k//2 if self.bidirectional else k - ncols = batch*d*bidir + k_ = k // 2 if self.bidirectional else k + ncols = batch * d * bidir thread_per_block = min(512, ncols) - num_block = (ncols-1)//thread_per_block+1 + num_block = (ncols - 1) // thread_per_block + 1 init_ = x.new(ncols).zero_() if init is None else init - size = (length, batch, d*bidir) if x.dim() == 3 else (batch, d*bidir) + size = (length, batch, d * bidir) if x.dim() == 3 else (batch, d * bidir) c = x.new(*size) h = x.new(*size) FUNC = SRU_FWD_FUNC if not self.bidirectional else SRU_BiFWD_FUNC @@ -373,7 +374,7 @@ def forward(self, u, x, bias, init=None, mask_h=None): h.data_ptr(), c.data_ptr(), self.activation_type], - block = (thread_per_block,1,1), grid = (num_block,1,1), + block=(thread_per_block, 1, 1), grid=(num_block, 1, 1), stream=SRU_STREAM ) @@ -382,7 +383,7 @@ def forward(self, u, x, bias, init=None, mask_h=None): if x.dim() == 2: last_hidden = c elif self.bidirectional: - last_hidden = torch.cat((c[-1,:,:d], c[0,:,d:]), dim=1) + last_hidden = torch.cat((c[-1, :, :d], c[0, :, d:]), dim=1) else: last_hidden = c[-1] return h, last_hidden @@ -395,19 +396,19 @@ def backward(self, grad_h, grad_last): batch = x.size(-2) d = self.d_out k = u.size(-1) // d - k_ = k//2 if self.bidirectional else k - ncols = batch*d*bidir + k_ = k // 2 if self.bidirectional else k + ncols = batch * d * bidir thread_per_block = min(512, ncols) - num_block = (ncols-1)//thread_per_block+1 + num_block = (ncols - 1) // thread_per_block + 1 init_ = x.new(ncols).zero_() if init is None else init grad_u = u.new(*u.size()) - grad_bias = x.new(2, batch, d*bidir) - grad_init = x.new(batch, d*bidir) + grad_bias = x.new(2, batch, d * bidir) + grad_init = x.new(batch, d * bidir) # For DEBUG - #size = (length, batch, x.size(-1)) if x.dim() == 3 else (batch, x.size(-1)) - #grad_x = x.new(*x.size()) if k_ == 3 else x.new(*size).zero_() + # size = (length, batch, x.size(-1)) if x.dim() == 3 else (batch, x.size(-1)) + # grad_x = x.new(*x.size()) if k_ == 3 else x.new(*size).zero_() # Normal use grad_x = x.new(*x.size()) if k_ == 3 else None @@ -431,7 +432,7 @@ def backward(self, grad_h, grad_last): grad_bias.data_ptr(), grad_init.data_ptr(), self.activation_type], - block = (thread_per_block,1,1), grid = (num_block,1,1), + block=(thread_per_block, 1, 1), grid=(num_block, 1, 1), stream=SRU_STREAM ) return grad_u, grad_x, grad_bias.sum(1).view(-1), grad_init, None @@ -444,7 +445,7 @@ def forward(u, x, bias, init=None, mask_h=None): batch = x.size(-2) d = d_out k = u.size(-1) // d - k_ = k//2 if bidirectional else k + k_ = k // 2 if bidirectional else k u = u.view(length, batch, d, k_) @@ -458,15 +459,15 @@ def forward(u, x, bias, init=None, mask_h=None): u0i, u1i, u2i = u_[0][i], u_[1][i], u_[2][i] g1 = 
torch.sigmoid(u1i + bias1) g2 = torch.sigmoid(u2i + bias2) - cur = (cur - u0i)*g1 + u0i + cur = (cur - u0i) * g1 + u0i if activation_type == 1: val = torch.tanh(cur) elif activation_type == 2: val = torch.relu(cur) if mask_h is not None: - val = val*mask_h + val = val * mask_h xi = x_[i] - h.append((val - xi)*g2 + xi) + h.append((val - xi) * g2 + xi) if bidirectional: assert False @@ -490,15 +491,15 @@ def __init__(self, n_in, n_out, dropout=0, rnn_dropout=0, self.bidirectional = bidirectional self.activation_type = 2 if use_relu else (1 if use_tanh else 0) self.use_kernel = use_kernel - out_size = n_out*2 if bidirectional else n_out + out_size = n_out * 2 if bidirectional else n_out k = 4 if n_in != out_size else 3 - self.size_per_dir = n_out*k + self.size_per_dir = n_out * k self.weight = nn.Parameter(torch.Tensor( n_in, - self.size_per_dir*2 if bidirectional else self.size_per_dir + self.size_per_dir * 2 if bidirectional else self.size_per_dir )) self.bias = nn.Parameter(torch.Tensor( - n_out*4 if bidirectional else n_out*2 + n_out * 4 if bidirectional else n_out * 2 )) self.init_weight() self.jit = jit @@ -510,14 +511,14 @@ def __init__(self, n_in, n_out, dropout=0, rnn_dropout=0, self.sru_jit_traced = False def init_weight(self): - val_range = (3.0/self.n_in)**0.5 + val_range = (3.0 / self.n_in)**0.5 self.weight.data.uniform_(-val_range, val_range) self.bias.data.zero_() def set_bias(self, bias_val=0): n_out = self.n_out if self.bidirectional: - self.bias.data[n_out*2:].zero_().add_(bias_val) + self.bias.data[n_out * 2:].zero_().add_(bias_val) else: self.bias.data[n_out:].zero_().add_(bias_val) @@ -527,10 +528,10 @@ def forward(self, input, c0=None): batch = input.size(-2) if c0 is None: c0 = Variable(input.data.new( - batch, n_out if not self.bidirectional else n_out*2 + batch, n_out if not self.bidirectional else n_out * 2 ).zero_()) - if self.training and (self.rnn_dropout>0): + if self.training and (self.rnn_dropout > 0): mask = self.get_dropout_mask_((batch, n_in), self.rnn_dropout) x = input * mask.expand_as(input) else: @@ -539,9 +540,9 @@ def forward(self, input, c0=None): x_2d = x if x.dim() == 2 else x.contiguous().view(-1, n_in) u = x_2d.mm(self.weight) - if self.training and (self.dropout>0): + if self.training and (self.dropout > 0): bidir = 2 if self.bidirectional else 1 - mask_h = self.get_dropout_mask_((batch, n_out*bidir), self.dropout) + mask_h = self.get_dropout_mask_((batch, n_out * bidir), self.dropout) if self.use_kernel: h, c = SRU_Compute(self.activation_type, n_out, self.bidirectional)(u, input, self.bias, c0, mask_h) return h, c @@ -556,12 +557,13 @@ def forward(self, input, c0=None): h, c = self.sru_jit(u, input, self.bias, c0, mask_h) return h, c - h, c = SRU_Compute_No_Kernel(self.activation_type, n_out, self.bidirectional)(u, input, self.bias, c0, mask_h) + h, c = SRU_Compute_No_Kernel(self.activation_type, n_out, self.bidirectional)( + u, input, self.bias, c0, mask_h) else: if self.use_kernel: h, c = SRU_Compute(self.activation_type, n_out, self.bidirectional)(u, input, self.bias, c0) return h, c - + if self.jit: if not self.sru_jit_traced: print("Tracing sru cell without dropout") @@ -577,13 +579,13 @@ def forward(self, input, c0=None): def get_dropout_mask_(self, size, p): w = self.weight.data - return Variable(w.new(*size).bernoulli_(1-p).div_(1-p)) + return Variable(w.new(*size).bernoulli_(1 - p).div_(1 - p)) class SRU(nn.Module): def __init__(self, input_size, hidden_size, num_layers=2, dropout=0, rnn_dropout=0, - bidirectional=False, use_tanh=1, 
use_relu=0, use_kernel=True, - jit=False): + bidirectional=False, use_tanh=1, use_relu=0, use_kernel=True, + jit=False): super(SRU, self).__init__() self.n_in = input_size self.n_out = hidden_size @@ -593,18 +595,18 @@ def __init__(self, input_size, hidden_size, num_layers=2, dropout=0, rnn_dropout self.rnn_lst = nn.ModuleList() self.bidirectional = bidirectional self.use_kernel = use_kernel - self.out_size = hidden_size*2 if bidirectional else hidden_size + self.out_size = hidden_size * 2 if bidirectional else hidden_size for i in range(num_layers): l = SRUCell( - n_in = self.n_in if i==0 else self.out_size, - n_out = self.n_out, - dropout = dropout if i+1 != num_layers else 0, - rnn_dropout = rnn_dropout, - bidirectional = bidirectional, - use_tanh = use_tanh, - use_relu = use_relu, - use_kernel = use_kernel, + n_in=self.n_in if i == 0 else self.out_size, + n_out=self.n_out, + dropout=dropout if i + 1 != num_layers else 0, + rnn_dropout=rnn_dropout, + bidirectional=bidirectional, + use_tanh=use_tanh, + use_relu=use_relu, + use_kernel=use_kernel, jit=jit, ) self.rnn_lst.append(l) @@ -614,16 +616,16 @@ def set_bias(self, bias_val=0): l.set_bias(bias_val) def forward(self, input, c0=None, return_hidden=True): - assert input.dim() == 3 # (len, batch, n_in) + assert input.dim() == 3 # (len, batch, n_in) dir_ = 2 if self.bidirectional else 1 if c0 is None: zeros = Variable(input.data.new( - input.size(1), self.n_out*dir_ + input.size(1), self.n_out * dir_ ).zero_()) - c0 = [ zeros for i in range(self.depth) ] + c0 = [zeros for i in range(self.depth)] else: assert c0.dim() == 3 # (depth, batch, n_out*dir_) - c0 = [ x.squeeze(0) for x in c0.chunk(self.depth, 0) ] + c0 = [x.squeeze(0) for x in c0.chunk(self.depth, 0)] prevx = input lstc = [] diff --git a/rnns/benchmarks/sru_test.py b/rnns/benchmarks/sru_test.py index 6ef78c71b4..a8efd5a003 100644 --- a/rnns/benchmarks/sru_test.py +++ b/rnns/benchmarks/sru_test.py @@ -25,15 +25,15 @@ def run_sru(cpu=0, gpu=0, jit=False, use_kernel=False, backward=False, input_size, hidden_size = 128, 128 rnn = SRU(input_size, hidden_size, - num_layers = 2, # number of stacking RNN layers - dropout = 0.00001, # dropout applied between RNN layers - rnn_dropout = 0.0001, # variational dropout applied on linear transformation - use_tanh = 1, # use tanh? - use_relu = 0, # use ReLU? - bidirectional = False, # bidirectional RNN ? - use_kernel=use_kernel, - jit=jit, - ) + num_layers=2, # number of stacking RNN layers + dropout=0.00001, # dropout applied between RNN layers + rnn_dropout=0.0001, # variational dropout applied on linear transformation + use_tanh=1, # use tanh? + use_relu=0, # use ReLU? + bidirectional=False, # bidirectional RNN ? 
+ use_kernel=use_kernel, + jit=jit, + ) rnn.cuda() kernel_tag = '_kernel' if use_kernel else '' @@ -55,13 +55,13 @@ def run_sru(cpu=0, gpu=0, jit=False, use_kernel=False, backward=False, if __name__ == '__main__': parser = argparse.ArgumentParser(description="PyTorch mLSTM benchmark.") - parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") - parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") - parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") - parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") - parser.add_argument('--jit', action='store_true', help="Use JIT compiler") - parser.add_argument('--use-kernel', action='store_true', help="Use specialized kernel") - parser.add_argument('--backward', action='store_true', help="benchmark forward + backward") + parser.add_argument('--cpu', type=int, default=0, help="CPU to run on") + parser.add_argument('--gpu', type=int, default=0, help="GPU to run on") + parser.add_argument('--warmup', type=int, default=10, help="Warmup iterations") + parser.add_argument('--benchmark', type=int, default=20, help="Benchmark iterations") + parser.add_argument('--jit', action='store_true', help="Use JIT compiler") + parser.add_argument('--use-kernel', action='store_true', help="Use specialized kernel") + parser.add_argument('--backward', action='store_true', help="benchmark forward + backward") args = parser.parse_args() pprint.pprint(vars(args)) diff --git a/rnns/benchmarks/torchqrnn/forget_mult.py b/rnns/benchmarks/torchqrnn/forget_mult.py index 0b03e87c5a..79d1b6a2d1 100644 --- a/rnns/benchmarks/torchqrnn/forget_mult.py +++ b/rnns/benchmarks/torchqrnn/forget_mult.py @@ -39,7 +39,8 @@ } } extern "C" -__global__ void bwd_recurrent_forget_mult(const float *h, const float *f, const float *x, const float *gh, float *gf, float *gx, float *ghinit, int SEQ, int BATCH, int HIDDEN) +__global__ void bwd_recurrent_forget_mult(const float *h, const float *f, const float *x, const float *gh, float *gf, + float *gx, float *ghinit, int SEQ, int BATCH, int HIDDEN) { /* Note: h is assumed to be one timestep longer than f, x, gf, gx, or gh where dst[0] = h_{-1} @@ -172,7 +173,8 @@ class ForgetMult(torch.nn.Module): - X (seq_len, batch, input_size): tensor containing the features of the input sequence. - F (seq_len, batch, input_size): tensor containing the forget gate values, assumed in range [0, 1]. - hidden_init (batch, input_size): tensor containing the initial hidden state for the recurrence (h_{t-1}). - - use_kernel: If True, use the fast element-wise CUDA kernel for recurrence. If False, uses naive for loop. Default: True. + - use_kernel: If True, use the fast element-wise CUDA kernel for recurrence. + If False, uses naive for loop. Default: True. 
""" def __init__(self, use_kernel=False, jit=False): @@ -217,15 +219,15 @@ def test_accuracy(): seq, batch, hidden = 35, 20, 650 # Larger input (batch * seq * hidden) results in excessive memory for gradient check seq, batch, hidden = 3, 7, 19 - a = Variable(torch.rand(seq, batch, hidden).cuda(), requires_grad=True) + a = Variable(torch.rand(seq, batch, hidden).cuda(), requires_grad=True) forget = Variable(torch.rand(seq, batch, hidden).cuda(), requires_grad=True) last_h = Variable(torch.rand(batch, hidden).cuda(), requires_grad=True) - #seq, batch, hidden = 4, 1, 1 - #a = Variable(torch.Tensor([0.75, 0.5, 0.9, 0.8]).view(seq, batch, hidden).cuda(), requires_grad=True) - #forget = Variable(torch.Tensor([0.25, 0.25, 0.5, 0.4]).view(seq, batch, hidden).cuda(), requires_grad=True) - #last_h = Variable(torch.Tensor([0]).view(batch, hidden).cuda(), requires_grad=True) - #print(forget, a, last_h) + # seq, batch, hidden = 4, 1, 1 + # a = Variable(torch.Tensor([0.75, 0.5, 0.9, 0.8]).view(seq, batch, hidden).cuda(), requires_grad=True) + # forget = Variable(torch.Tensor([0.25, 0.25, 0.5, 0.4]).view(seq, batch, hidden).cuda(), requires_grad=True) + # last_h = Variable(torch.Tensor([0]).view(batch, hidden).cuda(), requires_grad=True) + # print(forget, a, last_h) print('CUDA forget mult') print('=-=-' * 5) diff --git a/rnns/benchmarks/torchqrnn/qrnn.py b/rnns/benchmarks/torchqrnn/qrnn.py index 0f61a6d289..779327ed2a 100644 --- a/rnns/benchmarks/torchqrnn/qrnn.py +++ b/rnns/benchmarks/torchqrnn/qrnn.py @@ -9,15 +9,21 @@ # code copied from https://github.com/salesforce/pytorch-qrnn/tree/master/torchqrnn + class QRNNLayer(nn.Module): r"""Applies a single layer Quasi-Recurrent Neural Network (QRNN) to an input sequence. Args: input_size: The number of expected features in the input x. hidden_size: The number of features in the hidden state h. If not specified, the input size is used. - save_prev_x: Whether to store previous inputs for use in future convolutional windows (i.e. for a continuing sequence such as in language modeling). If true, you must call reset to remove cached previous values of x. Default: False. - window: Defines the size of the convolutional window (how many previous tokens to look when computing the QRNN values). Supports 1 and 2. Default: 1. - zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) to the hidden state updates. Default: 0. - output_gate: If True, performs QRNN-fo (applying an output gate to the output). If False, performs QRNN-f. Default: True. + save_prev_x: Whether to store previous inputs for use in future convolutional windows + (i.e. for a continuing sequence such as in language modeling). If true, you must call + reset to remove cached previous values of x. Default: False. + window: Defines the size of the convolutional window (how many previous tokens to look when computing + the QRNN values). Supports 1 and 2. Default: 1. + zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) + to the hidden state updates. Default: 0. + output_gate: If True, performs QRNN-fo (applying an output gate to the output). + If False, performs QRNN-f. Default: True. use_kernel: If True, uses fast custom CUDA kernel. If False, uses naive for loop. Default: True. Inputs: X, hidden - X (seq_len, batch, input_size): tensor containing the features of the input sequence. 
@@ -27,10 +33,12 @@ class QRNNLayer(nn.Module): - h_n (batch, hidden_size): tensor containing the hidden state for t=seq_len """ - def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, window=1, output_gate=True, use_kernel=True, jit=False): + def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, window=1, + output_gate=True, use_kernel=True, jit=False): super(QRNNLayer, self).__init__() - assert window in [1, 2], "This QRNN implementation currently only handles convolutional window of size 1 or size 2" + assert window in [ + 1, 2], "This QRNN implementation currently only handles convolutional window of size 1 or size 2" self.window = window self.input_size = input_size self.hidden_size = hidden_size if hidden_size else input_size @@ -43,7 +51,8 @@ def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, w assert not (use_kernel and jit) # One large matmul with concat is faster than N small matmuls and no concat - self.linear = nn.Linear(self.window * self.input_size, 3 * self.hidden_size if self.output_gate else 2 * self.hidden_size) + self.linear = nn.Linear(self.window * self.input_size, 3 * + self.hidden_size if self.output_gate else 2 * self.hidden_size) self.forget_mult = ForgetMult(use_kernel, jit) @@ -120,10 +129,15 @@ class QRNN(torch.nn.Module): hidden_size: The number of features in the hidden state h. If not specified, the input size is used. num_layers: The number of QRNN layers to produce. layers: List of preconstructed QRNN layers to use for the QRNN module (optional). - save_prev_x: Whether to store previous inputs for use in future convolutional windows (i.e. for a continuing sequence such as in language modeling). If true, you must call reset to remove cached previous values of x. Default: False. - window: Defines the size of the convolutional window (how many previous tokens to look when computing the QRNN values). Supports 1 and 2. Default: 1. - zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) to the hidden state updates. Default: 0. - output_gate: If True, performs QRNN-fo (applying an output gate to the output). If False, performs QRNN-f. Default: True. + save_prev_x: Whether to store previous inputs for use in future convolutional windows + (i.e. for a continuing sequence such as in language modeling). If true, you must call + reset to remove cached previous values of x. Default: False. + window: Defines the size of the convolutional window (how many previous tokens to look when computing + the QRNN values). Supports 1 and 2. Default: 1. + zoneout: Whether to apply zoneout (i.e. failing to update elements in the hidden state) + to the hidden state updates. Default: 0. + output_gate: If True, performs QRNN-fo (applying an output gate to the output). + If False, performs QRNN-f. Default: True. use_kernel: If True, uses fast custom CUDA kernel. If False, uses naive for loop. Default: True. Inputs: X, hidden - X (seq_len, batch, input_size): tensor containing the features of the input sequence. 
@@ -136,13 +150,14 @@ class QRNN(torch.nn.Module):

     def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=False,
                  dropout=0, bidirectional=False, layers=None, **kwargs):
-        assert bidirectional == False, 'Bidirectional QRNN is not yet supported'
-        assert batch_first == False, 'Batch first mode is not yet supported'
-        assert bias == True, 'Removing underlying bias is not yet supported'
+        assert bidirectional is False, 'Bidirectional QRNN is not yet supported'
+        assert batch_first is False, 'Batch first mode is not yet supported'
+        assert bias is True, 'Removing underlying bias is not yet supported'
         super(QRNN, self).__init__()

-        self.layers = torch.nn.ModuleList(layers if layers else [QRNNLayer(input_size if l == 0 else hidden_size, hidden_size, **kwargs) for l in range(num_layers)])
+        self.layers = torch.nn.ModuleList(layers if layers else [QRNNLayer(
+            input_size if l == 0 else hidden_size, hidden_size, **kwargs) for l in range(num_layers)])

         self.input_size = input_size
         self.hidden_size = hidden_size

@@ -201,6 +216,6 @@ def forward(self, input, hidden=None):
     assert diff < 1e-5, 'CUDA and non-CUDA QRNN layers return different results'

     from torch.autograd import gradcheck
-    inputs = [X,]
+    inputs = [X, ]
     test = gradcheck(QRNNLayer(hidden_size, hidden_size).cuda(), inputs)
     print(test)
diff --git a/rnns/fastrnns/bench.py b/rnns/fastrnns/bench.py
index 9af6c14aa0..d691906124 100644
--- a/rnns/fastrnns/bench.py
+++ b/rnns/fastrnns/bench.py
@@ -154,7 +154,7 @@ def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
     vlrnns = ['vl_cudnn', 'vl_jit', 'vl_py']
     cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
     if args.print_json:
-        print_stderr = lambda *args, **kwargs: None
+        print_stderr = lambda *args, **kwargs: None  # noqa
     print_stderr(args)

     bench_args = vars(args)
diff --git a/rnns/fastrnns/factory.py b/rnns/fastrnns/factory.py
index b7d6f4fe5c..044eaf513c 100644
--- a/rnns/fastrnns/factory.py
+++ b/rnns/fastrnns/factory.py
@@ -202,7 +202,7 @@ def forward(sequences, hidden):

 def varlen_lstm_factory(cell, script):
     def dynamic_rnn(sequences, hiddens, wih, whh, bih, bhh):
-        # type: (List[Tensor], Tuple[Tensor, Tensor], Tensor, Tensor, Tensor, Tensor) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]
+        # type: (List[Tensor], Tuple[Tensor, Tensor], Tensor, Tensor, Tensor, Tensor) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]  # noqa
         hx, cx = hiddens
         hxs = hx.unbind(1)
         cxs = cx.unbind(1)
@@ -234,7 +234,7 @@ def dynamic_rnn(sequences, hiddens, wih, whh, bih, bhh):


 def varlen_lstm_creator(script=False, **kwargs):
-    sequences, _, hidden, params, _ = varlen_lstm_inputs(
+    sequences, _, hidden, params, _ = varlen_lstm_inputs(
         return_module=False, **kwargs)
     inputs = [sequences, hidden] + params[0]
     return ModelDef(
@@ -307,7 +307,6 @@ def dynamic_rnn(input, hidden, wih, whh, bih, bhh):

     return dynamic_rnn

-
 # premul: we're going to premultiply the inputs & weights
 def lstm_factory_premul(premul_cell, script):
     def dynamic_rnn(input, hidden, wih, whh, bih, bhh):
diff --git a/rnns/fastrnns/scratch.py b/rnns/fastrnns/scratch.py
index 07cc803989..c51d71625f 100644
--- a/rnns/fastrnns/scratch.py
+++ b/rnns/fastrnns/scratch.py
@@ -1,5 +1,6 @@
 import torch

+
 @torch.jit.script
 def fn(x, scale, shift):
     return scale * x / shift
@@ -25,6 +26,7 @@ def recurrent(x, scale, shift):

 import torch

+
 @torch.jit.script
 def recurrent_scaleshift(x, scale, shift):
     y = x
@@ -44,6 +46,6 @@ def recurrent_scaleshift(x, scale, shift):
 import torch
 x = torch.tensor([])
 x.requires_grad = True
-x.mean().backward() # no error triggered
+x.mean().backward()  # no error triggered
 x = x.cuda()
 x.mean().backward()
diff --git a/rnns/runner.py b/rnns/runner.py
index e0aee903c2..2b589e4e71 100644
--- a/rnns/runner.py
+++ b/rnns/runner.py
@@ -6,13 +6,13 @@
 from benchmarks.mlstm import run_mlstm
 from benchmarks.lstm import run_lstm
 from benchmarks.cudnn_lstm import run_cudnn_lstm
-from benchmarks.tensor import run_tensor
+from benchmarks.tensor import run_tensor

 from benchmarks.lstm_variants_test import run_lstm_variant
 from benchmarks.bnlstm import run_bnlstm
 from benchmarks.sru_test import run_sru
 from benchmarks.qrnn import run_qrnn

-from benchmarks.sequence_labeler import test_wsj
+from benchmarks.sequence_labeler import test_wsj
 from benchmarks.sequence_labeler import Example
 from benchmarks.common import AttrDict
@@ -124,7 +124,7 @@ def discover_benchmarks():

 def title(text='title', width=80):
     reserve = len(text) + 2
-    num_lines = int((width - reserve)/2)
+    num_lines = int((width - reserve) / 2)
     lines = '-' * num_lines
     return '{} {} {}'.format(lines, text, lines)

@@ -138,11 +138,11 @@ def summarize(result):
     if gpu_summary.max == 0 and gpu_summary.min == 0:
         use_summary = cpu_summary

-    range_middle = (use_summary.max + use_summary.min)/2
+    range_middle = (use_summary.max + use_summary.min) / 2
     deviation = use_summary.max - range_middle

     return '{2:10.4f} ± {3:8.4f} msec (average {1:10.4f} msec, {4} samples) [{0}]'.format(
-            result.name, use_summary.mean, range_middle, deviation, samples)
+        result.name, use_summary.mean, range_middle, deviation, samples)


 def main():
diff --git a/run.py b/run.py
index df34a3ff4e..bfbeb78549 100644
--- a/run.py
+++ b/run.py
@@ -3,10 +3,9 @@


 def get_docker_run_cmd(image_name):
-    return ["sudo", "docker", "run", "--rm", "--cap-add=SYS_PTRACE",
-           "--security-opt", "seccomp=unconfined", "-v", os.getcwd() + ":/mnt/localdrive",
-           "--cpuset-cpus=0-3", "-t", "--user=jenkins", image_name]
+    return ["sudo", "docker", "run", "--rm", "--cap-add=SYS_PTRACE",
+            "--security-opt", "seccomp=unconfined", "-v", os.getcwd() + ":/mnt/localdrive",
+            "--cpuset-cpus=0-3", "-t", "--user=jenkins", image_name]

 if __name__ == "__main__":
-    call(get_docker_run_cmd("tmp-utcnpjpdbsorhktnnnttixmkvlyxirwl") + ["/bin/bash", "/mnt/localdrive/python/run.sh"])
-
+    call(get_docker_run_cmd("tmp-utcnpjpdbsorhktnnnttixmkvlyxirwl") + ["/bin/bash", "/mnt/localdrive/python/run.sh"])
diff --git a/setup/bench_conf.py b/setup/bench_conf.py
index 58f5ac9c59..48b2b7c1ef 100644
--- a/setup/bench_conf.py
+++ b/setup/bench_conf.py
@@ -8,11 +8,12 @@

 CPUInfo = namedtuple('CPUInfo', ['processor', 'physical_id', 'core_id'])

+
 def get_cpus():
     with open('/proc/cpuinfo', 'r') as f:
         raw_out = f.read()
     relevant_lines = [l for l in raw_out.split('\n')
-                          if 'processor' in l or 'physical id' in l or 'core id' in l]
+                      if 'processor' in l or 'physical id' in l or 'core id' in l]
     assert len(relevant_lines) % 3 == 0

     line_data = [int(l[l.index(':') + 1:].strip()) for l in relevant_lines]
@@ -84,6 +85,7 @@ def remove_shield():
 # CPU Turbo Mode
 ################################################################################

+
 def set_turbo(value):
     with open('/sys/devices/system/cpu/intel_pstate/no_turbo', 'w') as f:
         f.write('0' if value else '1')
@@ -92,6 +94,7 @@
 # Helpers
 ################################################################################

+
 def isolate_bench_subset(cpus):
     bench_cpus = [cpu for cpu in cpus if cpu.physical_id == 0]
     bg_cpus = [cpu for cpu in cpus if cpu.physical_id != 0]
@@ -103,6 +106,7 @@ def isolate_bench_subset(cpus):
 # Setup/Teardown
 ################################################################################

+
 def setup_benchmark_env():
     set_turbo(False)
     all_active_cpus = disable_ht()
@@ -111,11 +115,13 @@ def setup_benchmark_env():
     with open('bench_cpus', 'w') as f:
         f.write(','.join(str(cpu.processor) for cpu in bench_cpus))

+
 def teardown_benchmark_env():
     remove_shield()
     enable_ht()
     set_turbo(True)

+
 def main():
     parser = argparse.ArgumentParser(description='Configure benchmarking environment')
     parser.add_argument('--setup', action='store_true')
diff --git a/timing/python/benchmarks/__init__.py b/timing/python/benchmarks/__init__.py
index 9534424d74..2de923f41b 100644
--- a/timing/python/benchmarks/__init__.py
+++ b/timing/python/benchmarks/__init__.py
@@ -5,4 +5,3 @@
 from benchmarks.cpu_unary_benchmark import CPUUnaryBench
 from benchmarks.cpu_unary_benchmark import NumpyUnaryComparison
 from benchmarks.cuda_lstm_benchmark import CUDALSTMBench
-
diff --git a/timing/python/benchmarks/misc/mobilenet.py b/timing/python/benchmarks/misc/mobilenet.py
index a7d446aa15..cbbafab54e 100644
--- a/timing/python/benchmarks/misc/mobilenet.py
+++ b/timing/python/benchmarks/misc/mobilenet.py
@@ -52,6 +52,7 @@ class MobileNetV2(nn.Module):
         `"Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation" `_paper.
     """

+
     def __init__(self, n_class=1000, input_size=224, width_mult=1.):
         super(MobileNetV2, self).__init__()
         # setting of inverted residual blocks
@@ -82,7 +83,7 @@ def __init__(self, n_class=1000, input_size=224, width_mult=1.):
             input_channel = output_channel
         # building last several layers
         self.features.append(conv_1x1_bn(input_channel, self.last_channel))
-        self.features.append(nn.AvgPool2d(int(input_size/32)))
+        self.features.append(nn.AvgPool2d(int(input_size / 32)))
         # make it nn.Sequential
         self.features = nn.Sequential(*self.features)
@@ -114,4 +115,3 @@ def _initialize_weights(self):
                 n = m.weight.size(1)
                 m.weight.data.normal_(0, 0.01)
                 m.bias.data.zero_()
-
diff --git a/timing/python/framework/maybe_garbage.py b/timing/python/framework/maybe_garbage.py
index 8dfb605f57..72d2973dde 100644
--- a/timing/python/framework/maybe_garbage.py
+++ b/timing/python/framework/maybe_garbage.py
@@ -2,13 +2,13 @@
 # cpu_pin(cpu)

-#class Benchmark(object):
+# class Benchmark(object):
 # #    default_params = []
 # #    params = make_params()
 # #    param_names = ['config']
-#
-#
+#
+#
 # def start_stats(common_name, framework_name, fname, mag, count, tv):
 #     status = ""
 #     status += "tag: {:<15}".format(common_name)
@@ -20,14 +20,14 @@
 #     status += " stride: {:<60}".format(list(map(lambda x: "{:>7}".format(x), list(tv.stride()))))
 #     status += " numel: {:<9}".format(tv.numel())
 #     return status
-#
+#
 # def finish_stats(dtype, dim, elapsed):
 #     status = ""
 #     status += " type: {:<18}".format(dtype)
 #     status += " dim: {:<5}".format(dim)
 #     status += " elapsed: {:8.4f}".format(elapsed)
 #     return status
-#
+#
 # def lambda_benchmark(common_name, types, fun, name, framework_name, cast):
 #     goal_size = 1000
 #     onek = 1000
@@ -57,10 +57,10 @@
 # class over(object):
 #     def __init__(self, *args):
 #         self.values = args
-#
-#
-#
-#
+#
+#
+#
+#
 # def make_params(**kwargs):
 #     keys = list(kwargs.keys())
 #     iterables = [kwargs[k].values if isinstance(kwargs[k], over) else (kwargs[k],) for k in keys]
@@ -73,61 +73,60 @@
 #         for k, v in self.default_params.items():
 #             params.setdefault(k, v)
 #         self.prepare(params)
-#
-#
+#
+#
 # def get_env_pytorch_examples():
 #     pytorch_examples_home = os.environ.get('EXAMPLES_HOME')
 #     if pytorch_examples_home is None:
 #         print('EXAMPLES_HOME not found')
 #         sys.exit()
-#
+#
 #     return pytorch_examples_home
-#
-#
+#
+#
 # def execution(cmd, log_path):
 #     gc.collect()
-#
+#
 #     # logging
 #     log_file = open(log_path, "w+")
 #     log_file.write(cmd)
 #     log_file.write('\n')
-#
+#
 #     exec_command = shlex.split(cmd)
 #     proc = subprocess.Popen(exec_command, stdout=log_file, stderr=subprocess.STDOUT)
 #     proc.wait()
 #     return_code = proc.returncode
 #     log_file.close()
-#
+#
 #     log_file = open(log_path, 'r+')
-#
+#
 #     if return_code == 0:
 #         acc = parse_accuracy(log_file)
 #     else:
 #         acc = ('NA', 'NA')
-#
+#
 #     return acc
-#
-#
+#
+#
 # def parse_accuracy(log_file):
 #     output_data = log_file.readlines()
 #     _, _, prec1, _, prec2 = output_data[-2].split()
 #     return (prec1, prec2)
-#
-#
+#
+#
 # def config_runs(model, no_iter):
 #     iters = [i for i in range(no_iter)]
 #     if model == 'all':
 #         model = model_names
-#
+#
 #     return list(itertools.product(model, iters))
-#
-#
+#
+#
 # def cmd_string(examples_home, model, data_path):
 #     lr = 0.1
 #     if model in ['alexnet', 'vgg11', 'vgg11_bn', 'vgg13_bn',
 #                  'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']:
 #         lr = 0.01
-#
+#
 #     cmd = ' '.join(['python3', examples_home, '-a', model, '--lr', str(lr), data_path])
 #     return cmd
-
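
For readers unfamiliar with the torchqrnn code reformatted above, here is a minimal usage sketch. It is illustrative only and not part of the patch; it assumes the in-repo import path benchmarks.torchqrnn.qrnn is on sys.path and that a CUDA device is available, and the tensor shapes follow the docstrings above.

import torch
from benchmarks.torchqrnn.qrnn import QRNN

seq_len, batch, input_size, hidden_size = 7, 20, 128, 256
X = torch.randn(seq_len, batch, input_size).cuda()

# use_kernel=False selects the naive for-loop ForgetMult path, so no
# custom CUDA kernel build is needed; the reformatted constructor above
# shows that extra kwargs such as use_kernel are forwarded from QRNN to
# each QRNNLayer.
rnn = QRNN(input_size, hidden_size, num_layers=2, use_kernel=False).cuda()

output, h_n = rnn(X)
# output: (seq_len, batch, hidden_size) per the docstring;
# h_n stacks each layer's final hidden state.
print(output.shape, h_n.shape)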