From 31015a298d75fa2bac2580ce7f3de033474051a5 Mon Sep 17 00:00:00 2001
From: sh-mug <52068539+sh-mug@users.noreply.github.com>
Date: Thu, 6 Apr 2023 23:44:22 +0900
Subject: [PATCH] implement tanh activation function

---
 nngen/onnx/__init__.py                        |   1 +
 nngen/onnx/act_func.py                        |   3 +
 nngen/operator/__init__.py                    |   1 +
 nngen/operator/tanh.py                        | 118 ++++++++
 nngen/quantizer/__init__.py                   |   2 +
 nngen/quantizer/tanh.py                       |   8 +
 nngen/verify/tanh.py                          |  51 ++++
 ...st_matrix_conv2d_int16_3x3_stride1_tanh.py |  88 ++++++
 ...st_matrix_conv2d_int32_3x3_stride1_tanh.py |  88 ++++++
 ...est_matrix_conv2d_int8_3x3_stride1_tanh.py |  88 ++++++
 ...nx_matrix_conv2d_tanh_int16_3x3_stride1.py |  67 +++++
 ...nx_matrix_conv2d_tanh_int32_3x3_stride1.py |  67 +++++
 ...nnx_matrix_conv2d_tanh_int8_3x3_stride1.py |  67 +++++
 tests/onnx_matrix_tanh/onnx_matrix_tanh.py    | 284 ++++++++++++++++++
 .../test_onnx_matrix_tanh_int16.py            |  49 +++
 .../test_onnx_matrix_tanh_int32.py            |  49 +++
 .../test_onnx_matrix_tanh_int8.py             |  49 +++
 17 files changed, 1080 insertions(+)
 create mode 100644 nngen/operator/tanh.py
 create mode 100644 nngen/quantizer/tanh.py
 create mode 100644 nngen/verify/tanh.py
 create mode 100644 tests/matrix_conv2d/test_matrix_conv2d_int16_3x3_stride1_tanh.py
 create mode 100644 tests/matrix_conv2d/test_matrix_conv2d_int32_3x3_stride1_tanh.py
 create mode 100644 tests/matrix_conv2d/test_matrix_conv2d_int8_3x3_stride1_tanh.py
 create mode 100644 tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int16_3x3_stride1.py
 create mode 100644 tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int32_3x3_stride1.py
 create mode 100644 tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int8_3x3_stride1.py
 create mode 100644 tests/onnx_matrix_tanh/onnx_matrix_tanh.py
 create mode 100644 tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int16.py
 create mode 100644 tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int32.py
 create mode 100644 tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int8.py

diff --git a/nngen/onnx/__init__.py b/nngen/onnx/__init__.py
index fd870978..cc9c26df 100644
--- a/nngen/onnx/__init__.py
+++ b/nngen/onnx/__init__.py
@@ -53,6 +53,7 @@
     'Relu': act_func.Relu,
     'LeakyRelu': act_func.LeakyRelu,
     'Sigmoid': act_func.Sigmoid,
+    'Tanh': act_func.Tanh,
     'BatchNormalization': batchnormalization.BatchNormalization,
     'Shape': shape.Shape,
     'Reshape': reshape.Reshape,
diff --git a/nngen/onnx/act_func.py b/nngen/onnx/act_func.py
index efefa79e..1ea05b07 100644
--- a/nngen/onnx/act_func.py
+++ b/nngen/onnx/act_func.py
@@ -67,3 +67,6 @@ def LeakyRelu(visitor, node):
 
 def Sigmoid(visitor, node):
     return _act_func(operator.sigmoid, visitor, node)
+
+def Tanh(visitor, node):  # handler registered for ONNX 'Tanh'; same _act_func path as Sigmoid, with operator.tanh
+    return _act_func(operator.tanh, visitor, node)
diff --git a/nngen/operator/__init__.py b/nngen/operator/__init__.py
index e351f588..6a08468c 100644
--- a/nngen/operator/__init__.py
+++ b/nngen/operator/__init__.py
@@ -7,6 +7,7 @@
 from .relu import relu, relu6
 from .leaky_relu import leaky_relu, get_leaky_relu_op, leaky_relu_base
 from .sigmoid import sigmoid
+from .tanh import tanh
 from .matmul import matmul
 from .conv2d import conv2d
 from .log_weight_conv2d import log_weight_conv2d
diff --git a/nngen/operator/tanh.py b/nngen/operator/tanh.py
new file mode 100644
index 00000000..c979eceb
--- /dev/null
+++ b/nngen/operator/tanh.py
@@ -0,0 +1,118 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import functools
+import math
+import numpy as np
+from collections import OrderedDict
+
+import nngen.basic_types as bt
+from nngen.quantizer import util
+
+class tanh(bt._ActFuncOperator):  # tanh activation implemented as a lookup table with saturation
+
+    def __init__(self, features,
+                 lut_addrwidth=8, lut_clip=6.0, range_rate=0.95,
+                 dtype=None, name=None, par=1):
+
+        shape = None
+        if features.dtype is not None and features.dtype.width < 8:
+            lut_addrwidth = features.dtype.width  # shrink the LUT address to the input width
+
+        self.lut_addrwidth = lut_addrwidth
+        self.lut_clip = lut_clip
+        self.range_rate = range_rate
+        bt._ActFuncOperator.__init__(self, features,
+                                     dtype=dtype, shape=shape, name=name, par=par)
+
+    def _get_expected_scale_factor(self):
+        return (2 ** (self.lut_addrwidth - 1)) / self.lut_clip  # LUT steps per 1.0 of real input
+
+    def _get_features_scale_shamt(self):
+        expected_scale_factor = self._get_expected_scale_factor()
+
+        features_scale = np.array([expected_scale_factor / self.args[0].scale_factor])
+        q_features_scale, scale_factor = util.quantize_linear_scale(features_scale, 32)
+        q_features_scale = int(q_features_scale[0])
+        q_features_shamt = round(math.log(scale_factor, 2))
+        return q_features_scale, q_features_shamt  # integer multiplier and right-shift amount
+
+    def get_local_control_param_values(self):
+        q_features_scale, q_features_shamt = self._get_features_scale_shamt()
+        return OrderedDict([('features_scale_cparam', q_features_scale),
+                            ('features_shamt_cparam', q_features_shamt)])
+
+    def get_stream_hash(self):
+        base = bt._ActFuncOperator.get_stream_hash(self)
+        return (base, self.lut_addrwidth, self.lut_clip, self.range_rate)
+
+    def op(self, strm, *args, **kwargs):
+        features_signed = self.args[0].get_signed()
+
+        features_scale = strm.ReinterpretCast(self.features_scale_cparam,
+                                              width=self.features_scale_cparam.width + 1,
+                                              signed=features_signed)
+        mul = strm.Times(args[0], features_scale)
+        mul.width = mul.width + features_scale.width
+
+        features_shamt = strm.ReinterpretCast(self.features_shamt_cparam,
+                                              width=self.features_shamt_cparam.width,
+                                              signed=False)
+        sra = strm.Sra(mul, features_shamt)
+        lut_addr = strm.Slice(sra, self.lut_addrwidth - 1, 0)  # low bits of the rescaled input index the LUT
+
+        out_width = self.dtype.width
+        out_point = self.dtype.point
+        out_signed = self.dtype.signed
+        if out_signed:
+            out_scale = round((2 ** (out_width - 1)) * self.range_rate)
+        else:
+            out_scale = round((2 ** out_width) * self.range_rate)
+
+        def _tanh(x):
+            return int((np.tanh(x) * out_scale).astype(np.int64))
+
+        addr_scale = 1 / self._get_expected_scale_factor()
+        patterns_p = [_tanh(i * addr_scale)
+                      for i in range(2 ** (self.lut_addrwidth - 1))]
+        patterns_n = [_tanh((-i - 1) * addr_scale)
+                      for i in range(2 ** (self.lut_addrwidth - 1))]
+        patterns_n.reverse()  # most negative step first, so the last LUT entry is the -1 step
+
+        patterns = patterns_p + patterns_n  # non-negative steps in the lower half, negative steps in the upper half
+
+        lut = strm.LUT(lut_addr, patterns, out_width, out_point, out_signed)
+
+        p_th = 2 ** (self.lut_addrwidth - 1) - 1
+        n_th = -1 * p_th
+
+        if out_point == 0:
+            th_scale = out_scale
+        elif out_point > 0:
+            th_scale = out_scale >> out_point
+        else:
+            th_scale = out_scale << (-1 * out_point)
+
+        p = strm.Mux(sra > p_th, th_scale, lut)
+        n = strm.Mux(sra < n_th, -th_scale, lut)  # tanh saturates at -1 (sigmoid's floor of 0 is wrong here)
+        out = strm.Mux(sra >= 0, p, n)
+
+        return out
+
+    def get_eval_method(self):
+        import nngen.verify as verify
+
+        name = self.__class__.__name__
+        method = getattr(verify, name, None)  # NOTE(review): None unless nngen.verify exports 'tanh' - confirm
+
+        features_scale, features_shamt = self._get_features_scale_shamt()
+
+        method = functools.partial(method,
+                                   lut_addrwidth=self.lut_addrwidth,
+                                   lut_clip=self.lut_clip,
+                                   range_rate=self.range_rate,
+                                   features_dtype=self.args[0].dtype,
+                                   features_scale=features_scale,
+                                   features_shamt=features_shamt)
+        return method
diff --git a/nngen/quantizer/__init__.py b/nngen/quantizer/__init__.py
index dd095580..2d44d35a 100644
--- a/nngen/quantizer/__init__.py
+++ b/nngen/quantizer/__init__.py
@@ -12,6 +12,7 @@
 from . import matmul
 from . import normalize
 from . import sigmoid
+from . import tanh
 from . import exp
 from . import reduce
 
@@ -26,6 +27,7 @@
     'scaled_multiply': normalize.scaled_multiply,
     'scaled_div': normalize.scaled_div,
     'sigmoid': sigmoid.sigmoid,
+    'tanh': tanh.tanh,
     'exp': exp.exp,
     'argmax': reduce.argmax,
     'argmin': reduce.argmin,
diff --git a/nngen/quantizer/tanh.py b/nngen/quantizer/tanh.py
new file mode 100644
index 00000000..5025957d
--- /dev/null
+++ b/nngen/quantizer/tanh.py
@@ -0,0 +1,8 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+from . import sigmoid
+
+def tanh(visitor, node):  # tanh reuses the sigmoid quantizer rule
+    sigmoid.sigmoid(visitor, node)  # 'sigmoid' is the imported module; call the function inside it
diff --git a/nngen/verify/tanh.py b/nngen/verify/tanh.py
new file mode 100644
index 00000000..7d3dc6ca
--- /dev/null
+++ b/nngen/verify/tanh.py
@@ -0,0 +1,51 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import numpy as np
+
+
+def tanh(features,
+            lut_addrwidth=8, lut_clip=6.0, range_rate=0.95,
+            dtype=None, name=None, par=1,
+            features_dtype=None, features_scale=1, features_shamt=0):
+
+    features_point = 0 if features_dtype is None else features_dtype.point
+    out_point = 0 if dtype is None else dtype.point
+    out_shift = out_point - features_point
+
+    mul = features * features_scale
+    sra = mul >> features_shamt  # same rescaling as the operator: multiply, then arithmetic right shift
+
+    if dtype is None:
+        raise ValueError('tanh requires dtype to determine the value range.')
+
+    out_width = dtype.width
+    out_point = dtype.point
+    out_signed = dtype.signed
+    if out_signed:
+        out_scale = round((2 ** (out_width - 1)) * range_rate)
+    else:
+        out_scale = round((2 ** out_width) * range_rate)
+
+    def _tanh(x):
+        return (np.tanh(x) * out_scale).astype(np.int64)
+
+    addr_scale = lut_clip / (2 ** (lut_addrwidth - 1))
+    lut = _tanh(sra * addr_scale)
+
+    p_th = 2 ** (lut_addrwidth - 1) - 1
+    n_th = -1 * p_th
+
+    if out_point == 0:
+        th_scale = out_scale
+    elif out_point > 0:
+        th_scale = out_scale >> out_point
+    else:
+        th_scale = out_scale << (-1 * out_point)
+
+    p = np.where(sra > p_th, th_scale, lut)
+    n = np.where(sra < n_th, -th_scale, lut)  # saturate at -th_scale, matching the operator's negative branch
+    out = np.where(sra >= 0, p, n)
+
+    return out
diff --git a/tests/matrix_conv2d/test_matrix_conv2d_int16_3x3_stride1_tanh.py b/tests/matrix_conv2d/test_matrix_conv2d_int16_3x3_stride1_tanh.py
new file mode 100644
index 00000000..41308621
--- /dev/null
+++ b/tests/matrix_conv2d/test_matrix_conv2d_int16_3x3_stride1_tanh.py
@@ -0,0 +1,88 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import matrix_conv2d
+
+
+act_shape = (1, 7, 7, 15)
+weight_shape = (7, 3, 3, 15)
+bias_shape = None
+scale_shape = None
+act_dtype = ng.int16
+weight_dtype = ng.int16
+bias_dtype = ng.int32
+scale_dtype = ng.int16
+out_dtype = ng.int16
+stride = (1, 1, 1, 1)
+rshift_mul = None
+rshift_sum = None
+rshift_out = None
+act_func = ng.tanh
+par_ich = 1
+par_och = 1
+par_col = 1
+par_row = 1
+concur_och = None
+stationary = 'filter'
+input_ram_size = None
+filter_ram_size = None
+bias_ram_size = None
+scale_ram_size = None
+out_ram_size = None
+axi_datawidth = 32
+
+
+def test(request, silent=True):
+    veriloggen.reset()
+
+    simtype = request.config.getoption('--sim')
+
+    rslt = matrix_conv2d.run(act_shape, weight_shape,
+                             bias_shape, scale_shape,
+                             act_dtype, weight_dtype,
+                             bias_dtype, scale_dtype,
+                             out_dtype,
+                             stride,
+                             rshift_mul, rshift_sum, rshift_out,
+                             act_func,
+                             par_ich, par_och, par_col, par_row,
+                             concur_och, stationary,
+                             input_ram_size, filter_ram_size,
+                             bias_ram_size, scale_ram_size,
+                             out_ram_size,
+                             axi_datawidth, silent,
+                             filename=None, simtype=simtype,
+                             outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+
+    verify_rslt = rslt.splitlines()[-1]
+    assert(verify_rslt == '# verify: PASSED')
+
+
+if __name__ == '__main__':
+    rslt = matrix_conv2d.run(act_shape, weight_shape,
+                             bias_shape, scale_shape,
+                             act_dtype, weight_dtype,
+                             bias_dtype, scale_dtype,
+                             out_dtype,
+                             stride,
+                             rshift_mul, rshift_sum, rshift_out,
+                             act_func,
+                             par_ich, par_och, par_col, par_row,
+                             concur_och, stationary,
+                             input_ram_size, filter_ram_size,
+                             bias_ram_size, scale_ram_size,
+                             out_ram_size,
+                             axi_datawidth, silent=False,
+                             filename='tmp.v',
+                             outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+    print(rslt)
diff --git a/tests/matrix_conv2d/test_matrix_conv2d_int32_3x3_stride1_tanh.py b/tests/matrix_conv2d/test_matrix_conv2d_int32_3x3_stride1_tanh.py
new file mode 100644
index 00000000..3437e930
--- /dev/null
+++ b/tests/matrix_conv2d/test_matrix_conv2d_int32_3x3_stride1_tanh.py
@@ -0,0 +1,88 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import matrix_conv2d
+
+
+act_shape = (1, 7, 7, 15)
+weight_shape = (7, 3, 3, 15)
+bias_shape = None
+scale_shape = None
+act_dtype = ng.int32
+weight_dtype = ng.int32
+bias_dtype = ng.int32
+scale_dtype = ng.int32
+out_dtype = ng.int32
+stride = (1, 1, 1, 1)
+rshift_mul = None
+rshift_sum = None
+rshift_out = None
+act_func = ng.tanh
+par_ich = 1
+par_och = 1
+par_col = 1
+par_row = 1
+concur_och = None
+stationary = 'filter'
+input_ram_size = None
+filter_ram_size = None
+bias_ram_size = None
+scale_ram_size = None
+out_ram_size = None
+axi_datawidth = 32
+
+
+def test(request, silent=True):
+    veriloggen.reset()
+
+    simtype = request.config.getoption('--sim')
+
+    rslt = matrix_conv2d.run(act_shape, weight_shape,
+                             bias_shape, scale_shape,
+                             act_dtype, weight_dtype,
+                             bias_dtype, scale_dtype,
+                             out_dtype,
+                             stride,
+                             rshift_mul, rshift_sum, rshift_out,
+                             act_func,
+                             par_ich, par_och, par_col, par_row,
+                             concur_och, stationary,
+                             input_ram_size, filter_ram_size,
+                             bias_ram_size, scale_ram_size,
+                             out_ram_size,
+                             axi_datawidth, silent,
+                             filename=None, simtype=simtype,
+                             outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+
+    verify_rslt = rslt.splitlines()[-1]
+    assert(verify_rslt == '# verify: PASSED')
+
+
+if __name__ == '__main__':
+    rslt = matrix_conv2d.run(act_shape, weight_shape,
+                             bias_shape, scale_shape,
+                             act_dtype, weight_dtype,
+                             bias_dtype, scale_dtype,
+                             out_dtype,
+                             stride,
+                             rshift_mul, rshift_sum, rshift_out,
+                             act_func,
+                             par_ich, par_och, par_col, par_row,
+                             concur_och, stationary,
+                             input_ram_size, filter_ram_size,
+                             bias_ram_size, scale_ram_size,
+                             out_ram_size,
+                             axi_datawidth, silent=False,
+                             filename='tmp.v',
+                             outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+    print(rslt)
diff --git a/tests/matrix_conv2d/test_matrix_conv2d_int8_3x3_stride1_tanh.py b/tests/matrix_conv2d/test_matrix_conv2d_int8_3x3_stride1_tanh.py
new file mode 100644
index 00000000..a1855fe4
--- /dev/null
+++ b/tests/matrix_conv2d/test_matrix_conv2d_int8_3x3_stride1_tanh.py
@@ -0,0 +1,88 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import matrix_conv2d
+
+
+act_shape = (1, 7, 7, 15)
+weight_shape = (7, 3, 3, 15)
+bias_shape = None
+scale_shape = None
+act_dtype = ng.int8
+weight_dtype = ng.int8
+bias_dtype = ng.int32
+scale_dtype = ng.int8
+out_dtype = ng.int8
+stride = (1, 1, 1, 1)
+rshift_mul = None
+rshift_sum = None
+rshift_out = None
+act_func = ng.tanh
+par_ich = 1
+par_och = 1
+par_col = 1
+par_row = 1
+concur_och = None
+stationary = 'filter'
+input_ram_size = None
+filter_ram_size = None
+bias_ram_size = None
+scale_ram_size = None
+out_ram_size = None
+axi_datawidth = 32
+
+
+def test(request, silent=True):
+    veriloggen.reset()
+
+    simtype = request.config.getoption('--sim')
+
+    rslt = matrix_conv2d.run(act_shape, weight_shape,
+                             bias_shape, scale_shape,
+                             act_dtype, weight_dtype,
+                             bias_dtype, scale_dtype,
+                             out_dtype,
+                             stride,
+                             rshift_mul, rshift_sum, rshift_out,
+                             act_func,
+                             par_ich, par_och, par_col, par_row,
+                             concur_och, stationary,
+                             input_ram_size, filter_ram_size,
+                             bias_ram_size, scale_ram_size,
+                             out_ram_size,
+                             axi_datawidth, silent,
+                             filename=None, simtype=simtype,
+                             outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+
+    verify_rslt = rslt.splitlines()[-1]
+    assert(verify_rslt == '# verify: PASSED')
+
+
+if __name__ == '__main__':
+    rslt = matrix_conv2d.run(act_shape, weight_shape,
+                             bias_shape, scale_shape,
+                             act_dtype, weight_dtype,
+                             bias_dtype, scale_dtype,
+                             out_dtype,
+                             stride,
+                             rshift_mul, rshift_sum, rshift_out,
+                             act_func,
+                             par_ich, par_och, par_col, par_row,
+                             concur_och, stationary,
+                             input_ram_size, filter_ram_size,
+                             bias_ram_size, scale_ram_size,
+                             out_ram_size,
+                             axi_datawidth, silent=False,
+                             filename='tmp.v',
+                             outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+    print(rslt)
diff --git a/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int16_3x3_stride1.py b/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int16_3x3_stride1.py
new file mode 100644
index 00000000..6d7a3137
--- /dev/null
+++ b/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int16_3x3_stride1.py
@@ -0,0 +1,67 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import onnx_matrix_conv2d
+
+
+act_shape = (1, 7, 7, 3)
+weight_shape = (9, 3, 3, 3)
+act_dtype = ng.int16
+weight_dtype = ng.int16
+stride = 1
+padding = 0
+with_batchnorm = False
+act_func = 'Tanh'  # this is the tanh variant of the test; 'Sigmoid' was a copy-paste leftover
+disable_fusion = False
+par_ich = 1
+par_och = 1
+par_col = 1
+par_row = 1
+concur_och = None
+stationary = 'filter'
+chunk_size = 64
+axi_datawidth = 32
+
+
+def test(request, silent=True):
+    veriloggen.reset()
+
+    simtype = request.config.getoption('--sim')
+
+    rslt = onnx_matrix_conv2d.run(act_shape, weight_shape,
+                                  act_dtype, weight_dtype,
+                                  stride, padding,
+                                  with_batchnorm, act_func, disable_fusion,
+                                  par_ich, par_och, par_col, par_row,
+                                  concur_och, stationary,
+                                  chunk_size,
+                                  axi_datawidth, silent,
+                                  filename=None, simtype=simtype,
+                                  outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+
+    verify_rslt = rslt.splitlines()[-1]
+    assert(verify_rslt == '# verify: PASSED')
+
+
+if __name__ == '__main__':
+    rslt = onnx_matrix_conv2d.run(act_shape, weight_shape,
+                                  act_dtype, weight_dtype,
+                                  stride, padding,
+                                  with_batchnorm, act_func, disable_fusion,
+                                  par_ich, par_och, par_col, par_row,
+                                  concur_och, stationary,
+                                  chunk_size,
+                                  axi_datawidth, silent=False,
+                                  filename='tmp.v',
+                                  outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+    print(rslt)
diff --git a/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int32_3x3_stride1.py b/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int32_3x3_stride1.py
new file mode 100644
index 00000000..e93dbf87
--- /dev/null
+++ b/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int32_3x3_stride1.py
@@ -0,0 +1,67 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import onnx_matrix_conv2d
+
+
+act_shape = (1, 7, 7, 3)
+weight_shape = (9, 3, 3, 3)
+act_dtype = ng.int32
+weight_dtype = ng.int32
+stride = 1
+padding = 0
+with_batchnorm = False
+act_func = 'Tanh'  # this is the tanh variant of the test; 'Sigmoid' was a copy-paste leftover
+disable_fusion = False
+par_ich = 1
+par_och = 1
+par_col = 1
+par_row = 1
+concur_och = None
+stationary = 'filter'
+chunk_size = 64
+axi_datawidth = 32
+
+
+def test(request, silent=True):
+    veriloggen.reset()
+
+    simtype = request.config.getoption('--sim')
+
+    rslt = onnx_matrix_conv2d.run(act_shape, weight_shape,
+                                  act_dtype, weight_dtype,
+                                  stride, padding,
+                                  with_batchnorm, act_func, disable_fusion,
+                                  par_ich, par_och, par_col, par_row,
+                                  concur_och, stationary,
+                                  chunk_size,
+                                  axi_datawidth, silent,
+                                  filename=None, simtype=simtype,
+                                  outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+
+    verify_rslt = rslt.splitlines()[-1]
+    assert(verify_rslt == '# verify: PASSED')
+
+
+if __name__ == '__main__':
+    rslt = onnx_matrix_conv2d.run(act_shape, weight_shape,
+                                  act_dtype, weight_dtype,
+                                  stride, padding,
+                                  with_batchnorm, act_func, disable_fusion,
+                                  par_ich, par_och, par_col, par_row,
+                                  concur_och, stationary,
+                                  chunk_size,
+                                  axi_datawidth, silent=False,
+                                  filename='tmp.v',
+                                  outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+    print(rslt)
diff --git a/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int8_3x3_stride1.py b/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int8_3x3_stride1.py
new file mode 100644
index 00000000..3ae43372
--- /dev/null
+++ b/tests/onnx_matrix_conv2d/test_onnx_matrix_conv2d_tanh_int8_3x3_stride1.py
@@ -0,0 +1,67 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import onnx_matrix_conv2d
+
+
+act_shape = (1, 7, 7, 3)
+weight_shape = (9, 3, 3, 3)
+act_dtype = ng.int8
+weight_dtype = ng.int8
+stride = 1
+padding = 0
+with_batchnorm = False
+act_func = 'Tanh'  # this is the tanh variant of the test; 'Sigmoid' was a copy-paste leftover
+disable_fusion = False
+par_ich = 1
+par_och = 1
+par_col = 1
+par_row = 1
+concur_och = None
+stationary = 'filter'
+chunk_size = 64
+axi_datawidth = 32
+
+
+def test(request, silent=True):
+    veriloggen.reset()
+
+    simtype = request.config.getoption('--sim')
+
+    rslt = onnx_matrix_conv2d.run(act_shape, weight_shape,
+                                  act_dtype, weight_dtype,
+                                  stride, padding,
+                                  with_batchnorm, act_func, disable_fusion,
+                                  par_ich, par_och, par_col, par_row,
+                                  concur_och, stationary,
+                                  chunk_size,
+                                  axi_datawidth, silent,
+                                  filename=None, simtype=simtype,
+                                  outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+
+    verify_rslt = rslt.splitlines()[-1]
+    assert(verify_rslt == '# verify: PASSED')
+
+
+if __name__ == '__main__':
+    rslt = onnx_matrix_conv2d.run(act_shape, weight_shape,
+                                  act_dtype, weight_dtype,
+                                  stride, padding,
+                                  with_batchnorm, act_func, disable_fusion,
+                                  par_ich, par_och, par_col, par_row,
+                                  concur_och, stationary,
+                                  chunk_size,
+                                  axi_datawidth, silent=False,
+                                  filename='tmp.v',
+                                  outputfile=os.path.splitext(os.path.basename(__file__))[0] + '.out')
+    print(rslt)
diff --git a/tests/onnx_matrix_tanh/onnx_matrix_tanh.py b/tests/onnx_matrix_tanh/onnx_matrix_tanh.py
new file mode 100644
index 00000000..9430a89a
--- /dev/null
+++ b/tests/onnx_matrix_tanh/onnx_matrix_tanh.py
@@ -0,0 +1,284 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+import functools
+import math
+import numpy as np
+
+import torch
+import torchvision
+import torchvision.transforms as transforms
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.autograd
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+
+from veriloggen import *
+import veriloggen.thread as vthread
+import veriloggen.types.axi as axi
+
+
+class MatrixSigmoid(nn.Module):
+    """PyTorch reference model applying element-wise tanh (class name kept: run() instantiates it)."""
+    def __init__(self):
+        super(MatrixSigmoid, self).__init__()
+        self.tanh = nn.Tanh()  # nn.Sigmoid() here made the exported ONNX graph contain Sigmoid, not Tanh
+
+    def forward(self, x):
+        return self.tanh(x)
+
+
+def run(act_shape=(15, 31),
+        act_dtype=ng.int32,
+        par=1,
+        chunk_size=64,
+        axi_datawidth=32, silent=False,
+        filename=None, simtype='iverilog', outputfile=None):
+    """Export the model to ONNX, build and quantize the NNgen graph, simulate it, return the simulator log."""
+    # PyTorch reference model
+    model = MatrixSigmoid()
+
+    # PyTorch to ONNX (onnx_filename is a relative path, so the file lands in the working directory)
+    onnx_filename = 'onnx_matrix_tanh.onnx'
+    dummy_input = torch.randn(*act_shape)
+    input_names = ['act']
+    output_names = ['out']
+    model.eval()
+    torch.onnx.export(model, dummy_input, onnx_filename,
+                      input_names=input_names, output_names=output_names)
+
+    # --------------------
+    # (1) Represent a DNN model as a dataflow by NNgen operators
+    # --------------------
+
+    # ONNX to NNgen
+    dtypes = {'act': act_dtype,
+              'out': act_dtype}
+
+    (outputs, placeholders, variables,
+     constants, operators) = ng.from_onnx(onnx_filename,
+                                          value_dtypes=dtypes,
+                                          default_placeholder_dtype=act_dtype,
+                                          default_variable_dtype=ng.int32,
+                                          default_constant_dtype=ng.int32,
+                                          default_operator_dtype=act_dtype,
+                                          default_scale_dtype=ng.int32,
+                                          default_bias_dtype=ng.int32,
+                                          disable_fusion=False)
+
+    # --------------------
+    # (2) Assign quantized weights to the NNgen operators
+    # --------------------
+
+    if act_dtype.width > 8:
+        act_scale_factor = 128
+    else:
+        act_scale_factor = int(round(2 ** (act_dtype.width - 1) * 0.5))  # e.g. 64 when width == 8
+
+    input_scale_factors = {'act': act_scale_factor}
+
+    ng.quantize(outputs, input_scale_factors)
+
+    # --------------------
+    # (3) Assign hardware attributes
+    # --------------------
+
+    for op in operators.values():
+        if isinstance(op, ng.tanh):  # only tanh operators receive the `par` setting
+            op.attribute(par=par)
+
+    # --------------------
+    # (4) Verify the DNN model behavior by executing the NNgen dataflow in software
+    # --------------------
+
+    act = placeholders['act']
+    out = outputs['out']
+
+    # verification data
+    # random data
+    # normally distributed input using the std/mean below; the generator is not seeded in this function
+    # (an alternative setting, std = 0.2 / mean = 0.5, is left disabled)
+    std = 3.0
+    mean = 0.0
+    img = np.random.normal(size=act.length).astype(np.float32).reshape(act.shape)
+    img = img * std + mean
+
+    # execution on pytorch
+    model_input = img
+
+    if act.perm is not None:
+        model_input = np.transpose(model_input, act.reversed_perm)
+
+    model.eval()
+    model_out = model(torch.from_numpy(model_input)).detach().numpy()
+    if act.perm is not None and len(model_out.shape) == len(act.shape):
+        model_out = np.transpose(model_out, act.perm)
+    scaled_model_out = model_out * out.scale_factor
+
+    # software-based verification: scale, saturate to +/-(2**(width-1) - 1), then round to integers
+    vact = img * act_scale_factor
+    vact = np.clip(vact,
+                   -1.0 * (2 ** (act.dtype.width - 1) - 1),
+                   1.0 * (2 ** (act.dtype.width - 1) - 1))
+    vact = np.round(vact).astype(np.int64)
+
+    eval_outs = ng.eval([out], act=vact)
+    vout = eval_outs[0]
+
+    mean_square_error = np.sum((vout - scaled_model_out) ** 2) / vout.size
+    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))
+
+    # NOTE: mean_square_error and corrcoef are not used below; they are available for inspection only
+
+    # --------------------
+    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
+    # --------------------
+
+    targ = ng.to_veriloggen([out], 'onnx_matrix_tanh', silent=silent,
+                            config={'maxi_datawidth': axi_datawidth})
+
+    # --------------------
+    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
+    # --------------------
+
+    if simtype is None:
+        sys.exit()  # no simulator requested: raises SystemExit instead of returning a log
+
+    # to memory image: each region below starts at the next multiple of chunk_size
+    param_data = ng.export_ndarray([out])
+    param_bytes = len(param_data)
+
+    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
+    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
+    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size
+
+    memimg_datawidth = 32
+    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
+    mem = mem + [100]  # broadcast add: every word of the image starts as 100 rather than 0
+
+    # placeholder
+    axi.set_memory(mem, vact, memimg_datawidth,
+                   act_dtype.width, act.addr,
+                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))
+
+    # parameters (variable and constant)
+    axi.set_memory(mem, param_data, memimg_datawidth,
+                   8, variable_addr)
+
+    # verification data (software result vout, stored at check_addr for the comparison in ctrl)
+    axi.set_memory(mem, vout, memimg_datawidth,
+                   act_dtype.width, check_addr,
+                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))
+
+    # test controller
+    m = Module('test')
+    params = m.copy_params(targ)
+    ports = m.copy_sim_ports(targ)
+    clk = ports['CLK']
+    resetn = ports['RESETN']
+    rst = m.Wire('RST')
+    rst.assign(Not(resetn))
+
+    # AXI memory model
+    if outputfile is None:
+        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'
+
+    memimg_name = 'memimg_' + outputfile
+
+    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
+                                datawidth=axi_datawidth,
+                                memimg=mem, memimg_name=memimg_name,
+                                memimg_datawidth=memimg_datawidth)
+    memory.connect(ports, 'maxi')
+
+    # AXI-Slave controller
+    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
+    _saxi.connect(ports, 'saxi')
+
+    # timer
+    time_counter = m.Reg('time_counter', 32, initval=0)
+    seq = Seq(m, 'seq', clk, rst)
+    seq(
+        time_counter.inc()
+    )
+
+    num_rep = functools.reduce(lambda x, y: x * y, out.shape[:-1], 1)  # product of all but the last output dim
+
+    def ctrl():
+        for i in range(100):  # presumably a start-up delay before driving the design; confirm
+            pass
+
+        ng.sim.set_global_addrs(_saxi, tmp_addr)
+
+        start_time = time_counter.value
+        ng.sim.start(_saxi)
+
+        print('# start')
+
+        ng.sim.wait(_saxi)
+        end_time = time_counter.value
+
+        print('# end')
+        print('# execution cycles: %d' % (end_time - start_time))
+
+        # verify: compare each word read at out.addr with the expected word read at check_addr
+        ok = True
+        for i in range(num_rep):
+            for j in range(out.shape[-1]):
+                orig = memory.read_word(i * out.aligned_shape[-1] + j,
+                                        out.addr, act_dtype.width)
+                check = memory.read_word(i * out.aligned_shape[-1] + j,
+                                         check_addr, act_dtype.width)
+
+                if vthread.verilog.NotEql(orig, check):
+                    print('NG', i, j, orig, check)
+                    ok = False
+                # else:
+                #    print('OK', i, j, orig, check)
+
+        if ok:
+            print('# verify: PASSED')
+        else:
+            print('# verify: FAILED')
+
+        vthread.finish()
+
+    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
+    fsm = th.start()
+
+    uut = m.Instance(targ, 'uut',
+                     params=m.connect_params(targ),
+                     ports=m.connect_ports(targ))
+
+    # simulation.setup_waveform(m, uut)
+    simulation.setup_clock(m, clk, hperiod=5)
+    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')
+
+    init.add(
+        Delay(1000000),  # upper bound on simulated time: 'finish' is issued after this delay
+        Systask('finish'),
+    )
+
+    # output source code
+    if filename is not None:
+        m.to_verilog(filename)
+
+    # run simulation
+    sim = simulation.Simulator(m, sim=simtype)
+    rslt = sim.run(outputfile=outputfile)
+    lines = rslt.splitlines()
+    if simtype == 'verilator' and lines[-1].startswith('-'):  # drop a trailing line starting with '-'
+        rslt = '\n'.join(lines[:-1])
+    return rslt
+
+
+if __name__ == '__main__':
+    # Stand-alone run with the default arguments; the generated test bench is also written to tmp.v.
+    print(run(silent=False, filename='tmp.v'))
diff --git a/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int16.py b/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int16.py
new file mode 100644
index 00000000..810ae56b
--- /dev/null
+++ b/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int16.py
@@ -0,0 +1,49 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import onnx_matrix_tanh
+
+
+act_shape = (15, 31)  # shape of the dummy input used for the ONNX export
+act_dtype = ng.int16  # NNgen dtype applied to both 'act' and 'out'
+par = 1  # value handed to op.attribute(par=...) for tanh operators
+chunk_size = 64  # alignment unit for the region addresses in the memory image
+axi_datawidth = 32  # used as 'maxi_datawidth' and as the AXI memory model's datawidth
+
+
+def test(request, silent=True):
+    """Simulate with the simulator given by --sim and require a final '# verify: PASSED' line."""
+    veriloggen.reset()
+    simulator = request.config.getoption('--sim')
+    out_name = os.path.splitext(os.path.basename(__file__))[0] + '.out'
+
+    sim_log = onnx_matrix_tanh.run(act_shape,
+                                   act_dtype,
+                                   par,
+                                   chunk_size,
+                                   axi_datawidth, silent,
+                                   filename=None, simtype=simulator,
+                                   outputfile=out_name)
+
+    assert sim_log.splitlines()[-1] == '# verify: PASSED'
+
+
+if __name__ == '__main__':
+    # Stand-alone run (no pytest): passes filename='tmp.v' and prints the log returned by run().
+    out_name = os.path.splitext(os.path.basename(__file__))[0] + '.out'
+    print(onnx_matrix_tanh.run(act_shape,
+                               act_dtype,
+                               par,
+                               chunk_size,
+                               axi_datawidth, silent=False,
+                               filename='tmp.v', outputfile=out_name))
diff --git a/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int32.py b/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int32.py
new file mode 100644
index 00000000..5d9e1212
--- /dev/null
+++ b/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int32.py
@@ -0,0 +1,49 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import onnx_matrix_tanh
+
+
+act_shape = (15, 31)  # shape of the dummy input used for the ONNX export
+act_dtype = ng.int32  # NNgen dtype applied to both 'act' and 'out'
+par = 1  # value handed to op.attribute(par=...) for tanh operators
+chunk_size = 64  # alignment unit for the region addresses in the memory image
+axi_datawidth = 32  # used as 'maxi_datawidth' and as the AXI memory model's datawidth
+
+
+def test(request, silent=True):
+    """Simulate with the simulator given by --sim and require a final '# verify: PASSED' line."""
+    veriloggen.reset()
+    simulator = request.config.getoption('--sim')
+    out_name = os.path.splitext(os.path.basename(__file__))[0] + '.out'
+
+    sim_log = onnx_matrix_tanh.run(act_shape,
+                                   act_dtype,
+                                   par,
+                                   chunk_size,
+                                   axi_datawidth, silent,
+                                   filename=None, simtype=simulator,
+                                   outputfile=out_name)
+
+    assert sim_log.splitlines()[-1] == '# verify: PASSED'
+
+
+if __name__ == '__main__':
+    # Stand-alone run (no pytest): passes filename='tmp.v' and prints the log returned by run().
+    out_name = os.path.splitext(os.path.basename(__file__))[0] + '.out'
+    print(onnx_matrix_tanh.run(act_shape,
+                               act_dtype,
+                               par,
+                               chunk_size,
+                               axi_datawidth, silent=False,
+                               filename='tmp.v', outputfile=out_name))
diff --git a/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int8.py b/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int8.py
new file mode 100644
index 00000000..40f335c8
--- /dev/null
+++ b/tests/onnx_matrix_tanh/test_onnx_matrix_tanh_int8.py
@@ -0,0 +1,49 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# the next line can be removed after installation
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__)))))
+
+import nngen as ng
+import veriloggen
+
+import onnx_matrix_tanh
+
+
+act_shape = (15, 31)  # shape of the dummy input used for the ONNX export
+act_dtype = ng.int8  # NNgen dtype applied to both 'act' and 'out'
+par = 1  # value handed to op.attribute(par=...) for tanh operators
+chunk_size = 64  # alignment unit for the region addresses in the memory image
+axi_datawidth = 32  # used as 'maxi_datawidth' and as the AXI memory model's datawidth
+
+
+def test(request, silent=True):
+    """Simulate with the simulator given by --sim and require a final '# verify: PASSED' line."""
+    veriloggen.reset()
+    simulator = request.config.getoption('--sim')
+    out_name = os.path.splitext(os.path.basename(__file__))[0] + '.out'
+
+    sim_log = onnx_matrix_tanh.run(act_shape,
+                                   act_dtype,
+                                   par,
+                                   chunk_size,
+                                   axi_datawidth, silent,
+                                   filename=None, simtype=simulator,
+                                   outputfile=out_name)
+
+    assert sim_log.splitlines()[-1] == '# verify: PASSED'
+
+
+if __name__ == '__main__':
+    # Stand-alone run (no pytest): passes filename='tmp.v' and prints the log returned by run().
+    out_name = os.path.splitext(os.path.basename(__file__))[0] + '.out'
+    print(onnx_matrix_tanh.run(act_shape,
+                               act_dtype,
+                               par,
+                               chunk_size,
+                               axi_datawidth, silent=False,
+                               filename='tmp.v', outputfile=out_name))