-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNe16.py
190 lines (161 loc) · 6.79 KB
/
Ne16.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import numpy as np
def div_and_ceil(a, b):
    """Return the ceiling of a / b using pure integer arithmetic (for positive b)."""
    last_full = (a - 1) // b
    return last_full + 1
class Ne16:
    """Model of the NE16 accelerator's weight memory layout.

    NE16 consumes weights bit-serially: input channels are processed in
    groups of TP_IN, each group is split into ``qw`` bit-planes, and each
    bit-plane is packed little-endian into TP_IN/8 bytes.  The
    ``conv*_unroll`` methods convert a dense uint weight tensor into that
    byte stream; the ``conv*_roll`` methods invert the transformation.
    """

    TP_IN = 16                       # input channels consumed per subtile
    TP_OUT = 32                      # output channels produced per iteration
    KS = 3                           # native filter size (3x3)
    INPUT_BUFFER_H = 5               # input buffer height
    INPUT_BUFFER_W = 5               # input buffer width
    OUTPUT_BUFFER_SHAPE = (3, 3, 32)

    @property
    def name(self):
        """Accelerator identifier string."""
        return 'ne16'

    def weights_ko_len(self, ko, dw):
        """Number of output-channel iterations for `ko` output channels.

        Depthwise weights are grouped TP_IN channels per iteration;
        otherwise each output channel is its own iteration.
        """
        return div_and_ceil(ko, self.TP_IN) if dw else ko

    def weights_ki_size(self, ki, ks, qw, dw):
        """Byte size of the packed weights for one output-channel iteration.

        `ks` is the (kh, kw) kernel shape, `qw` the weight bit-width.
        Each TP_IN-wide input-channel group contributes `qw` bit-planes of
        TP_IN/8 bytes per kernel element.
        """
        if dw:
            # Depthwise: exactly one TP_IN-wide group per iteration.
            return qw * ks[0] * ks[1] * (self.TP_IN // 8)
        else:
            return div_and_ceil(ki, self.TP_IN) * qw * ks[0] * ks[1] * (self.TP_IN // 8)

    def weights_size(self, ko, ki, ks, qw, dw):
        """Total byte size of the fully packed weight tensor."""
        return self.weights_ko_len(ko, dw) * self.weights_ki_size(ki, ks, qw, dw)

    # assuming torch shapes, w must already be in uint format!
    # format --> [Ko, KiMajor, Qw, KiMinor] (binary tensor)
    # +++++++++++ --> these are *contiguous and packed*
    def conv1x1_unroll(self, w, qw, tp_in=16):
        """Pack a (Ko, Ki, 1, 1) uint weight tensor into the NE16 1x1 stream.

        Bit `q` of input channel ki lands in bit (kimin % 8) of byte
        (kimin // 8) of bit-plane q of group kimaj, where kimaj/kimin are
        ki's group index and offset.  Returns a flat uint8 array.
        NOTE(review): the second dimension is hard-coded to 2 bytes, which
        assumes tp_in == 16 (the default).
        """
        Ko, Ki, H, W = w.shape
        nb_ki = (Ki // tp_in + (1 if Ki % tp_in != 0 else 0))
        wbytes = np.zeros((Ko * nb_ki * qw, 2), dtype=np.uint8)
        for ko in range(Ko):
            for ki in range(Ki):
                kimaj = ki // tp_in
                kimin = ki % tp_in
                byte = kimin // 8
                shift = kimin % 8
                for q in range(qw):
                    # Row `index` is bit-plane q of group kimaj of output channel ko.
                    index = ko * nb_ki * qw + kimaj * qw + q
                    wbytes[index, byte] = np.bitwise_or(wbytes[index, byte],
                        1 << shift if w[ko, ki, 0, 0] & (1 << q) != 0 else 0)
        wbytes = wbytes.reshape(-1)
        return wbytes

    def conv1x1_roll(self, wbytes, qw, shape, layout='CoutCinK'):
        """Inverse of conv1x1_unroll: rebuild a dense uint8 tensor of `shape`.

        `layout` selects how `shape` is interpreted; 'CoutKCin' writes
        through a transposed view so the unpacking loop stays
        channel-major.  Raises Exception for unknown layouts.
        """
        if layout == 'CoutCinK':
            Ko, Ki, H, W = shape
            w = np.zeros(shape, dtype=np.uint8)
            wv = w
        elif layout == 'CoutKCin':
            Ko, H, W, Ki = shape
            w = np.zeros(shape, dtype=np.uint8)
            wv = w.transpose((0, 3, 1, 2))
        else:
            raise Exception(f'Format {layout} not implemented.')
        nb_ki = (Ki // self.TP_IN + (1 if Ki % self.TP_IN != 0 else 0))
        for ko in range(Ko):
            for kimaj in range(nb_ki):
                for q in range(qw):
                    for kimin in range(self.TP_IN):
                        byte = kimin // 8
                        shift = kimin % 8
                        # Flat index into the byte stream; the *2 factors are
                        # TP_IN // 8 bytes per bit-plane.
                        index = ko * nb_ki * qw * 2 + kimaj * qw * 2 + q * 2 + byte
                        # Guard against padding channels in a partial last group.
                        if kimaj * self.TP_IN + kimin < Ki:
                            wv[ko, kimaj * self.TP_IN + kimin, 0, 0] += (1 & (wbytes[index] >> shift)) << q
        return w

    def subtile_bit_extract(self, subtile, bit_idx):
        """Gather bit `bit_idx` of every element of `subtile` into one int.

        Element i contributes its selected bit at result bit position i.
        """
        retval = 0
        for i, el in enumerate(subtile):
            if el.item() & (1 << bit_idx):
                retval |= 1 << i
        return retval

    def conv3x3_unroll(self, w, qw):
        """Pack a (Ko, Ki, H, W) uint weight tensor into the NE16 3x3 stream.

        For every output channel and TP_IN-wide input-channel group, each
        spatial position yields `qw` bit-planes of TP_IN/8 bytes (channel
        bits packed little-endian).  Returns a flat uint8 array.
        """
        Ko, Ki, H, W = w.shape
        nb_ki = (Ki // self.TP_IN) + (1 if Ki % self.TP_IN != 0 else 0)
        nb_tp_in = self.TP_IN // 8
        wbytes = np.zeros((Ko, nb_ki, qw, H * W, nb_tp_in), dtype=np.uint8)
        for i in range(Ko):
            for j in range(nb_ki):
                # (C, H, W) -> (H*W, C): one row of up-to-TP_IN channels per
                # spatial position.
                tile = w[i, j * self.TP_IN:(j + 1) * self.TP_IN].transpose(1, 2, 0).reshape(H * W, -1)
                for k, subtile in enumerate(tile):
                    for bit in range(qw):
                        subtile_bit = self.subtile_bit_extract(subtile, bit)
                        for l in range(nb_tp_in):
                            # Split the packed bit-plane into bytes, LSB first.
                            wbytes[i, j, bit, k, l] = (subtile_bit >> (l * 8)) & 0xff
        wbytes = wbytes.reshape(-1)
        return wbytes

    def subtile_bit_roll(self, w_subtile, subtile, bit):
        """Scatter the packed bit-plane `subtile` back into `w_subtile`.

        `subtile` is a little-endian byte sequence; bit i of the assembled
        integer is added at weight-bit position `bit` of w_subtile[i].
        Mutates `w_subtile` in place (it must be a writable view).
        """
        s = 0
        for i, byte in enumerate(subtile):
            s += byte.item() << (i * 8)
        for i in range(w_subtile.size):
            w_subtile[i] += ((s & (1 << i)) >> i) << bit

    def conv3x3_roll(self, wbytes, qw, shape, format="CoutCinK"):
        """Inverse of conv3x3_unroll: rebuild a dense uint8 tensor of `shape`.

        `format` selects how `shape` is interpreted (same options as
        conv1x1_roll's `layout`).  Raises Exception for unknown formats.
        """
        if format == 'CoutCinK':
            Ko, Ki, H, W = shape
            w = np.zeros(shape, dtype=np.uint8)
            wv = w
        elif format == 'CoutKCin':
            Ko, H, W, Ki = shape
            w = np.zeros(shape, dtype=np.uint8)
            wv = w.transpose((0, 3, 1, 2))
        else:
            raise Exception(f'Format {format} not implemented.')
        nb_ki = (Ki // self.TP_IN) + (1 if Ki % self.TP_IN != 0 else 0)
        # Trailing 2 is TP_IN // 8 bytes per bit-plane.
        wbytes = wbytes.reshape(Ko, nb_ki, qw, H, W, 2)
        for i in range(Ko):
            for j in range(nb_ki):
                for bit in range(qw):
                    for k in range(H):
                        for l in range(W):
                            self.subtile_bit_roll(wv[i, j * self.TP_IN:(j + 1) * self.TP_IN, k, l].reshape(-1),
                                                  wbytes[i, j, bit, k, l], bit)
        return w

    def conv_unroll(self, w, qw, layout='CoutCinK', dw=False):
        """Normalize `w` to (Ko, Ki, H, W) order and dispatch to the packer.

        `layout` names the incoming axis order; `dw` marks depthwise
        weights, which are transposed so channels land on the Ki axis.

        Raises:
            ValueError: if the filter size is neither 1 nor 3.
            Exception: if `layout` is not implemented.
        """
        if layout == "CoutCinK":
            if dw:
                w = w.transpose(1, 0, 2, 3)  # Swap Cout and Cin
        elif layout == "CoutKCin":
            if dw:
                w = w.transpose(3, 0, 1, 2)
            else:
                w = w.transpose(0, 3, 1, 2)
        elif layout == "CoutCin":
            w = w[:, :, np.newaxis, np.newaxis]
        elif layout == "CinCout":
            w = w.T
            w = w[:, :, np.newaxis, np.newaxis]
        else:
            raise Exception(f'Format {layout} not implemented.')
        fs = w.shape[2]
        if dw:
            assert fs == 3, "Only support filter size of 3 with depthwise convolution"
            assert w.shape[0] == 1, "Assumes that the Cout is equal to 1 in case of depthwise convolution"
        if fs == 1:
            return self.conv1x1_unroll(w, qw)
        elif fs == 3:
            return self.conv3x3_unroll(w, qw)
        else:
            # Bug fix: previously fell through and returned None for any other
            # filter size, crashing callers far from the cause.
            raise ValueError(f'Unsupported filter size {fs}: only 1x1 and 3x3 are implemented.')
if __name__ == "__main__":
    import random

    ne16 = Ne16()

    def test(name, Ko, Ki, fs, qw):
        """Round-trip a random weight tensor through unroll -> roll and compare."""
        print(f'Test {name} shape=({Ko:3}, {Ki:3}, {fs}, {fs}) qw={qw}: ', end='', flush=True)
        shape = (Ko, Ki, fs, fs)
        test_in = np.random.randint(low=0, high=1 << qw, size=shape, dtype=np.uint8)
        # Bug fix: conv{fs}x{fs}_roll and conv_unroll are methods of Ne16, not
        # module-level functions -- the old globals() lookup raised KeyError.
        roll = getattr(ne16, f'conv{fs}x{fs}_roll')
        test_out = roll(ne16.conv_unroll(test_in, qw), qw, shape)
        if not np.array_equal(test_in, test_out):
            print('Fail!')
            print('Test in:')
            print(test_in)
            print('Test out:')
            print(test_out)
            # Bug fix: show the MISmatching elements (was np.equal, which
            # printed the elements that agreed).
            print(test_in[np.not_equal(test_in, test_out)])
        else:
            print('Success!')

    def test_generator(fs, test_count):
        """Run `test_count` randomized round-trip tests for fs x fs filters."""
        print(f'Testing {fs}x{fs} convolution:')
        for i in range(test_count):
            Ko = random.randint(1, 128)
            Ki = random.randint(1, 128)
            qw = random.randint(2, 8)
            test(f'[{i}]', Ko, Ki, fs, qw)

    TEST_COUNT = 10
    test_generator(1, TEST_COUNT)
    test_generator(3, TEST_COUNT)