-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNetworks.h
161 lines (106 loc) · 3.66 KB
/
Networks.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#pragma once
#include<immintrin.h>
#include"LROCT.h"
#include"Misc.h"
//Based on the paper
//Fast Bit Gather, Bit Scatter and Bit Permutation Instructions for Commodity Microprocessors
//by Yedidya Hilewitz and Ruby B. Lee
//DOI:10.1007/s11265-008-0212-8
namespace Butterfly{
//Scatter64x4
//Pushes val through the network stages from widest to narrowest (Step64, Step32,
//Step16, StepSub<4>, StepSub<2>, StepSub<1>) and returns the result ANDed with
//mask. Every stage control is computed from mask alone, never from val.
//NOTE(review): going by the name and the paper cited at the top of this file,
//this is presumably a bit scatter (PDEP-like deposit) carried out separately in
//each of the four 64-bit lanes - confirm against the helper definitions.
inline __m256i Scatter64x4(__m256i val,__m256i mask){
    using namespace Internal;
    using namespace LROCT;

    //Count data derived from the mask; bytes is replaced by its HoriSum64x4 result.
    auto popInfo = NibblePopCnt(mask);
    popInfo.bytes = HoriSum64x4(popInfo.bytes);

    //A single control value is shared by the 64 and 32 stages.
    __m256i ctlWide = LROCT64x32(popInfo.bytes);
    val = Step64(val, ctlWide);
    val = Step32(val, ctlWide);

    //Only the low four bits of every summed byte are kept for the narrower stages.
    __m256i low4 = _mm256_and_si256(popInfo.bytes, _mm256_set1_epi8(0x0F));
    __m256i ctl16 = LROCT16(low4);
    val = Step16(val, ctl16);

    //Each narrower stage refines the previous count data with the next field of popInfo.
    __m256i nibbleCounts = MakeNibble(low4, popInfo.hiNibble);
    __m256i ctl8 = LROCT8(nibbleCounts);
    val = StepSub<4>(val, ctl8);

    __m256i pairCounts = MakeBitPair(nibbleCounts, popInfo.bitPair);
    __m256i ctl4 = LROCT4(pairCounts);
    val = StepSub<2>(val, ctl4);

    //The last stage builds its control directly from the pair counts and the mask.
    __m256i ctl1 = MakeSingle(pairCounts, mask);
    val = StepSub<1>(val, ctl1);

    //Clear every bit position that is not set in mask.
    return _mm256_and_si256(val, mask);
}
//Gather64x4
//ANDs val with mask, then pushes it through the network stages from narrowest to
//widest (StepSub<1>, StepSub<2>, StepSub<4>, Step16, Step32, Step64) - the
//reverse of the stage order used by Scatter64x4. Every stage control is computed
//from mask alone, never from val.
//NOTE(review): going by the name and the paper cited at the top of this file,
//this is presumably a bit gather (PEXT-like extract) carried out separately in
//each of the four 64-bit lanes - confirm against the helper definitions.
inline __m256i Gather64x4(__m256i val,__m256i mask){
    using namespace Internal;
    using namespace LROCT;

    //Count data derived from the mask; bytes is replaced by its HoriSum64x4 result.
    auto popInfo = NibblePopCnt(mask);
    popInfo.bytes = HoriSum64x4(popInfo.bytes);

    //Progressively finer count data, all prepared before val is touched.
    __m256i low4 = _mm256_and_si256(popInfo.bytes, _mm256_set1_epi8(0x0F));
    __m256i nibbleCounts = MakeNibble(low4, popInfo.hiNibble);
    __m256i pairCounts = MakeBitPair(nibbleCounts, popInfo.bitPair);

    //Drop every bit of val that is not selected by mask before routing.
    val = _mm256_and_si256(val, mask);

    __m256i ctl1 = MakeSingle(pairCounts, mask);
    val = StepSub<1>(val, ctl1);

    __m256i ctl4 = LROCT4(pairCounts);
    val = StepSub<2>(val, ctl4);

    __m256i ctl8 = LROCT8(nibbleCounts);
    val = StepSub<4>(val, ctl8);

    __m256i ctl16 = LROCT16(low4);
    val = Step16(val, ctl16);

    //A single control value is shared by the 32 and 64 stages.
    __m256i ctlWide = LROCT64x32(popInfo.bytes);
    val = Step32(val, ctlWide);
    return Step64(val, ctlWide);
}
//Scatter256
//Pushes val through the network stages from widest to narrowest (Step256,
//Step128, Step64, Step32, Step16, StepSub<4>, StepSub<2>, StepSub<1>) and
//returns the result ANDed with mask. The counts are summed with HoriSum256 and
//every stage control is computed from mask alone, never from val.
//NOTE(review): going by the name and the paper cited at the top of this file,
//this is presumably a bit scatter (PDEP-like deposit) across the whole 256-bit
//register treated as one value - confirm against the helper definitions.
inline __m256i Scatter256(__m256i val,__m256i mask){
    using namespace Internal;
    using namespace LROCT;

    //Count data derived from the mask; bytes is replaced by its HoriSum256 result.
    auto popInfo = NibblePopCnt(mask);
    popInfo.bytes = HoriSum256(popInfo.bytes);

    //A single control value is shared by the 256 and 128 stages.
    __m256i ctlFull = LROCT256x128(popInfo.bytes);
    val = Step256(val, ctlFull);
    val = Step128(val, ctlFull);

    //Likewise a single control value is shared by the 64 and 32 stages.
    __m256i ctlWide = LROCT64x32(popInfo.bytes);
    val = Step64(val, ctlWide);
    val = Step32(val, ctlWide);

    //Only the low four bits of every summed byte are kept for the narrower stages.
    __m256i low4 = _mm256_and_si256(popInfo.bytes, _mm256_set1_epi8(0x0F));
    __m256i ctl16 = LROCT16(low4);
    val = Step16(val, ctl16);

    //Each narrower stage refines the previous count data with the next field of popInfo.
    __m256i nibbleCounts = MakeNibble(low4, popInfo.hiNibble);
    __m256i ctl8 = LROCT8(nibbleCounts);
    val = StepSub<4>(val, ctl8);

    __m256i pairCounts = MakeBitPair(nibbleCounts, popInfo.bitPair);
    __m256i ctl4 = LROCT4(pairCounts);
    val = StepSub<2>(val, ctl4);

    //The last stage builds its control directly from the pair counts and the mask.
    __m256i ctl1 = MakeSingle(pairCounts, mask);
    val = StepSub<1>(val, ctl1);

    //Clear every bit position that is not set in mask.
    return _mm256_and_si256(val, mask);
}
//Gather256
//ANDs val with mask, then pushes it through the network stages from narrowest to
//widest (StepSub<1>, StepSub<2>, StepSub<4>, Step16, Step32, Step64, Step128,
//Step256) - the reverse of the stage order used by Scatter256. The counts are
//summed with HoriSum256 and every stage control is computed from mask alone,
//never from val.
//NOTE(review): going by the name and the paper cited at the top of this file,
//this is presumably a bit gather (PEXT-like extract) across the whole 256-bit
//register treated as one value - confirm against the helper definitions.
inline __m256i Gather256(__m256i val,__m256i mask){
    using namespace Internal;
    using namespace LROCT;

    //Count data derived from the mask; bytes is replaced by its HoriSum256 result.
    auto popInfo = NibblePopCnt(mask);
    popInfo.bytes = HoriSum256(popInfo.bytes);

    //Progressively finer count data, all prepared before val is touched.
    __m256i low4 = _mm256_and_si256(popInfo.bytes, _mm256_set1_epi8(0x0F));
    __m256i nibbleCounts = MakeNibble(low4, popInfo.hiNibble);
    __m256i pairCounts = MakeBitPair(nibbleCounts, popInfo.bitPair);

    //Drop every bit of val that is not selected by mask before routing.
    val = _mm256_and_si256(val, mask);

    __m256i ctl1 = MakeSingle(pairCounts, mask);
    val = StepSub<1>(val, ctl1);

    __m256i ctl4 = LROCT4(pairCounts);
    val = StepSub<2>(val, ctl4);

    __m256i ctl8 = LROCT8(nibbleCounts);
    val = StepSub<4>(val, ctl8);

    __m256i ctl16 = LROCT16(low4);
    val = Step16(val, ctl16);

    //A single control value is shared by the 32 and 64 stages.
    __m256i ctlWide = LROCT64x32(popInfo.bytes);
    val = Step32(val, ctlWide);
    val = Step64(val, ctlWide);

    //Likewise a single control value is shared by the 128 and 256 stages.
    __m256i ctlFull = LROCT256x128(popInfo.bytes);
    val = Step128(val, ctlFull);
    return Step256(val, ctlFull);
}
}