; Copyright (C) 2022-2024 Kamila Szewczyk
;
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
section .text
; =============================================================================
; macOS likes leading underscores in symbol names. Please it.
; =============================================================================
%ifdef MACHO
global _xpar_x86_64_cpuflags
global _crc32c_small_x86_64_sse42
global _rse32_x86_64_generic
global _rse32_x86_64_avx512
global _crc32c_32k_x86_64_sse42
extern _PROD_GEN
%define xpar_x86_64_cpuflags _xpar_x86_64_cpuflags
%define crc32c_small_x86_64_sse42 _crc32c_small_x86_64_sse42
%define rse32_x86_64_generic _rse32_x86_64_generic
%define rse32_x86_64_avx512 _rse32_x86_64_avx512
%define PROD_GEN _PROD_GEN
%define crc32c_32k_x86_64_sse42 _crc32c_32k_x86_64_sse42
%else
global xpar_x86_64_cpuflags
global crc32c_small_x86_64_sse42
global rse32_x86_64_generic
global rse32_x86_64_avx512
global crc32c_32k_x86_64_sse42
extern PROD_GEN
%endif
; =============================================================================
; The case for this file is very simple: tabular CRC32 is a horrible
; use of the CPU. It causes a lot of cache thrashing and is generally
; somewhat slow. Profiling shows that on the author's CPU,
; AMD Ryzen 7 PRO 7840U, the runtime share of CRC32 goes down from about
; 15% to about 0.5% with this simple implementation.
; Notice that there are better ways to do CRC32 that saturate the
; CPU's execution units. On most microarchitectures, the latency of CRC32
; is 3 cycles, while its reciprocal throughput is 1 per cycle, so three
; independent CRC32 dependency chains can run in parallel, hiding the
; latency. This is also done in this unit, see below. Another improvement
; stems from the use of the PCLMULQDQ instruction, which can be used to
; calculate CRC32 in parallel with the dedicated instruction, further
; increasing throughput to about 70-80 GiB/s.
; =============================================================================
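; =============================================================================
; A minimal C sketch of the 3-chain idea (illustrative only; assumes SSE4.2
; intrinsics from <nmmintrin.h>, that s0/s1/s2 point at the three chunks and
; n is the qword count per chunk; the step that combines the three partial
; CRCs is elided here, see `crc32c_32k_x86_64_sse42' below for the real
; thing):
;   uint64_t c0 = crc, c1 = 0, c2 = 0;
;   for (size_t i = 0; i < n; i++) {     /* three independent dep. chains */
;     c0 = _mm_crc32_u64(c0, s0[i]);     /* one CRC32 issued per cycle,   */
;     c1 = _mm_crc32_u64(c1, s1[i]);     /* hiding the 3-cycle latency    */
;     c2 = _mm_crc32_u64(c2, s2[i]);
;   }
; =============================================================================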
; =============================================================================
; Probe the CPU features, keeping in mind that the OS can disable
; AVX512F and AVX512VL support. The return value is a bitfield:
; rax & (1 << 0) - SSE4.2 support. rax & (1 << 1) - PCLMULQDQ support.
; rax & (1 << 2) - AVX512F support. rax & (1 << 3) - AVX512VL support.
; =============================================================================
xpar_x86_64_cpuflags:
push rbx
mov eax, 1
xor ecx, ecx
cpuid
mov eax, ecx
shr eax, 20
and eax, 0x01 ; CPUID.1:ECX[20] - SSE4.2
mov esi, ecx
and esi, 0x02 ; CPUID.1:ECX[1] - PCLMULQDQ
or esi, eax
not ecx
test ecx, 0x18000000 ; OSXSAVE (bit 27) and AVX (bit 28) must both be set
jne .no_osxsave
xor ecx, ecx
xgetbv
not eax
test al, 0xe6 ; XCR0: SSE/AVX state (bits 1-2) and opmask/ZMM state (bits 5-7)
jne .no_osxsave
mov eax, 0x07
xor ecx, ecx
cpuid
mov eax, ebx
shr eax, 14
and eax, 0x04 ; CPUID.7:EBX[16] - AVX512F, moved to bit 2
shr ebx, 31 ; CPUID.7:EBX[31] - AVX512VL
lea eax, [rax + 8 * rbx] ; ... moved to bit 3
or esi, eax
.no_osxsave:
mov eax, esi
pop rbx
ret
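; =============================================================================
; A caller-side dispatch sketch (illustrative, not part of this unit; the
; prototype is assumed and `encode' is a hypothetical function pointer):
;   extern int xpar_x86_64_cpuflags(void);
;   int fl = xpar_x86_64_cpuflags();
;   if ((fl & 0x0c) == 0x0c)        /* AVX512F and AVX512VL both usable */
;     encode = rse32_x86_64_avx512;
;   else
;     encode = rse32_x86_64_generic;
; =============================================================================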
; =============================================================================
; A simple SSE4.2 implementation particularly suited for small buffers.
; =============================================================================
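; Judging by the register use (System V ABI), the C prototype is roughly:
;   uint32_t crc32c_small_x86_64_sse42(uint32_t crc,      /* edi */
;                                      const void *buf,   /* rsi */
;                                      size_t len);       /* rdx */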
crc32c_small_x86_64_sse42:
mov eax, edi ; eax = running CRC
cmp rdx, 8
jb .no_byte_padding ; tiny input: handle it bytewise
lea rcx, qword [rdx - 8]
mov edi, ecx
shr edi, 3
inc edi
and edi, 7 ; qwords to peel so the 8-way loop below sees whole 64-byte blocks
je .no_qword_padding
shl edi, 3 ; ... in bytes
xor r8d, r8d
.qword_loop:
crc32 rax, qword [rsi + r8]
add r8, 8
cmp rdi, r8
jne .qword_loop
sub rdx, r8
add rsi, r8
.no_qword_padding:
cmp rcx, 56
jb .no_byte_padding
.8way_qword_loop:
crc32 rax, qword [rsi]
crc32 rax, qword [rsi + 8]
crc32 rax, qword [rsi + 16]
crc32 rax, qword [rsi + 24]
crc32 rax, qword [rsi + 32]
crc32 rax, qword [rsi + 40]
crc32 rax, qword [rsi + 48]
crc32 rax, qword [rsi + 56]
sub rdx, 64
add rsi, 64
cmp rdx, 7
ja .8way_qword_loop
.no_byte_padding:
test rdx, rdx
je .done
xor ecx, ecx
.byte_loop:
crc32 eax, byte [rsi + rcx]
inc rcx
cmp rdx, rcx
jne .byte_loop
.done:
ret
; =============================================================================
; Software CLMUL (carry-less multiply), for CPUs without PCLMULQDQ. 2x unrolled.
; =============================================================================
clmul:
mov edx, esi
xor ecx, ecx
xor eax, eax
.clmul_loop:
; General formula: res ^= (a & (1 << cl)) * b;
mov esi, 1
shl esi, cl
and esi, edi
imul rsi, rdx
xor rsi, rax
mov eax, 2
shl eax, cl
and eax, edi
imul rax, rdx
xor rax, rsi
add ecx, 2
cmp ecx, 32
jne .clmul_loop
ret
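; Roughly equivalent C (a sketch; a 32x32-bit carry-less multiplication
; producing a 64-bit product, matching the formula in the loop above):
;   uint64_t clmul(uint32_t a, uint32_t b) {
;     uint64_t r = 0;
;     for (int i = 0; i < 32; i++)
;       r ^= (uint64_t)(a & (1u << i)) * b;
;     return r;
;   }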
; =============================================================================
; 3-way saturating CRC32C implementation. Works in blocks of 32000 bytes.
; Defers to `crc32c_small_x86_64_sse42' (same argument registers) for the
; leftover bytes.
; =============================================================================
BLK32k equ 32000 ; NB: not 32768 - the CLMUL constants below depend on this.
crc32c_32k_x86_64_sse42:
push r15
push r14
push r13
push r12
push rbx
mov r14, rdx
mov rbx, rsi
test rdx, rdx
sete al ; al = input is empty
test bl, 7
sete cl ; cl = buffer is already 8-byte aligned
or cl, al
jne .crc32c_32k_check_size ; either way, skip the bytewise alignment loop
lea rax, [rbx + 1]
.crc32c_32k_byte_align:
crc32 edi, byte [rbx]
mov rcx, r14
inc rbx
dec r14
cmp rcx, 1
je .crc32c_32k_check_size
mov ecx, eax
and ecx, 7
inc rax
test rcx, rcx
jne .crc32c_32k_byte_align
.crc32c_32k_check_size:
cmp r14, BLK32k
jae .crc32c_32k_enter_loop
mov r15d, edi
.crc32c_32k_done:
mov edi, r15d
mov rsi, rbx
mov rdx, r14
pop rbx
pop r12
pop r13
pop r14
pop r15
jmp crc32c_small_x86_64_sse42
.crc32c_32k_combine:
mov esi, 0xABAEB715
call clmul
mov r13, rax
mov edi, r12d
mov esi, 0x10B2EE53
call clmul
xor rax, r13
xor rax, qword [rbx + BLK32k - 8]
crc32 r15, rax
add rbx, BLK32k
sub r14, BLK32k
mov edi, r15d
cmp r14, BLK32k - 1
jbe .crc32c_32k_done
.crc32c_32k_enter_loop:
xor eax, eax
xor r12d, r12d
xor r15d, r15d
.crc32c_32k_2x3way_qword_loop:
mov edi, edi ; zero-extend the running CRC into rdi
crc32 rdi, qword [rbx + 8 * rax]
crc32 r12, qword [rbx + 8 * rax + (BLK32k - 8) / 3]
crc32 r15, qword [rbx + 8 * rax + 2 * (BLK32k - 8) / 3]
cmp eax, BLK32k / 24 - 1
je .crc32c_32k_combine
crc32 rdi, qword [rbx + 8 * rax + 8]
crc32 r12, qword [rbx + 8 * rax + (BLK32k - 8) / 3 + 8]
crc32 r15, qword [rbx + 8 * rax + 2 * (BLK32k - 8) / 3 + 8]
add rax, 2
jmp .crc32c_32k_2x3way_qword_loop
; TO-DO: 3-way saturating CRC32C + concurrent PCLMULQDQ fusion
; implementation for 2MB blocks based on Peter Cawley's ideas.
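; =============================================================================
; The combine step above, as a C sketch (flow and constants taken from
; `.crc32c_32k_combine'; c0/c1/c2 are the three partial CRCs, `last' is the
; final qword of the block, and the two magic numbers are presumably
; precomputed x^n mod P folding factors for the chunk offsets):
;   uint64_t folded = clmul(c0, 0xABAEB715)
;                   ^ clmul(c1, 0x10B2EE53)
;                   ^ last;
;   crc = (uint32_t)_mm_crc32_u64(c2, folded);
; =============================================================================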
; =============================================================================
; A Reed-Solomon encoder is basically a shift register. Here, we observe
; that compilers are generally very bad at optimising this kind of pattern.
; This is however excusable, as the x86_64 instruction set does not
; accommodate it very well. Below we use a somewhat cheap trick: the
; 32-byte shift register is kept in XMM registers (128 bits wide each).
; Then we can use pslldq/psrldq to shift the data (as in a shift register)
; and make room for the new byte by masking with `rse32_wipe_mask' below.
; A version that works on all x86_64 machines.
; =============================================================================
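; The pattern being implemented, as a C sketch (PROD_GEN is a 256 x 32 table
; of generator-polynomial multiples; rdi = the K message bytes, rsi = the
; output buffer holding the message copy followed by the T parity bytes;
; this reading of the arguments is inferred from the code below):
;   uint8_t *par = out + K;               /* the 32-byte shift register */
;   for (int i = K; i > 0; i--) {
;     uint8_t fb = data[i - 1] ^ par[T - 1];      /* feedback byte      */
;     for (int j = T - 1; j > 0; j--)             /* shift up one byte  */
;       par[j] = par[j - 1] ^ PROD_GEN[fb][j];
;     par[0] = PROD_GEN[fb][0];                   /* zero shifted in    */
;   }
;   memcpy(out, data, K);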
align 16
rse32_wipe_mask:
db 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
db 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
K equ 223 ; message bytes per RS(255, 223) codeword
T equ 32 ; parity bytes
rse32_x86_64_generic:
mov rax, rsi ; rax = output buffer
mov rsi, rdi ; rsi = message bytes
pxor xmm0, xmm0
movdqu [rax + K + (T / 2)], xmm0
movdqu [rax + K], xmm0
movzx edx, byte [rax + K + T - 1]
movzx ecx, byte [rax + K + T - 2]
movzx r10d, byte [rax + K + T - 4]
movzx r8d, byte [rax + K + T - 3]
movd xmm3, dword [rax + K + T - 8]
movq xmm2, qword [rax + K + T - 16]
mov edi, K
lea r9, [rel PROD_GEN]
movdqa xmm1, [rel rse32_wipe_mask]
.rse32_K_loop:
; One register step: fb = data[i-1] ^ par[T-1]. The top three parity bytes
; are updated scalarly, the next 4 + 8 via partial XMM loads, the low 16 via
; a full XMM op; the tail of the loop then rotates the cached register halves.
xor dl, byte [rsi + rdi - 1]
movzx r11d, dl
shl r11d, 5
mov edx, ecx
xor dl, byte [r11 + r9 + T - 1]
mov byte [rax + K + T - 1], dl
mov ecx, r8d
xor cl, byte [r11 + r9 + T - 2]
mov byte [rax + K + T - 2], cl
mov r8d, r10d
xor r8b, byte [r11 + r9 + T - 3]
mov byte [rax + K + T - 3], r8b
movd xmm4, dword [r11 + r9 + T - 7]
pxor xmm4, xmm3
movd dword [rax + K + T - 7], xmm4
movq xmm3, qword [r11 + r9 + T - 15]
pxor xmm3, xmm2
movq qword [rax + K + T - 15], xmm3
movdqu xmm5, [r11 + r9 + 1]
pxor xmm5, xmm0
movdqu [rax + K + 1], xmm5
mov r10d, dword [r11 + r9]
mov byte [rax + K], r10b
movdqa xmm2, xmm5
pslldq xmm2, 1
movd xmm6, r10d
movdqa xmm0, xmm1
pandn xmm0, xmm6
por xmm0, xmm2
psrldq xmm5, 15
movdqa xmm2, xmm3
psllq xmm2, 8
por xmm2, xmm5
psrlq xmm3, 56
movdqa xmm5, xmm1
pandn xmm5, xmm3
movdqa xmm3, xmm4
pslld xmm3, 8
por xmm3, xmm5
movdqa [rsp - 24], xmm4 ; spill to the red zone to...
movzx r10d, byte [rsp - 21] ; ...extract byte 3 of xmm4 (no pextrb pre-SSE4.1)
dec rdi
jne .rse32_K_loop
mov ecx, K
mov rdi, rax ; rdi = output, rsi still points at the message
rep movsb ; copy the K message bytes into the output buffer
ret
; =============================================================================
; Same thing as above, but AVX gives us some interesting instructions that
; simplify working with the shift register pattern. My old version here used
; blends and permutes (and hence was marginally faster), but I found a better
; way.
; =============================================================================
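; The core of the rotation in intrinsics form (a sketch; `lo'/`hi' are the
; two 16-byte halves of the 32-byte register and `b' is the byte shifted in;
; vpalignr/vpinsrb are SSSE3/SSE4.1-era operations, but their VEX-encoded
; three-operand forms avoid the extra register copies):
;   hi = _mm_alignr_epi8(hi, lo, 15);            /* carry lo[15] into hi */
;   lo = _mm_insert_epi8(_mm_slli_si128(lo, 1), b, 0);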
align 16
rse32_shuf_mask:
db 1, 0, 2, 4, 0, 0, 0, 0
db 0, 0, 0, 0, 0, 0, 0, 0
rse32_x86_64_avx512:
vxorps xmm0, xmm0, xmm0
vmovups [rsi + K], ymm0
movzx ecx, byte [rsi + K + T - 1]
movzx eax, byte [rsi + K + T - 2]
movzx edx, byte [rsi + K + T - 3]
movzx r10d, byte [rsi + K + T - 4]
vmovd xmm3, dword [rsi + K + T - 8]
vmovq xmm2, qword [rsi + K + T - 16]
vmovupd xmm1, [rsi + K]
mov r8d, K
lea r9, [rel PROD_GEN]
vmovd xmm0, dword [rel rse32_shuf_mask]
.rse32_K_loop:
xor cl, byte [rdi + r8 - 1]
movzx r11d, cl
shl r11d, 5
mov ecx, eax
xor cl, byte [r11 + r9 + T - 1]
mov byte [rsi + K + T - 1], cl
mov eax, edx
xor al, byte [r11 + r9 + T - 2]
mov byte [rsi + K + T - 2], al
mov edx, r10d
xor dl, byte [r11 + r9 + T - 3]
mov byte [rsi + K + T - 3], dl
vmovd xmm4, dword [r11 + r9 + T - 7]
vpxor xmm4, xmm4, xmm3
vmovd dword [rsi + K + T - 7], xmm4
vmovq xmm3, qword [r11 + r9 + T - 15]
vpxor xmm3, xmm3, xmm2
vmovq qword [rsi + K + T - 15], xmm3
vxorpd xmm2, xmm1, [r11 + r9 + 1]
vmovupd [rsi + K + 1], xmm2
mov r10d, dword [r11 + r9]
mov byte [rsi + K], r10b
vpslldq xmm1, xmm2, 1
vpinsrb xmm1, xmm1, r10d, 0
vpalignr xmm2, xmm3, xmm2, 15
vpsrlq xmm3, xmm3, 56
vpunpcklbw xmm3, xmm4, xmm3
vpshufb xmm3, xmm3, xmm0
vpextrb r10d, xmm4, 3
dec r8
jne .rse32_K_loop
mov ecx, K
xchg rdi, rsi ; rdi = output, rsi = message
rep movsb ; copy the K message bytes into the output buffer
vzeroupper ; <-- If you remove this because you don't understand what it
ret ; does, I will find you, and it will not end well for you.
; Contrary to what some people claim, this is still necessary on AVX-512 CPUs.
; A .note.GNU-stack section is needed to mark the stack as non-executable on
; ELF targets.
%ifdef ELF
section .note.GNU-stack noalloc noexec nowrite progbits
%endif