Skip to content

Commit

Permalink
aarch64-machine-specific code
Browse files Browse the repository at this point in the history
  • Loading branch information
kspalaiologos committed Oct 15, 2024
1 parent 3ba5794 commit 5e5a6b3
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 15 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ jobs:
name: Compile (MacOS clang, aarch64)
needs: [ dist ]
runs-on: macOS-latest
strategy:
fail-fast: false
matrix:
feature: [ enable-aarch64, disable-aarch64 ]
steps:
- name: Download source package artifact
uses: actions/download-artifact@v4
Expand All @@ -76,7 +80,7 @@ jobs:
- name: Extract source package
run: tar --strip-components=1 -xf xpar-${{ github.sha }}.tar.gz
- name: Configure
run: ./configure CC=clang
run: ./configure CC=clang --${{ matrix.feature }}
- name: Make
run: make all self-check

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ jobs:
fail-fast: false
matrix:
target:
- [ "x86_64-linux", "--enable-static --enable-lto", "" ]
- [ "x86_64", "CC=x86_64-w64-mingw32-gcc --host x86_64-w64-mingw32 --enable-static --enable-lto", "gcc-mingw-w64-x86-64" ]
- [ "x86_64-linux", "--enable-static --enable-lto --enable-x86-64", "" ]
- [ "x86_64", "CC=x86_64-w64-mingw32-gcc --host x86_64-w64-mingw32 --enable-static --enable-lto --enable-x86-64", "gcc-mingw-w64-x86-64" ]
- [ "i686", "CC=i686-w64-mingw32-gcc --host i686-w64-mingw32 --enable-static --enable-lto", "gcc-mingw-w64-i686" ]
steps:
- name: Download source package artifact
Expand Down
4 changes: 4 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ SUFFIXES = .asm
$(NASM) $(NAFLAGS) -o $@ $<
endif

# Add the aarch64 assembly source (run through the C preprocessor, hence the
# capital .S) to the xpar program only when the XPAR_AARCH64 conditional is true.
if XPAR_AARCH64
xpar_SOURCES += xpar-aarch64.S
endif

# Developer convenience targets
.PHONY: update-ChangeLog
update-ChangeLog:
Expand Down
10 changes: 1 addition & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,11 @@ A rough outline of some development-related topics below.
## Roadmap

- Need to implement the parallel variant.
- Need to provide automatic testing for all configurations.
- Write a proper readme, manpages, etc.
- Make sure that this builds on Windows and port over the assembly code.
- Fuzz to find segfaults.
- Apple M1-specific optimizations to CRC32.
- 32- vs 64-bit code: determine if there's any compatibility issues.

Low priority:
- Speed up the joint mode encoder loop (basically a LFSR).
- Should probably not pad to the full interlacing block size.
- Add assembly routines for hot spots in the program.

Expand All @@ -61,8 +58,3 @@ As it stands:

Code style:
- Two space indent, brace on the same line, middle pointers - `char * p;`.

## Show-stoppers

clang bug (msys2 packages repository ticket 4958) - makes it impossible to
build on x86_64 Windows with clang. Only gcc is supported.
9 changes: 9 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ AM_INIT_AUTOMAKE([-Wall color-tests])
AC_PROG_INSTALL
AC_PROG_MAKE_SET
AC_PROG_CC
AM_PROG_AS

AC_CHECK_HEADERS([getopt.h io.h])
AC_CHECK_FUNCS([getopt_long asprintf strndup stat _commit _setmode isatty fsync mmap CreateFileMappingA])
Expand All @@ -34,6 +35,14 @@ else
AM_CONDITIONAL([XPAR_X86_64], [false])
fi

dnl --enable-aarch64: opt in to the aarch64-specific code (defaults to "no").
dnl Only the exact value "yes" defines XPAR_AARCH64 in the config header and
dnl turns the XPAR_AARCH64 automake conditional on.
AC_ARG_ENABLE([aarch64],
  [AS_HELP_STRING([--enable-aarch64], [Enable aarch64 platform specific code.])],
  [enable_aarch64=$enableval],
  [enable_aarch64=no])
AS_IF([test "x$enable_aarch64" = "xyes"],
  [AC_DEFINE([XPAR_AARCH64], [1], [Enable aarch64 platform specific code.])])
AM_CONDITIONAL([XPAR_AARCH64], [test "x$enable_aarch64" = "xyes"])

AC_ARG_ENABLE([native], [AS_HELP_STRING([--enable-native], [Enable native platform optimisations.])], [enable_native=$enableval], [enable_native=no])
if test "x$enable_native" = "xyes"; then
AX_APPEND_COMPILE_FLAGS([-march=native -mtune=native])
Expand Down
16 changes: 13 additions & 3 deletions crc32c.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,26 +89,36 @@ u32 crc32c_tabular(u32 crc, u8 * data, sz length) {
return crc;
}

typedef u32 (*crc32c_func)(u32, u8 *, sz);

#if defined(XPAR_X86_64)
#ifdef HAVE_FUNC_ATTRIBUTE_SYSV_ABI
#define EXTERNAL_ABI __attribute__((sysv_abi))
#else
#define EXTERNAL_ABI
#endif

typedef u32 (*crc32c_func)(u32, u8 *, sz);

extern EXTERNAL_ABI int crc32c_x86_64_cpuflags(void);
extern EXTERNAL_ABI u32 crc32c_small_x86_64_sse42(u32, u8 *, sz);
#elif defined(XPAR_AARCH64)
extern int crc32c_aarch64_cpuflags(void);
extern u32 crc32c_small_aarch64_neon(u32, u8 *, sz);
#endif

u32 crc32c(u8 * data, sz length) {
#if defined(XPAR_X86_64)
static int cpuflags = -1;
#if defined(XPAR_X86_64)
if (cpuflags == -1) cpuflags = crc32c_x86_64_cpuflags();
if (cpuflags & 1)
return crc32c_small_x86_64_sse42(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
else
return crc32c_tabular(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
#elif defined(XPAR_AARCH64)
if (cpuflags == -1) cpuflags = crc32c_aarch64_cpuflags();
if (cpuflags)
return crc32c_small_aarch64_neon(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
else
return crc32c_tabular(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
#else
return crc32c_tabular(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
#endif
Expand Down
120 changes: 120 additions & 0 deletions xpar-aarch64.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
Copyright (C) 2022-2024 Kamila Szewczyk
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

.text
.extern getauxval

#if defined(__APPLE__)
/*
 * int crc32c_aarch64_cpuflags(void)            -- aarch64 macOS variant
 *
 * Queries the "hw.optional.armv8_crc32" sysctl and returns 1 when the call
 * succeeds and the reported 32-bit value is non-zero, 0 otherwise.
 *
 * Out:     w0 = 0 or 1
 * Stack:   32 bytes
 *            [sp, #0]   size_t length slot (in/out argument of sysctlbyname)
 *            [sp, #12]  32-bit result slot (addressed as x29 - 4)
 *            [sp, #16]  saved x29 / x30
 * Calls:   sysctlbyname(name, &value, &length, NULL, 0)
 *
 * Only C-style comments are used in this file: the preprocessor strips them,
 * so they are safe no matter which character the assembler treats as a
 * comment introducer.
 */
        .extern _sysctlbyname
        .globl  _crc32c_aarch64_cpuflags
_crc32c_aarch64_cpuflags:
        sub     sp, sp, #32
        stp     x29, x30, [sp, #16]
        add     x29, sp, #16
        mov     w8, #4                          /* size of the result slot in bytes */
        adrp    x0, .name@GOTPAGE
        ldr     x0, [x0, .name@GOTPAGEOFF]      /* x0 = address of the sysctl name */
        sub     x1, x29, #4                     /* x1 = &value */
        mov     x2, sp                          /* x2 = &length */
        mov     x3, xzr                         /* no new value: read-only query */
        mov     x4, xzr
        stur    wzr, [x29, #-4]                 /* value = 0, so a failed call reads as "absent" */
        str     x8, [sp]                        /* length = 4 */
        bl      _sysctlbyname
        ldur    w8, [x29, #-4]
        cmp     w0, #0                          /* did the call succeed? */
        ccmp    w8, #0, #4, eq                  /* if so compare value with 0, else force Z=1 */
        cset    w0, ne                          /* 1 only when success and value != 0 */
        ldp     x29, x30, [sp, #16]
        add     sp, sp, #32
        ret
.name:  .asciz  "hw.optional.armv8_crc32"
        /* The string lives in .text; keep whatever code follows 4-byte aligned. */
        .p2align 2
#else
/*
 * int crc32c_aarch64_cpuflags(void)            -- aarch64 Linux variant
 *
 * Returns getauxval(AT_HWCAP) masked with 128 (bit 7), i.e. either 0 or 128.
 * Callers should treat the result as a boolean.
 *
 * Out:     w0 = 0 or 128
 * Stack:   16 bytes (saved x29 / x30)
 * Calls:   getauxval
 *
 * Note: in GNU as for AArch64 the ';' character separates statements, it
 * does not start a comment, so only C-style comments are used here.
 */
        .extern getauxval
        .globl  crc32c_aarch64_cpuflags
crc32c_aarch64_cpuflags:
        stp     x29, x30, [sp, -16]!
        mov     x0, 16                          /* AT_HWCAP */
        mov     x29, sp
        bl      getauxval
        ldp     x29, x30, [sp], 16
        and     w0, w0, 128                     /* keep bit 7 only (presumably HWCAP_CRC32 - confirm against <asm/hwcap.h>) */
        ret
#endif

/* Uses fundamentally the same algorithm as crc32c_small_x86_64_sse42 */

/*
 * u32 crc32c_small_aarch64_neon(u32 crc, u8 * data, sz length)
 *
 * Folds `length` bytes starting at `data` into the running CRC-32C value
 * `crc` and returns the updated value. No initial or final inversion is
 * applied here; the caller supplies the seed and post-processes the result.
 *
 * Despite the name, the routine uses the scalar CRC32 extension instructions
 * (crc32cx / crc32cb), not NEON vector instructions.
 *
 * In:      w0 = crc, x1 = data, x2 = length in bytes (0 is allowed)
 * Out:     w0 = updated crc
 * Clobb:   x1-x8, flags
 * Stack:   none (leaf function)
 *
 * Three phases, each consuming what the previous one left over:
 *   1. 64-byte blocks  (eight 64-bit words per iteration)
 *   2. 8-byte words
 *   3. single bytes
 */
#if defined(__APPLE__)
        .globl  _crc32c_small_aarch64_neon
_crc32c_small_aarch64_neon:
#else
        /*
         * crc32c* are part of the optional CRC extension; enable it explicitly
         * so the file assembles even when the toolchain targets baseline
         * armv8-a. Availability at run time is checked separately through
         * crc32c_aarch64_cpuflags.
         */
        .arch_extension crc
        .globl  crc32c_small_aarch64_neon
crc32c_small_aarch64_neon:
#endif
        /* Phase 1: skipped entirely when fewer than 64 bytes are available. */
        cmp     x2, 63
        bls     .fallback_1way
        sub     x8, x2, #64
        mov     x3, x1                          /* x3 = read cursor */
        and     x7, x8, -64
        add     x7, x7, 64
        add     x7, x1, x7                      /* x7 = end of the last full 64-byte block */
.crc32c_8way_quad:
        ldp     x4, x6, [x3]
        crc32cx w0, w0, x4
        ldp     x4, x5, [x3, 16]
        crc32cx w0, w0, x6
        crc32cx w0, w0, x4
        ldp     x4, x6, [x3, 32]
        crc32cx w0, w0, x5
        crc32cx w0, w0, x4
        ldp     x5, x4, [x3, 48]
        crc32cx w0, w0, x6
        add     x3, x3, 64
        crc32cx w0, w0, x5
        crc32cx w0, w0, x4
        cmp     x3, x7
        bne     .crc32c_8way_quad
        /* Advance data past the consumed blocks; keep only the tail length. */
        and     x8, x8, -64
        add     x1, x1, 64
        add     x1, x8, x1
        and     x2, x2, 63
.fallback_1way:
        /* Phase 2: skipped when fewer than 8 bytes remain. */
        cmp     x2, 7
        bls     .fallback_1way_byte
        sub     x6, x2, #8
        mov     x3, x1                          /* x3 = read cursor */
        lsr     x5, x6, 3
        add     w5, w5, 1                       /* number of whole 8-byte words (x2 < 64 here) */
        add     x5, x1, x5, lsl 3               /* x5 = end of the last whole word */
.crc32c_1way_quad:
        ldr     x4, [x3], 8
        crc32cx w0, w0, x4
        cmp     x3, x5
        bne     .crc32c_1way_quad
        /* Advance data past the consumed words; keep only the tail length. */
        and     x6, x6, -8
        add     x1, x1, 8
        add     x1, x6, x1
        and     x2, x2, 7
.fallback_1way_byte:
        /* Phase 3: at most 7 bytes remain. */
        cbz     x2, .crc32c_done
        add     x2, x1, x2                      /* x2 = one past the last byte */
.crc32c_1way_byte:
        ldrb    w3, [x1], 1
        crc32cb w0, w0, w3
        cmp     x1, x2
        bne     .crc32c_1way_byte
.crc32c_done:
        ret

0 comments on commit 5e5a6b3

Please sign in to comment.