From 0a3490c382ba63accf4fe4abc1744c63ec6717cf Mon Sep 17 00:00:00 2001 From: Romain Dolbeau <romain@dolbeau.org> Date: Sat, 25 Jul 2020 13:05:43 +0200 Subject: [PATCH 01/13] Clean rebuild of the arm-sve-alt branch (at 3b1a5c7468af05f1ce20c3b48a82e0948d093dfe) --- Makefile.am | 17 +- api/version.c | 4 + configure.ac | 16 + dft/codelet-dft.h | 5 + dft/conf.c | 12 + dft/simd/Makefile.am | 2 +- dft/simd/sve1024/Makefile.am | 13 + dft/simd/sve128/Makefile.am | 13 + dft/simd/sve2048/Makefile.am | 13 + dft/simd/sve256/Makefile.am | 13 + dft/simd/sve512/Makefile.am | 13 + kernel/ifftw.h | 1 + rdft/codelet-rdft.h | 5 + rdft/conf.c | 12 + rdft/simd/Makefile.am | 2 +- rdft/simd/sve1024/Makefile.am | 13 + rdft/simd/sve128/Makefile.am | 13 + rdft/simd/sve2048/Makefile.am | 13 + rdft/simd/sve256/Makefile.am | 13 + rdft/simd/sve512/Makefile.am | 13 + simd-support/Makefile.am | 3 +- simd-support/generate_vtw.c | 79 ++++ simd-support/generate_vtw.sh | 13 + simd-support/simd-common.h | 2 +- simd-support/simd-maskedsve.h | 305 +++++++++++++ simd-support/simd-maskedsve1024.h | 31 ++ simd-support/simd-maskedsve128.h | 31 ++ simd-support/simd-maskedsve2048.h | 31 ++ simd-support/simd-maskedsve256.h | 31 ++ simd-support/simd-maskedsve512.h | 31 ++ simd-support/sve.c | 49 ++ simd-support/vtw.h | 729 ++++++++++++++++++++++++++++++ 32 files changed, 1536 insertions(+), 5 deletions(-) create mode 100644 dft/simd/sve1024/Makefile.am create mode 100644 dft/simd/sve128/Makefile.am create mode 100644 dft/simd/sve2048/Makefile.am create mode 100644 dft/simd/sve256/Makefile.am create mode 100644 dft/simd/sve512/Makefile.am create mode 100644 rdft/simd/sve1024/Makefile.am create mode 100644 rdft/simd/sve128/Makefile.am create mode 100644 rdft/simd/sve2048/Makefile.am create mode 100644 rdft/simd/sve256/Makefile.am create mode 100644 rdft/simd/sve512/Makefile.am create mode 100644 simd-support/generate_vtw.c create mode 100755 simd-support/generate_vtw.sh create mode 100644 simd-support/simd-maskedsve.h create mode 100644 simd-support/simd-maskedsve1024.h create mode 100644 simd-support/simd-maskedsve128.h create mode 100644 simd-support/simd-maskedsve2048.h create mode 100644 simd-support/simd-maskedsve256.h create mode 100644 simd-support/simd-maskedsve512.h create mode 100644 simd-support/sve.c create mode 100644 simd-support/vtw.h diff --git a/Makefile.am b/Makefile.am index eaf131cca..1704670d3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -94,6 +94,21 @@ NEON_LIBS = dft/simd/neon/libdft_neon_codelets.la \ rdft/simd/neon/librdft_neon_codelets.la endif +if HAVE_SVE +SVE_LIBS = \ +dft/simd/sve128/libdft_sve128_codelets.la \ +rdft/simd/sve128/librdft_sve128_codelets.la \ +dft/simd/sve256/libdft_sve256_codelets.la \ +rdft/simd/sve256/librdft_sve256_codelets.la \ +dft/simd/sve512/libdft_sve512_codelets.la \ +rdft/simd/sve512/librdft_sve512_codelets.la \ +dft/simd/sve1024/libdft_sve1024_codelets.la \ +rdft/simd/sve1024/librdft_sve1024_codelets.la \ +dft/simd/sve2048/libdft_sve2048_codelets.la \ +rdft/simd/sve2048/librdft_sve2048_codelets.la +endif + + if HAVE_GENERIC_SIMD128 GENERIC_SIMD128_LIBS = dft/simd/generic-simd128/libdft_generic_simd128_codelets.la \ rdft/simd/generic-simd128/librdft_generic_simd128_codelets.la @@ -126,7 +141,7 @@ libfftw3@PREC_SUFFIX@_la_LIBADD = \ api/libapi.la \ $(SIMD_LIBS) $(SSE2_LIBS) $(AVX_LIBS) $(AVX_128_FMA_LIBS) \ $(AVX2_LIBS) $(ALTIVEC_LIBS) \ - $(VSX_LIBS) $(NEON_LIBS) $(KCVI_LIBS) $(AVX512_LIBS) \ + $(VSX_LIBS) $(NEON_LIBS) $(SVE_LIBS) $(KCVI_LIBS) $(AVX512_LIBS) \ 
$(GENERIC_SIMD128_LIBS) $(GENERIC_SIMD256_LIBS) \ $(COMBINED_THREADLIBS) diff --git a/api/version.c b/api/version.c index 4f14de157..0f79ce759 100644 --- a/api/version.c +++ b/api/version.c @@ -77,6 +77,10 @@ const char X(version)[] = PACKAGE "-" PACKAGE_VERSION "-neon" #endif +#if HAVE_SVE + "-sve" +#endif + #if defined(HAVE_GENERIC_SIMD128) "-generic_simd128" #endif diff --git a/configure.ac b/configure.ac index 5e33b1c2b..e6fd591bf 100644 --- a/configure.ac +++ b/configure.ac @@ -235,6 +235,12 @@ if test "$have_generic_simd256" = "yes"; then fi AM_CONDITIONAL(HAVE_GENERIC_SIMD256, test "$have_generic_simd256" = "yes") +AC_ARG_ENABLE(sve, [AC_HELP_STRING([--enable-sve],[enable ARM SVE optimizations])], have_sve=$enableval, have_sve=no) +if test "$have_sve" = "yes"; then + AC_DEFINE(HAVE_SVE,1,[Define to enable ARM SVE optimizations.]) +fi +AM_CONDITIONAL(HAVE_SVE, test "$have_sve" = "yes") + dnl FIXME: dnl AC_ARG_ENABLE(mips-ps, [AS_HELP_STRING([--enable-mips-ps],[enable MIPS pair-single optimizations])], have_mips_ps=$enableval, have_mips_ps=no) @@ -766,6 +772,11 @@ AC_CONFIG_FILES([ dft/simd/altivec/Makefile dft/simd/vsx/Makefile dft/simd/neon/Makefile + dft/simd/sve128/Makefile + dft/simd/sve256/Makefile + dft/simd/sve512/Makefile + dft/simd/sve1024/Makefile + dft/simd/sve2048/Makefile dft/simd/generic-simd128/Makefile dft/simd/generic-simd256/Makefile @@ -786,6 +797,11 @@ AC_CONFIG_FILES([ rdft/simd/altivec/Makefile rdft/simd/vsx/Makefile rdft/simd/neon/Makefile + rdft/simd/sve128/Makefile + rdft/simd/sve256/Makefile + rdft/simd/sve512/Makefile + rdft/simd/sve1024/Makefile + rdft/simd/sve2048/Makefile rdft/simd/generic-simd128/Makefile rdft/simd/generic-simd256/Makefile diff --git a/dft/codelet-dft.h b/dft/codelet-dft.h index b78e135c8..2ba9bcb66 100644 --- a/dft/codelet-dft.h +++ b/dft/codelet-dft.h @@ -106,6 +106,11 @@ extern const solvtab X(solvtab_dft_kcvi); extern const solvtab X(solvtab_dft_altivec); extern const solvtab X(solvtab_dft_vsx); extern const solvtab X(solvtab_dft_neon); +extern const solvtab X(solvtab_dft_sve128); +extern const solvtab X(solvtab_dft_sve256); +extern const solvtab X(solvtab_dft_sve512); +extern const solvtab X(solvtab_dft_sve1024); +extern const solvtab X(solvtab_dft_sve2048); extern const solvtab X(solvtab_dft_generic_simd128); extern const solvtab X(solvtab_dft_generic_simd256); diff --git a/dft/conf.c b/dft/conf.c index d0951de5d..2cad5c0cd 100644 --- a/dft/conf.c +++ b/dft/conf.c @@ -79,6 +79,18 @@ void X(dft_conf_standard)(planner *p) if (X(have_simd_neon)()) X(solvtab_exec)(X(solvtab_dft_neon), p); #endif +#if HAVE_SVE + if (X(have_simd_sve)(128)) + X(solvtab_exec)(X(solvtab_dft_sve128), p); + if (X(have_simd_sve)(256)) + X(solvtab_exec)(X(solvtab_dft_sve256), p); + if (X(have_simd_sve)(512)) + X(solvtab_exec)(X(solvtab_dft_sve512), p); + if (X(have_simd_sve)(1024)) + X(solvtab_exec)(X(solvtab_dft_sve1024), p); + if (X(have_simd_sve)(2048)) + X(solvtab_exec)(X(solvtab_dft_sve2048), p); +#endif #if HAVE_GENERIC_SIMD128 X(solvtab_exec)(X(solvtab_dft_generic_simd128), p); #endif diff --git a/dft/simd/Makefile.am b/dft/simd/Makefile.am index 315d74474..7b5f28b1b 100644 --- a/dft/simd/Makefile.am +++ b/dft/simd/Makefile.am @@ -1,4 +1,4 @@ AM_CPPFLAGS = -I $(top_srcdir) -SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256 +SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon sve128 sve256 sve512 sve1024 sve2048 generic-simd128 generic-simd256 EXTRA_DIST = n1b.h 
n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h \ t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk diff --git a/dft/simd/sve1024/Makefile.am b/dft/simd/sve1024/Makefile.am new file mode 100644 index 000000000..89b996197 --- /dev/null +++ b/dft/simd/sve1024/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve1024.h + +include $(top_srcdir)/dft/simd/codlist.mk +include $(top_srcdir)/dft/simd/simd.mk + +if HAVE_SVE + +BUILT_SOURCES = $(EXTRA_DIST) +noinst_LTLIBRARIES = libdft_sve1024_codelets.la +libdft_sve1024_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/dft/simd/sve128/Makefile.am b/dft/simd/sve128/Makefile.am new file mode 100644 index 000000000..9609917bf --- /dev/null +++ b/dft/simd/sve128/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve128.h + +include $(top_srcdir)/dft/simd/codlist.mk +include $(top_srcdir)/dft/simd/simd.mk + +if HAVE_SVE + +BUILT_SOURCES = $(EXTRA_DIST) +noinst_LTLIBRARIES = libdft_sve128_codelets.la +libdft_sve128_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/dft/simd/sve2048/Makefile.am b/dft/simd/sve2048/Makefile.am new file mode 100644 index 000000000..f633df29b --- /dev/null +++ b/dft/simd/sve2048/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve2048.h + +include $(top_srcdir)/dft/simd/codlist.mk +include $(top_srcdir)/dft/simd/simd.mk + +if HAVE_SVE + +BUILT_SOURCES = $(EXTRA_DIST) +noinst_LTLIBRARIES = libdft_sve2048_codelets.la +libdft_sve2048_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/dft/simd/sve256/Makefile.am b/dft/simd/sve256/Makefile.am new file mode 100644 index 000000000..6f21f3a3b --- /dev/null +++ b/dft/simd/sve256/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve256.h + +include $(top_srcdir)/dft/simd/codlist.mk +include $(top_srcdir)/dft/simd/simd.mk + +if HAVE_SVE + +BUILT_SOURCES = $(EXTRA_DIST) +noinst_LTLIBRARIES = libdft_sve256_codelets.la +libdft_sve256_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/dft/simd/sve512/Makefile.am b/dft/simd/sve512/Makefile.am new file mode 100644 index 000000000..dc57ffb5e --- /dev/null +++ b/dft/simd/sve512/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve512.h + +include $(top_srcdir)/dft/simd/codlist.mk +include $(top_srcdir)/dft/simd/simd.mk + +if HAVE_SVE + +BUILT_SOURCES = $(EXTRA_DIST) +noinst_LTLIBRARIES = libdft_sve512_codelets.la +libdft_sve512_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/kernel/ifftw.h b/kernel/ifftw.h index b4705ba8d..bd6efacc7 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -119,6 +119,7 @@ extern int X(have_simd_avx512)(void); extern int X(have_simd_altivec)(void); extern int X(have_simd_vsx)(void); extern int X(have_simd_neon)(void); +extern int X(have_simd_sve)(int minwidth); /* forward declarations */ typedef struct problem_s problem; diff --git a/rdft/codelet-rdft.h b/rdft/codelet-rdft.h index 789040f65..07b62312b 100644 --- a/rdft/codelet-rdft.h +++ b/rdft/codelet-rdft.h @@ -145,6 +145,11 @@ extern const solvtab X(solvtab_rdft_kcvi); extern const solvtab X(solvtab_rdft_altivec); extern const solvtab X(solvtab_rdft_vsx); extern const solvtab X(solvtab_rdft_neon); +extern const solvtab X(solvtab_rdft_sve128); +extern const solvtab X(solvtab_rdft_sve256); +extern const solvtab X(solvtab_rdft_sve512); +extern const solvtab X(solvtab_rdft_sve1024); +extern 
const solvtab X(solvtab_rdft_sve2048); extern const solvtab X(solvtab_rdft_generic_simd128); extern const solvtab X(solvtab_rdft_generic_simd256); diff --git a/rdft/conf.c b/rdft/conf.c index 5fe8d665f..752d25820 100644 --- a/rdft/conf.c +++ b/rdft/conf.c @@ -96,6 +96,18 @@ void X(rdft_conf_standard)(planner *p) if (X(have_simd_neon)()) X(solvtab_exec)(X(solvtab_rdft_neon), p); #endif +#if HAVE_SVE + if (X(have_simd_sve)(128)) + X(solvtab_exec)(X(solvtab_rdft_sve128), p); + if (X(have_simd_sve)(256)) + X(solvtab_exec)(X(solvtab_rdft_sve256), p); + if (X(have_simd_sve)(512)) + X(solvtab_exec)(X(solvtab_rdft_sve512), p); + if (X(have_simd_sve)(1024)) + X(solvtab_exec)(X(solvtab_rdft_sve1024), p); + if (X(have_simd_sve)(2048)) + X(solvtab_exec)(X(solvtab_rdft_sve2048), p); +#endif #if HAVE_GENERIC_SIMD128 X(solvtab_exec)(X(solvtab_rdft_generic_simd128), p); #endif diff --git a/rdft/simd/Makefile.am b/rdft/simd/Makefile.am index 53de164f0..42dc4d74f 100644 --- a/rdft/simd/Makefile.am +++ b/rdft/simd/Makefile.am @@ -1,4 +1,4 @@ AM_CPPFLAGS = -I $(top_srcdir) -SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256 +SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon sve128 sve256 sve512 sve1024 sve2048 generic-simd128 generic-simd256 EXTRA_DIST = hc2cbv.h hc2cfv.h codlist.mk simd.mk diff --git a/rdft/simd/sve1024/Makefile.am b/rdft/simd/sve1024/Makefile.am new file mode 100644 index 000000000..e02438d1a --- /dev/null +++ b/rdft/simd/sve1024/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve1024.h + +include $(top_srcdir)/rdft/simd/codlist.mk +include $(top_srcdir)/rdft/simd/simd.mk + +if HAVE_SVE + +noinst_LTLIBRARIES = librdft_sve1024_codelets.la +BUILT_SOURCES = $(EXTRA_DIST) +librdft_sve1024_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/rdft/simd/sve128/Makefile.am b/rdft/simd/sve128/Makefile.am new file mode 100644 index 000000000..3bc5216d4 --- /dev/null +++ b/rdft/simd/sve128/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve128.h + +include $(top_srcdir)/rdft/simd/codlist.mk +include $(top_srcdir)/rdft/simd/simd.mk + +if HAVE_SVE + +noinst_LTLIBRARIES = librdft_sve128_codelets.la +BUILT_SOURCES = $(EXTRA_DIST) +librdft_sve128_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/rdft/simd/sve2048/Makefile.am b/rdft/simd/sve2048/Makefile.am new file mode 100644 index 000000000..025aa4de4 --- /dev/null +++ b/rdft/simd/sve2048/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve2048.h + +include $(top_srcdir)/rdft/simd/codlist.mk +include $(top_srcdir)/rdft/simd/simd.mk + +if HAVE_SVE + +noinst_LTLIBRARIES = librdft_sve2048_codelets.la +BUILT_SOURCES = $(EXTRA_DIST) +librdft_sve2048_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/rdft/simd/sve256/Makefile.am b/rdft/simd/sve256/Makefile.am new file mode 100644 index 000000000..c58adb578 --- /dev/null +++ b/rdft/simd/sve256/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve256.h + +include $(top_srcdir)/rdft/simd/codlist.mk +include $(top_srcdir)/rdft/simd/simd.mk + +if HAVE_SVE + +noinst_LTLIBRARIES = librdft_sve256_codelets.la +BUILT_SOURCES = $(EXTRA_DIST) +librdft_sve256_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/rdft/simd/sve512/Makefile.am b/rdft/simd/sve512/Makefile.am new file mode 100644 index 000000000..db9c030cb --- 
/dev/null +++ b/rdft/simd/sve512/Makefile.am @@ -0,0 +1,13 @@ +AM_CFLAGS = $(SVE_CFLAGS) +SIMD_HEADER=simd-support/simd-maskedsve512.h + +include $(top_srcdir)/rdft/simd/codlist.mk +include $(top_srcdir)/rdft/simd/simd.mk + +if HAVE_SVE + +noinst_LTLIBRARIES = librdft_sve512_codelets.la +BUILT_SOURCES = $(EXTRA_DIST) +librdft_sve512_codelets_la_SOURCES = $(BUILT_SOURCES) + +endif diff --git a/simd-support/Makefile.am b/simd-support/Makefile.am index 26db46e93..60b705377 100644 --- a/simd-support/Makefile.am +++ b/simd-support/Makefile.am @@ -11,5 +11,6 @@ avx512.c simd-avx512.h \ kcvi.c simd-kcvi.h \ altivec.c simd-altivec.h vsx.c simd-vsx.h \ neon.c simd-neon.h \ -simd-generic128.h simd-generic256.h +simd-generic128.h simd-generic256.h \ +sve.c simd-maskedsve.h simd-maskedsve128.h simd-maskedsve256.h simd-maskedsve512.h simd-maskedsve1024.h simd-maskedsve2048.h diff --git a/simd-support/generate_vtw.c b/simd-support/generate_vtw.c new file mode 100644 index 000000000..505a5804c --- /dev/null +++ b/simd-support/generate_vtw.c @@ -0,0 +1,79 @@ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <malloc.h> + +unsigned int rp2(unsigned int size) { + size = size | (size >> 1); + size = size | (size >> 2); + size = size | (size >> 4); + size = size | (size >> 8); + size = size | (size >> 16); +// size = size | (size >> 32); + size = size - (size >> 1); + return size; +} + +int main(int argc, char **argv) { + if (argc < 3) { + printf("usage: %s <array_name> <width>\n", argv[0]); + exit(-1); + } + if (strncmp(argv[1], "VTW1", 4) == 0) { + unsigned int osize = atoi(argv[2]); + unsigned int size = rp2(osize); + if (osize != size) + exit(-4); + if (size < 1) + exit(-2); + if (size > 256) + exit(-3); + printf("#define VTW1(v,x) "); + for (unsigned int i = 0 ; i < size ; i++) { + printf("{TW_CEXP, v+%d, x}%s%s", i, (i == size-1?"":","), ((i%4==3 && i!=size-1)?" \\\n\t":" ")); + } + printf("\n"); + } + if (strncmp(argv[1], "VTW2", 4) == 0) { + unsigned int osize = atoi(argv[2]); + unsigned int size = rp2(osize); + if (osize != size) + exit(-4); + if (size < 1) + exit(-2); + if (size > 256) + exit(-3); + printf("#define VTW2(v,x) "); + for (unsigned int i = 0 ; i < size ; i++) { + printf("{TW_COS, v+%d, x}%s%s", i/2, ",", ((i%4==3)?" \\\n\t":" ")); + } + for (unsigned int i = 0 ; i < size ; i++) { + printf("{TW_SIN, v+%d, %sx}%s%s", i/2, (i%2==0?"-":""), (i == size-1?"":","), ((i%4==3 && i!=size-1)?" \\\n\t":" ")); + } + + printf("\n"); + } + if (strncmp(argv[1], "VTWS", 4) == 0) { + unsigned int osize = atoi(argv[2]); + unsigned int size = rp2(osize); + if (osize != size) + exit(-4); + if (size < 1) + exit(-2); + if (size > 256) + exit(-3); + printf("#define VTWS(v,x) "); + for (unsigned int i = 0 ; i < size ; i++) { + printf("{TW_COS, v+%d, x}%s%s", i, ",", ((i%4==3)?" \\\n\t":" ")); + } + for (unsigned int i = 0 ; i < size ; i++) { + printf("{TW_SIN, v+%d, x}%s%s", i, (i == size-1?"":","), ((i%4==3 && i!=size-1)?" 
\\\n\t":" ")); + } + + printf("\n"); + } + + + + return 0; +} diff --git a/simd-support/generate_vtw.sh b/simd-support/generate_vtw.sh new file mode 100755 index 000000000..f4a1cfa15 --- /dev/null +++ b/simd-support/generate_vtw.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +echo "/* auto-generated */" +for A in VTW1 VTW2 VTWS; do + echo "#if defined(REQ_$A)" + for X in 1 2 4 8 16 32 64 128 256; do + echo "#if defined(VTW_SIZE) && VTW_SIZE == $X" + echo "#warning \"using $A with $X\"" + ./generate_vtw $A $X + echo "#endif // VTW_SIZE == $X" + done + echo "#endif // REQ_$A" +done diff --git a/simd-support/simd-common.h b/simd-support/simd-common.h index ad2c96fa1..147d9f692 100644 --- a/simd-support/simd-common.h +++ b/simd-support/simd-common.h @@ -34,7 +34,7 @@ #elif defined(HAVE_ALTIVEC) # define ALIGNMENT 8 /* Alignment for the LD/ST macros */ # define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */ -#elif defined(HAVE_NEON) || defined(HAVE_VSX) +#elif defined(HAVE_NEON) || defined(HAVE_VSX) || defined(HAVE_SVE) # define ALIGNMENT 8 /* Alignment for the LD/ST macros */ # define ALIGNMENTA 8 /* Alignment for the LDA/STA macros */ #elif defined(HAVE_KCVI) diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h new file mode 100644 index 000000000..459d2bb8b --- /dev/null +++ b/simd-support/simd-maskedsve.h @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2003, 2007-11 Matteo Frigo + * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology + * + * ARM SVE support implemented by Romain Dolbeau. (c) 2017 Romain Dolbeau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + */ + +#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD) +#error "SVE vector instructions only work in single or double precision" +#endif + +#ifdef FFTW_SINGLE +# define DS(d,s) s /* single-precision option */ +# define TYPE(name) name ## _f32 +# define TYPESUF(name,suf) name ## _f32 ## suf +# define ALLA svptrue_b32() +#else /* !FFTW_SINGLE */ +# define DS(d,s) d /* double-precision option */ +# define TYPE(name) name ## _f64 +# define TYPESUF(name,suf) name ## _f64 ## suf +# define ALLA svptrue_b64() +#endif /* FFTW_SINGLE */ + +//#define SIMD_SUFFIX _sve /* for renaming */ +#if SVE_SIZE == 2048 +#define VL DS(16, 32) /* SIMD complex vector length */ +#define MASKA DS(svptrue_pat_b64(SV_VL32),svptrue_pat_b32(SV_VL64)) +#elif SVE_SIZE == 1024 +#define VL DS(8, 16) /* SIMD complex vector length */ +#define MASKA DS(svptrue_pat_b64(SV_VL16),svptrue_pat_b32(SV_VL32)) +#elif SVE_SIZE == 512 +#define VL DS(4, 8) /* SIMD complex vector length */ +#define MASKA DS(svptrue_pat_b64(SV_VL8),svptrue_pat_b32(SV_VL16)) +#elif SVE_SIZE == 256 +#define VL DS(2, 4) /* SIMD complex vector length */ +#define MASKA DS(svptrue_pat_b64(SV_VL4),svptrue_pat_b32(SV_VL8)) +#elif SVE_SIZE == 128 +#define VL DS(1, 2) /* SIMD complex vector length */ +#define MASKA DS(svptrue_pat_b64(SV_VL2),svptrue_pat_b32(SV_VL4)) +#else /* SVE_SIZE */ +#error "SVE_SIZE must be 128, 256, 512, 1024, or 2048 bits" +#endif /* SVE_SIZE */ +#define SIMD_VSTRIDE_OKA(x) ((x) == 2) +#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK + +#if defined(__GNUC__) && !defined(__ARM_FEATURE_SVE) /* sanity check */ +#error "compiling simd-maskedsve.h without SVE support" +#endif + +#include <arm_sve.h> + +typedef DS(svfloat64_t, svfloat32_t) V; + +#define VLIT(re, im) DS(svdupq_n_f64(re,im),svdupq_n_f32(re,im,re,im)) +#define VLIT1(val) DS(svdup_n_f64(val), svdup_n_f32(val)) +#define LDK(x) x +#define DVK(var, val) V var = VLIT1(val) +#define VZERO VLIT1(DS(0.,0.f)) +#define VRONE VLIT(DS(1.,1.f),DS(0.,0.f)) +#define VCI VLIT(DS(0.,0.f),DS(1.,1.f)) +#define VCONEMI VLIT(DS(1.,1.f),DS(-1.,-1.f)) +#define VONE VLIT1(DS(1.,1.f)) +#define VMINUSONE VLIT1(DS(-1.,-1.f)) + +#define VDUPL(x) TYPE(svtrn1)(x,x) +#define VDUPH(x) TYPE(svtrn2)(x,x) + +#ifdef FFTW_SINGLE +//#define FLIP_RI(x) svreinterpret_f32_u64(svrevw_u64_x(MASKA,svreinterpret_u64_f32(x))) +#define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x) +#else +#define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x) +#endif + +/* FIXME: there is a better way, surely */ +/* #define VCONJ(x) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VRONE,0),x,VRONE,270) */ +#define VCONJ(x) TYPESUF(svmul,_x)(MASKA,x,VCONEMI) +#define VBYI(x) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VCI,0),x,VCI,90) + +#define VNEG(a) TYPESUF(svneg,_x)(MASKA,a) +#define VADD(a,b) TYPESUF(svadd,_x)(MASKA,a,b) +#define VSUB(a,b) TYPESUF(svsub,_x)(MASKA,a,b) +#define VMUL(a,b) TYPESUF(svmul,_x)(MASKA,a,b) +#define VFMA(a, b, c) TYPESUF(svmad,_x)(MASKA,b,a,c) +#define VFMS(a, b, c) TYPESUF(svnmsb,_x)(MASKA,b,a,c) +#define VFNMS(a, b, c) TYPESUF(svmsb,_x)(MASKA,b,a,c) +#define VFMAI(b, c) TYPESUF(svcadd,_x)(MASKA,c,b,90) +#define VFNMSI(b, c) TYPESUF(svcadd,_x)(MASKA,c,b,270) +/* FIXME: next 3 overkill? 
*/ +#if 0 +#define VFMACONJ(b,c) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,c,b,VRONE,0),b,VRONE,270) +#else +/* Use inline functions instead of macros to avoid replicating inputs */ +static inline V VFMACONJ(V b, V c) { + V m = TYPESUF(svcmla,_x)(MASKA,c,b,VRONE,0); + return TYPESUF(svcmla,_x)(MASKA,m,b,VRONE,270); +} +#endif +#define VFMSCONJ(b,c) VFMACONJ(b,VNEG(c)) +#define VFNMSCONJ(b,c) VNEG(VFMSCONJ(b,c)) + +#if 0 +#define VZMUL(a,b) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0),a,b,90) +#define VZMULJ(a,b) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0),a,b,270) +#define VZMULI(a,b) VZMUL(VCI,VZMUL(a,b)) +#define VZMULIJ(a,b) VZMUL(VCI,VZMULJ(a,b)) +#else +/* Use inline functions instead of macros to avoid replicating inputs */ +static inline V VZMUL(V a, V b) { + V m = TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0); + return TYPESUF(svcmla,_x)(MASKA,m,a,b,90); +} +static inline V VZMULJ(V a, V b) { + V m = TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0); + return TYPESUF(svcmla,_x)(MASKA,m,a,b,270); +} +/* FIXME: there's probably a better way */ +static inline V VZMULI(V a, V b) { + V m = VZMUL(a,b); + return VZMUL(VCI,m); +} +/* FIXME: there's probably a better way */ +static inline V VZMULIJ(V a, V b) { + V m = VZMULJ(a,b); + return VZMUL(VCI,m); +} +#endif + +static inline V LDA(const R *x, INT ivs, const R *aligned_like) { + (void)aligned_like; /* UNUSED */ + (void)ivs; /* UNUSED */ + return TYPE(svld1)(MASKA,x); +} +static inline void STA(R *x, V v, INT ovs, const R *aligned_like) { + (void)aligned_like; /* UNUSED */ + (void)ovs; /* UNUSED */ + TYPE(svst1)(MASKA,x,v); +} + +#if FFTW_SINGLE + +static inline V LDu(const R *x, INT ivs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + svuint32_t gvvl = svindex_u32(0, 1); + gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ivs); + gvvl = svzip1_u32(gvvl, gvvl); + gvvl = svadd_u32_x(svptrue_b32(), gvvl, svdupq_n_u32(0,sizeof(R),0,sizeof(R))); + + return svld1_gather_u32offset_f32(MASKA, x, gvvl); +} + +static inline void STu(R *x, V v, INT ovs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + if (ovs==0) { // FIXME: hack to support the extra_iter hack + v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0)); + } + svuint32_t gvvl = svindex_u32(0, 1); + gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ovs); + gvvl = svzip1_u32(gvvl, gvvl); + gvvl = svadd_u32_x(svptrue_b32(), gvvl, svdupq_n_u32(0,sizeof(R),0,sizeof(R))); + + svst1_scatter_u32offset_f32(MASKA, x, gvvl, v); +} + +#else /* !FFTW_SINGLE */ + +static inline V LDu(const R *x, INT ivs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + svuint64_t gvvl = svindex_u64(0, 1); + gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ivs); + gvvl = svzip1_u64(gvvl, gvvl); + gvvl = svadd_u64_x(svptrue_b64(), gvvl, svdupq_n_u64(0,sizeof(R))); + + return svld1_gather_u64offset_f64(MASKA, x, gvvl); +} + +static inline void STu(R *x, V v, INT ovs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + if (ovs==0) { // FIXME: hack to support the extra_iter hack + v = svdupq_lane_f64(v,0); + } + svuint64_t gvvl = svindex_u64(0, 1); + gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ovs); + gvvl = svzip1_u64(gvvl, gvvl); + gvvl = svadd_u64_x(svptrue_b64(), gvvl, svdupq_n_u64(0,sizeof(R))); + + svst1_scatter_u64offset_f64(MASKA, x, gvvl, v); +} + +#endif /* FFTW_SINGLE */ + +#define LD LDu +#define ST STu + +#ifdef FFTW_SINGLE +#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
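+/* note: STM2 forwards to the masked scatter store (ST == STu), which already writes each element of the vector to its strided slot, so the pairwise STN2 combining step has nothing left to do and is a nop */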
+#define STN2(x, v0, v1, ovs) /* nop */ + +static inline void STM4(R *x, V v, INT ovs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + svuint32_t gvvl = svindex_u32(0, 1); + gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ovs); + + svst1_scatter_u32offset_f32(MASKA, x, gvvl, v); +} +#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */ +#else /* !FFTW_SINGLE */ +#define STM2(x, v, ovs, a) ST(x, v, ovs, a) +#define STN2(x, v0, v1, ovs) /* nop */ + +static inline void STM4(R *x, V v, INT ovs, const R *aligned_like) +{ + (void)aligned_like; /* UNUSED */ + svuint64_t gvvl = svindex_u64(0, 1); + gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ovs); + + svst1_scatter_u64offset_f64(MASKA, x, gvvl, v); +} +#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */ +#endif /* FFTW_SINGLE */ + +/* twiddle storage #1: compact, slower */ +#define REQ_VTW1 +#define VTW_SIZE VL +#include "vtw.h" +#define TWVL1 (VL) +#undef VTW_SIZE +#undef REQ_VTW1 + +static inline V BYTW1(const R *t, V sr) +{ + return VZMUL(LDA(t, 2, t), sr); +} + +static inline V BYTWJ1(const R *t, V sr) +{ + return VZMULJ(LDA(t, 2, t), sr); +} + +/* twiddle storage #2: twice the space, faster (when in cache) */ +#define REQ_VTW2 +#define VTW_SIZE (2*VL) +#include "vtw.h" +#define TWVL2 (2*VL) +#undef VTW_SIZE +#undef REQ_VTW2 + +static inline V BYTW2(const R *t, V sr) +{ + V si = FLIP_RI(sr); + V ti = LDA(t + 2*VL, 2, t + 4*VL); + V tr = LDA(t, 2, t); + return VFMA(tr, sr, VMUL(ti, si)); +} + +static inline V BYTWJ2(const R *t, V sr) +{ + V si = FLIP_RI(sr); + V ti = LDA(t + 2*VL, 2, t + 4*VL); + V tr = LDA(t, 2, t); + return VFNMS(ti, si, VMUL(tr, sr)); +} + +/* twiddle storage #3 */ +#define VTW3(v,x) VTW1(v,x) +#define TWVL3 TWVL1 + +/* twiddle storage for split arrays */ +#define REQ_VTWS +#define VTW_SIZE (2*VL) +#include "vtw.h" +#define TWVLS (2*VL) +#undef VTW_SIZE +#undef REQ_VTWS + +#define VLEAVE() /* nothing */ + +#include "simd-common.h" diff --git a/simd-support/simd-maskedsve1024.h b/simd-support/simd-maskedsve1024.h new file mode 100644 index 000000000..736eaf135 --- /dev/null +++ b/simd-support/simd-maskedsve1024.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2003, 2007-11 Matteo Frigo + * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology + * + * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ * + */ + + +#define SIMD_SUFFIX _sve1024 /* for renaming */ +#define SVE_SIZE 1024 +#include "simd-maskedsve.h" + diff --git a/simd-support/simd-maskedsve128.h b/simd-support/simd-maskedsve128.h new file mode 100644 index 000000000..a97ffe400 --- /dev/null +++ b/simd-support/simd-maskedsve128.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2003, 2007-11 Matteo Frigo + * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology + * + * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + + +#define SIMD_SUFFIX _sve128 /* for renaming */ +#define SVE_SIZE 128 +#include "simd-maskedsve.h" + diff --git a/simd-support/simd-maskedsve2048.h b/simd-support/simd-maskedsve2048.h new file mode 100644 index 000000000..966a46614 --- /dev/null +++ b/simd-support/simd-maskedsve2048.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2003, 2007-11 Matteo Frigo + * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology + * + * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + */ + + +#define SIMD_SUFFIX _sve2048 /* for renaming */ +#define SVE_SIZE 2048 +#include "simd-maskedsve.h" + diff --git a/simd-support/simd-maskedsve256.h b/simd-support/simd-maskedsve256.h new file mode 100644 index 000000000..e36be395b --- /dev/null +++ b/simd-support/simd-maskedsve256.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2003, 2007-11 Matteo Frigo + * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology + * + * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + + +#define SIMD_SUFFIX _sve256 /* for renaming */ +#define SVE_SIZE 256 +#include "simd-maskedsve.h" + diff --git a/simd-support/simd-maskedsve512.h b/simd-support/simd-maskedsve512.h new file mode 100644 index 000000000..0fc09b944 --- /dev/null +++ b/simd-support/simd-maskedsve512.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2003, 2007-11 Matteo Frigo + * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology + * + * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + */ + + +#define SIMD_SUFFIX _sve512 /* for renaming */ +#define SVE_SIZE 512 +#include "simd-maskedsve.h" + diff --git a/simd-support/sve.c b/simd-support/sve.c new file mode 100644 index 000000000..9efc8df5e --- /dev/null +++ b/simd-support/sve.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2003, 2007-14 Matteo Frigo + * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include "kernel/ifftw.h" + +#if HAVE_SVE +#if defined(__ARM_FEATURE_SVE) +#include <arm_sve.h> +#endif + + static int sve_getwidth(void) { +#if defined(__GNUC__) && !defined(__ARM_FEATURE_SVE) +#warning "SVE not supported" + return -1; +#else + return svcntb()*8; +#endif + } + + int X(have_simd_sve)(int minwidth) + { + static int init = 0; + + if (!init) { + init = sve_getwidth(); + } + return ((init > 0) ? (minwidth <= init ? 1 : 0) : 0); + } + + +#endif diff --git a/simd-support/vtw.h b/simd-support/vtw.h new file mode 100644 index 000000000..0c31a32b1 --- /dev/null +++ b/simd-support/vtw.h @@ -0,0 +1,729 @@ +/* auto-generated */ +#if defined(REQ_VTW1) +#if defined(VTW_SIZE) && VTW_SIZE == 1 +#warning "using VTW1 with 1" +#define VTW1(v,x) {TW_CEXP, v+0, x} +#endif // VTW_SIZE == 1 +#if defined(VTW_SIZE) && VTW_SIZE == 2 +#warning "using VTW1 with 2" +#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x} +#endif // VTW_SIZE == 2 +#if defined(VTW_SIZE) && VTW_SIZE == 4 +#warning "using VTW1 with 4" +#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x} +#endif // VTW_SIZE == 4 +#if defined(VTW_SIZE) && VTW_SIZE == 8 +#warning "using VTW1 with 8" +#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ + {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x} +#endif // VTW_SIZE == 8 +#if defined(VTW_SIZE) && VTW_SIZE == 16 +#warning "using VTW1 with 16" +#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ + {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ + {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ + {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x} +#endif // VTW_SIZE == 16 +#if defined(VTW_SIZE) && VTW_SIZE == 32 +#warning "using VTW1 with 32" +#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ + {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ + {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ + {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \ + {TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \ + {TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, 
v+23, x}, \ + {TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \ + {TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x} +#endif // VTW_SIZE == 32 +#if defined(VTW_SIZE) && VTW_SIZE == 64 +#warning "using VTW1 with 64" +#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ + {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ + {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ + {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \ + {TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \ + {TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \ + {TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \ + {TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \ + {TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \ + {TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \ + {TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \ + {TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \ + {TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \ + {TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \ + {TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \ + {TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x} +#endif // VTW_SIZE == 64 +#if defined(VTW_SIZE) && VTW_SIZE == 128 +#warning "using VTW1 with 128" +#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ + {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ + {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ + {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \ + {TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \ + {TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \ + {TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \ + {TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \ + {TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \ + {TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \ + {TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \ + {TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \ + {TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \ + {TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \ + {TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \ + {TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x}, \ + {TW_CEXP, v+64, x}, {TW_CEXP, v+65, x}, {TW_CEXP, v+66, x}, {TW_CEXP, v+67, x}, \ + {TW_CEXP, v+68, x}, {TW_CEXP, v+69, x}, {TW_CEXP, v+70, x}, {TW_CEXP, v+71, x}, \ + {TW_CEXP, v+72, x}, {TW_CEXP, v+73, x}, {TW_CEXP, v+74, x}, {TW_CEXP, v+75, x}, \ + {TW_CEXP, v+76, x}, {TW_CEXP, v+77, x}, {TW_CEXP, v+78, x}, {TW_CEXP, v+79, x}, \ + {TW_CEXP, v+80, x}, {TW_CEXP, v+81, x}, {TW_CEXP, v+82, x}, {TW_CEXP, v+83, x}, \ + {TW_CEXP, v+84, x}, {TW_CEXP, v+85, x}, {TW_CEXP, v+86, 
x}, {TW_CEXP, v+87, x}, \ + {TW_CEXP, v+88, x}, {TW_CEXP, v+89, x}, {TW_CEXP, v+90, x}, {TW_CEXP, v+91, x}, \ + {TW_CEXP, v+92, x}, {TW_CEXP, v+93, x}, {TW_CEXP, v+94, x}, {TW_CEXP, v+95, x}, \ + {TW_CEXP, v+96, x}, {TW_CEXP, v+97, x}, {TW_CEXP, v+98, x}, {TW_CEXP, v+99, x}, \ + {TW_CEXP, v+100, x}, {TW_CEXP, v+101, x}, {TW_CEXP, v+102, x}, {TW_CEXP, v+103, x}, \ + {TW_CEXP, v+104, x}, {TW_CEXP, v+105, x}, {TW_CEXP, v+106, x}, {TW_CEXP, v+107, x}, \ + {TW_CEXP, v+108, x}, {TW_CEXP, v+109, x}, {TW_CEXP, v+110, x}, {TW_CEXP, v+111, x}, \ + {TW_CEXP, v+112, x}, {TW_CEXP, v+113, x}, {TW_CEXP, v+114, x}, {TW_CEXP, v+115, x}, \ + {TW_CEXP, v+116, x}, {TW_CEXP, v+117, x}, {TW_CEXP, v+118, x}, {TW_CEXP, v+119, x}, \ + {TW_CEXP, v+120, x}, {TW_CEXP, v+121, x}, {TW_CEXP, v+122, x}, {TW_CEXP, v+123, x}, \ + {TW_CEXP, v+124, x}, {TW_CEXP, v+125, x}, {TW_CEXP, v+126, x}, {TW_CEXP, v+127, x} +#endif // VTW_SIZE == 128 +#if defined(VTW_SIZE) && VTW_SIZE == 256 +#warning "using VTW1 with 256" +#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ + {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ + {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ + {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \ + {TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \ + {TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \ + {TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \ + {TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \ + {TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \ + {TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \ + {TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \ + {TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \ + {TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \ + {TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \ + {TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \ + {TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x}, \ + {TW_CEXP, v+64, x}, {TW_CEXP, v+65, x}, {TW_CEXP, v+66, x}, {TW_CEXP, v+67, x}, \ + {TW_CEXP, v+68, x}, {TW_CEXP, v+69, x}, {TW_CEXP, v+70, x}, {TW_CEXP, v+71, x}, \ + {TW_CEXP, v+72, x}, {TW_CEXP, v+73, x}, {TW_CEXP, v+74, x}, {TW_CEXP, v+75, x}, \ + {TW_CEXP, v+76, x}, {TW_CEXP, v+77, x}, {TW_CEXP, v+78, x}, {TW_CEXP, v+79, x}, \ + {TW_CEXP, v+80, x}, {TW_CEXP, v+81, x}, {TW_CEXP, v+82, x}, {TW_CEXP, v+83, x}, \ + {TW_CEXP, v+84, x}, {TW_CEXP, v+85, x}, {TW_CEXP, v+86, x}, {TW_CEXP, v+87, x}, \ + {TW_CEXP, v+88, x}, {TW_CEXP, v+89, x}, {TW_CEXP, v+90, x}, {TW_CEXP, v+91, x}, \ + {TW_CEXP, v+92, x}, {TW_CEXP, v+93, x}, {TW_CEXP, v+94, x}, {TW_CEXP, v+95, x}, \ + {TW_CEXP, v+96, x}, {TW_CEXP, v+97, x}, {TW_CEXP, v+98, x}, {TW_CEXP, v+99, x}, \ + {TW_CEXP, v+100, x}, {TW_CEXP, v+101, x}, {TW_CEXP, v+102, x}, {TW_CEXP, v+103, x}, \ + {TW_CEXP, v+104, x}, {TW_CEXP, v+105, x}, {TW_CEXP, v+106, x}, {TW_CEXP, v+107, x}, \ + {TW_CEXP, v+108, x}, {TW_CEXP, v+109, x}, {TW_CEXP, v+110, x}, {TW_CEXP, v+111, x}, \ + {TW_CEXP, v+112, x}, {TW_CEXP, v+113, x}, {TW_CEXP, v+114, x}, {TW_CEXP, v+115, x}, \ + {TW_CEXP, v+116, x}, {TW_CEXP, v+117, x}, {TW_CEXP, v+118, x}, {TW_CEXP, v+119, x}, \ + {TW_CEXP, 
v+120, x}, {TW_CEXP, v+121, x}, {TW_CEXP, v+122, x}, {TW_CEXP, v+123, x}, \ + {TW_CEXP, v+124, x}, {TW_CEXP, v+125, x}, {TW_CEXP, v+126, x}, {TW_CEXP, v+127, x}, \ + {TW_CEXP, v+128, x}, {TW_CEXP, v+129, x}, {TW_CEXP, v+130, x}, {TW_CEXP, v+131, x}, \ + {TW_CEXP, v+132, x}, {TW_CEXP, v+133, x}, {TW_CEXP, v+134, x}, {TW_CEXP, v+135, x}, \ + {TW_CEXP, v+136, x}, {TW_CEXP, v+137, x}, {TW_CEXP, v+138, x}, {TW_CEXP, v+139, x}, \ + {TW_CEXP, v+140, x}, {TW_CEXP, v+141, x}, {TW_CEXP, v+142, x}, {TW_CEXP, v+143, x}, \ + {TW_CEXP, v+144, x}, {TW_CEXP, v+145, x}, {TW_CEXP, v+146, x}, {TW_CEXP, v+147, x}, \ + {TW_CEXP, v+148, x}, {TW_CEXP, v+149, x}, {TW_CEXP, v+150, x}, {TW_CEXP, v+151, x}, \ + {TW_CEXP, v+152, x}, {TW_CEXP, v+153, x}, {TW_CEXP, v+154, x}, {TW_CEXP, v+155, x}, \ + {TW_CEXP, v+156, x}, {TW_CEXP, v+157, x}, {TW_CEXP, v+158, x}, {TW_CEXP, v+159, x}, \ + {TW_CEXP, v+160, x}, {TW_CEXP, v+161, x}, {TW_CEXP, v+162, x}, {TW_CEXP, v+163, x}, \ + {TW_CEXP, v+164, x}, {TW_CEXP, v+165, x}, {TW_CEXP, v+166, x}, {TW_CEXP, v+167, x}, \ + {TW_CEXP, v+168, x}, {TW_CEXP, v+169, x}, {TW_CEXP, v+170, x}, {TW_CEXP, v+171, x}, \ + {TW_CEXP, v+172, x}, {TW_CEXP, v+173, x}, {TW_CEXP, v+174, x}, {TW_CEXP, v+175, x}, \ + {TW_CEXP, v+176, x}, {TW_CEXP, v+177, x}, {TW_CEXP, v+178, x}, {TW_CEXP, v+179, x}, \ + {TW_CEXP, v+180, x}, {TW_CEXP, v+181, x}, {TW_CEXP, v+182, x}, {TW_CEXP, v+183, x}, \ + {TW_CEXP, v+184, x}, {TW_CEXP, v+185, x}, {TW_CEXP, v+186, x}, {TW_CEXP, v+187, x}, \ + {TW_CEXP, v+188, x}, {TW_CEXP, v+189, x}, {TW_CEXP, v+190, x}, {TW_CEXP, v+191, x}, \ + {TW_CEXP, v+192, x}, {TW_CEXP, v+193, x}, {TW_CEXP, v+194, x}, {TW_CEXP, v+195, x}, \ + {TW_CEXP, v+196, x}, {TW_CEXP, v+197, x}, {TW_CEXP, v+198, x}, {TW_CEXP, v+199, x}, \ + {TW_CEXP, v+200, x}, {TW_CEXP, v+201, x}, {TW_CEXP, v+202, x}, {TW_CEXP, v+203, x}, \ + {TW_CEXP, v+204, x}, {TW_CEXP, v+205, x}, {TW_CEXP, v+206, x}, {TW_CEXP, v+207, x}, \ + {TW_CEXP, v+208, x}, {TW_CEXP, v+209, x}, {TW_CEXP, v+210, x}, {TW_CEXP, v+211, x}, \ + {TW_CEXP, v+212, x}, {TW_CEXP, v+213, x}, {TW_CEXP, v+214, x}, {TW_CEXP, v+215, x}, \ + {TW_CEXP, v+216, x}, {TW_CEXP, v+217, x}, {TW_CEXP, v+218, x}, {TW_CEXP, v+219, x}, \ + {TW_CEXP, v+220, x}, {TW_CEXP, v+221, x}, {TW_CEXP, v+222, x}, {TW_CEXP, v+223, x}, \ + {TW_CEXP, v+224, x}, {TW_CEXP, v+225, x}, {TW_CEXP, v+226, x}, {TW_CEXP, v+227, x}, \ + {TW_CEXP, v+228, x}, {TW_CEXP, v+229, x}, {TW_CEXP, v+230, x}, {TW_CEXP, v+231, x}, \ + {TW_CEXP, v+232, x}, {TW_CEXP, v+233, x}, {TW_CEXP, v+234, x}, {TW_CEXP, v+235, x}, \ + {TW_CEXP, v+236, x}, {TW_CEXP, v+237, x}, {TW_CEXP, v+238, x}, {TW_CEXP, v+239, x}, \ + {TW_CEXP, v+240, x}, {TW_CEXP, v+241, x}, {TW_CEXP, v+242, x}, {TW_CEXP, v+243, x}, \ + {TW_CEXP, v+244, x}, {TW_CEXP, v+245, x}, {TW_CEXP, v+246, x}, {TW_CEXP, v+247, x}, \ + {TW_CEXP, v+248, x}, {TW_CEXP, v+249, x}, {TW_CEXP, v+250, x}, {TW_CEXP, v+251, x}, \ + {TW_CEXP, v+252, x}, {TW_CEXP, v+253, x}, {TW_CEXP, v+254, x}, {TW_CEXP, v+255, x} +#endif // VTW_SIZE == 256 +#endif // REQ_VTW1 +#if defined(REQ_VTW2) +#if defined(VTW_SIZE) && VTW_SIZE == 1 +#warning "using VTW2 with 1" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_SIN, v+0, -x} +#endif // VTW_SIZE == 1 +#if defined(VTW_SIZE) && VTW_SIZE == 2 +#warning "using VTW2 with 2" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_SIN, v+0, -x}, {TW_SIN, v+0, x} +#endif // VTW_SIZE == 2 +#if defined(VTW_SIZE) && VTW_SIZE == 4 +#warning "using VTW2 with 4" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ + 
{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x} +#endif // VTW_SIZE == 4 +#if defined(VTW_SIZE) && VTW_SIZE == 8 +#warning "using VTW2 with 8" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ + {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ + {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ + {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x} +#endif // VTW_SIZE == 8 +#if defined(VTW_SIZE) && VTW_SIZE == 16 +#warning "using VTW2 with 16" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ + {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ + {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ + {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ + {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ + {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x} +#endif // VTW_SIZE == 16 +#if defined(VTW_SIZE) && VTW_SIZE == 32 +#warning "using VTW2 with 32" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ + {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ + {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \ + {TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \ + {TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \ + {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ + {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ + {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \ + {TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \ + {TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x} +#endif // VTW_SIZE == 32 +#if defined(VTW_SIZE) && VTW_SIZE == 64 +#warning "using VTW2 with 64" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ + {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ + {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \ + {TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \ + {TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \ + {TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \ + {TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \ + {TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, 
x}, \ + {TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \ + {TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \ + {TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \ + {TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \ + {TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \ + {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ + {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ + {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \ + {TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \ + {TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \ + {TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \ + {TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \ + {TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \ + {TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \ + {TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \ + {TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \ + {TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \ + {TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x} +#endif // VTW_SIZE == 64 +#if defined(VTW_SIZE) && VTW_SIZE == 128 +#warning "using VTW2 with 128" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ + {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ + {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \ + {TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \ + {TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \ + {TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \ + {TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \ + {TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \ + {TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \ + {TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \ + {TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \ + {TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \ + {TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \ + {TW_COS, v+32, x}, {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+33, x}, \ + {TW_COS, v+34, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, {TW_COS, v+35, x}, \ + {TW_COS, v+36, x}, {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+37, x}, \ + {TW_COS, v+38, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, {TW_COS, v+39, x}, \ + {TW_COS, v+40, x}, {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+41, x}, \ + {TW_COS, v+42, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, {TW_COS, v+43, x}, \ + 
{TW_COS, v+44, x}, {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+45, x}, \ + {TW_COS, v+46, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, {TW_COS, v+47, x}, \ + {TW_COS, v+48, x}, {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+49, x}, \ + {TW_COS, v+50, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, {TW_COS, v+51, x}, \ + {TW_COS, v+52, x}, {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+53, x}, \ + {TW_COS, v+54, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, {TW_COS, v+55, x}, \ + {TW_COS, v+56, x}, {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+57, x}, \ + {TW_COS, v+58, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, {TW_COS, v+59, x}, \ + {TW_COS, v+60, x}, {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+61, x}, \ + {TW_COS, v+62, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, {TW_COS, v+63, x}, \ + {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ + {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ + {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \ + {TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \ + {TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \ + {TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \ + {TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \ + {TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \ + {TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \ + {TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \ + {TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \ + {TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \ + {TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x}, \ + {TW_SIN, v+32, -x}, {TW_SIN, v+32, x}, {TW_SIN, v+33, -x}, {TW_SIN, v+33, x}, \ + {TW_SIN, v+34, -x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, -x}, {TW_SIN, v+35, x}, \ + {TW_SIN, v+36, -x}, {TW_SIN, v+36, x}, {TW_SIN, v+37, -x}, {TW_SIN, v+37, x}, \ + {TW_SIN, v+38, -x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, -x}, {TW_SIN, v+39, x}, \ + {TW_SIN, v+40, -x}, {TW_SIN, v+40, x}, {TW_SIN, v+41, -x}, {TW_SIN, v+41, x}, \ + {TW_SIN, v+42, -x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, -x}, {TW_SIN, v+43, x}, \ + {TW_SIN, v+44, -x}, {TW_SIN, v+44, x}, {TW_SIN, v+45, -x}, {TW_SIN, v+45, x}, \ + {TW_SIN, v+46, -x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, -x}, {TW_SIN, v+47, x}, \ + {TW_SIN, v+48, -x}, {TW_SIN, v+48, x}, {TW_SIN, v+49, -x}, {TW_SIN, v+49, x}, \ + {TW_SIN, v+50, -x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, -x}, {TW_SIN, v+51, x}, \ + {TW_SIN, v+52, -x}, {TW_SIN, v+52, x}, {TW_SIN, v+53, -x}, {TW_SIN, v+53, x}, \ + {TW_SIN, v+54, -x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, -x}, {TW_SIN, v+55, x}, \ + {TW_SIN, v+56, -x}, {TW_SIN, v+56, x}, {TW_SIN, v+57, -x}, {TW_SIN, v+57, x}, \ + {TW_SIN, v+58, -x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, -x}, {TW_SIN, v+59, x}, \ + {TW_SIN, v+60, -x}, {TW_SIN, v+60, x}, {TW_SIN, v+61, -x}, {TW_SIN, v+61, x}, \ + {TW_SIN, v+62, -x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, -x}, {TW_SIN, v+63, x} +#endif // VTW_SIZE == 128 +#if defined(VTW_SIZE) && VTW_SIZE == 256 +#warning "using VTW2 with 256" +#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, 
{TW_COS, v+1, x}, {TW_COS, v+1, x}, \ + {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ + {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \ + {TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \ + {TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \ + {TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \ + {TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \ + {TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \ + {TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \ + {TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \ + {TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \ + {TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \ + {TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \ + {TW_COS, v+32, x}, {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+33, x}, \ + {TW_COS, v+34, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, {TW_COS, v+35, x}, \ + {TW_COS, v+36, x}, {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+37, x}, \ + {TW_COS, v+38, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, {TW_COS, v+39, x}, \ + {TW_COS, v+40, x}, {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+41, x}, \ + {TW_COS, v+42, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, {TW_COS, v+43, x}, \ + {TW_COS, v+44, x}, {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+45, x}, \ + {TW_COS, v+46, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, {TW_COS, v+47, x}, \ + {TW_COS, v+48, x}, {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+49, x}, \ + {TW_COS, v+50, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, {TW_COS, v+51, x}, \ + {TW_COS, v+52, x}, {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+53, x}, \ + {TW_COS, v+54, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, {TW_COS, v+55, x}, \ + {TW_COS, v+56, x}, {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+57, x}, \ + {TW_COS, v+58, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, {TW_COS, v+59, x}, \ + {TW_COS, v+60, x}, {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+61, x}, \ + {TW_COS, v+62, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, {TW_COS, v+63, x}, \ + {TW_COS, v+64, x}, {TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+65, x}, \ + {TW_COS, v+66, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, {TW_COS, v+67, x}, \ + {TW_COS, v+68, x}, {TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+69, x}, \ + {TW_COS, v+70, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, {TW_COS, v+71, x}, \ + {TW_COS, v+72, x}, {TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+73, x}, \ + {TW_COS, v+74, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, {TW_COS, v+75, x}, \ + {TW_COS, v+76, x}, {TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+77, x}, \ + {TW_COS, v+78, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, {TW_COS, v+79, x}, \ + {TW_COS, v+80, x}, {TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+81, x}, \ + {TW_COS, v+82, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, {TW_COS, v+83, x}, \ + {TW_COS, v+84, x}, {TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+85, x}, \ + {TW_COS, v+86, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, {TW_COS, v+87, x}, \ + {TW_COS, v+88, x}, {TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+89, x}, \ + {TW_COS, 
v+90, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, {TW_COS, v+91, x}, \ + {TW_COS, v+92, x}, {TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+93, x}, \ + {TW_COS, v+94, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, {TW_COS, v+95, x}, \ + {TW_COS, v+96, x}, {TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+97, x}, \ + {TW_COS, v+98, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, {TW_COS, v+99, x}, \ + {TW_COS, v+100, x}, {TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+101, x}, \ + {TW_COS, v+102, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, {TW_COS, v+103, x}, \ + {TW_COS, v+104, x}, {TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+105, x}, \ + {TW_COS, v+106, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, {TW_COS, v+107, x}, \ + {TW_COS, v+108, x}, {TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+109, x}, \ + {TW_COS, v+110, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, {TW_COS, v+111, x}, \ + {TW_COS, v+112, x}, {TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+113, x}, \ + {TW_COS, v+114, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, {TW_COS, v+115, x}, \ + {TW_COS, v+116, x}, {TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+117, x}, \ + {TW_COS, v+118, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, {TW_COS, v+119, x}, \ + {TW_COS, v+120, x}, {TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+121, x}, \ + {TW_COS, v+122, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, {TW_COS, v+123, x}, \ + {TW_COS, v+124, x}, {TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+125, x}, \ + {TW_COS, v+126, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, {TW_COS, v+127, x}, \ + {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ + {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ + {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \ + {TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \ + {TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \ + {TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \ + {TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \ + {TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \ + {TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \ + {TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \ + {TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \ + {TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \ + {TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x}, \ + {TW_SIN, v+32, -x}, {TW_SIN, v+32, x}, {TW_SIN, v+33, -x}, {TW_SIN, v+33, x}, \ + {TW_SIN, v+34, -x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, -x}, {TW_SIN, v+35, x}, \ + {TW_SIN, v+36, -x}, {TW_SIN, v+36, x}, {TW_SIN, v+37, -x}, {TW_SIN, v+37, x}, \ + {TW_SIN, v+38, -x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, -x}, {TW_SIN, v+39, x}, \ + {TW_SIN, v+40, -x}, {TW_SIN, v+40, x}, {TW_SIN, v+41, -x}, {TW_SIN, v+41, x}, \ + {TW_SIN, v+42, -x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, -x}, {TW_SIN, v+43, x}, \ + {TW_SIN, v+44, -x}, {TW_SIN, v+44, x}, {TW_SIN, v+45, -x}, {TW_SIN, v+45, x}, \ + {TW_SIN, v+46, -x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, -x}, {TW_SIN, v+47, x}, \ + {TW_SIN, v+48, -x}, {TW_SIN, v+48, x}, 
{TW_SIN, v+49, -x}, {TW_SIN, v+49, x}, \ + {TW_SIN, v+50, -x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, -x}, {TW_SIN, v+51, x}, \ + {TW_SIN, v+52, -x}, {TW_SIN, v+52, x}, {TW_SIN, v+53, -x}, {TW_SIN, v+53, x}, \ + {TW_SIN, v+54, -x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, -x}, {TW_SIN, v+55, x}, \ + {TW_SIN, v+56, -x}, {TW_SIN, v+56, x}, {TW_SIN, v+57, -x}, {TW_SIN, v+57, x}, \ + {TW_SIN, v+58, -x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, -x}, {TW_SIN, v+59, x}, \ + {TW_SIN, v+60, -x}, {TW_SIN, v+60, x}, {TW_SIN, v+61, -x}, {TW_SIN, v+61, x}, \ + {TW_SIN, v+62, -x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, -x}, {TW_SIN, v+63, x}, \ + {TW_SIN, v+64, -x}, {TW_SIN, v+64, x}, {TW_SIN, v+65, -x}, {TW_SIN, v+65, x}, \ + {TW_SIN, v+66, -x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, -x}, {TW_SIN, v+67, x}, \ + {TW_SIN, v+68, -x}, {TW_SIN, v+68, x}, {TW_SIN, v+69, -x}, {TW_SIN, v+69, x}, \ + {TW_SIN, v+70, -x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, -x}, {TW_SIN, v+71, x}, \ + {TW_SIN, v+72, -x}, {TW_SIN, v+72, x}, {TW_SIN, v+73, -x}, {TW_SIN, v+73, x}, \ + {TW_SIN, v+74, -x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, -x}, {TW_SIN, v+75, x}, \ + {TW_SIN, v+76, -x}, {TW_SIN, v+76, x}, {TW_SIN, v+77, -x}, {TW_SIN, v+77, x}, \ + {TW_SIN, v+78, -x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, -x}, {TW_SIN, v+79, x}, \ + {TW_SIN, v+80, -x}, {TW_SIN, v+80, x}, {TW_SIN, v+81, -x}, {TW_SIN, v+81, x}, \ + {TW_SIN, v+82, -x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, -x}, {TW_SIN, v+83, x}, \ + {TW_SIN, v+84, -x}, {TW_SIN, v+84, x}, {TW_SIN, v+85, -x}, {TW_SIN, v+85, x}, \ + {TW_SIN, v+86, -x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, -x}, {TW_SIN, v+87, x}, \ + {TW_SIN, v+88, -x}, {TW_SIN, v+88, x}, {TW_SIN, v+89, -x}, {TW_SIN, v+89, x}, \ + {TW_SIN, v+90, -x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, -x}, {TW_SIN, v+91, x}, \ + {TW_SIN, v+92, -x}, {TW_SIN, v+92, x}, {TW_SIN, v+93, -x}, {TW_SIN, v+93, x}, \ + {TW_SIN, v+94, -x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, -x}, {TW_SIN, v+95, x}, \ + {TW_SIN, v+96, -x}, {TW_SIN, v+96, x}, {TW_SIN, v+97, -x}, {TW_SIN, v+97, x}, \ + {TW_SIN, v+98, -x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, -x}, {TW_SIN, v+99, x}, \ + {TW_SIN, v+100, -x}, {TW_SIN, v+100, x}, {TW_SIN, v+101, -x}, {TW_SIN, v+101, x}, \ + {TW_SIN, v+102, -x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, -x}, {TW_SIN, v+103, x}, \ + {TW_SIN, v+104, -x}, {TW_SIN, v+104, x}, {TW_SIN, v+105, -x}, {TW_SIN, v+105, x}, \ + {TW_SIN, v+106, -x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, -x}, {TW_SIN, v+107, x}, \ + {TW_SIN, v+108, -x}, {TW_SIN, v+108, x}, {TW_SIN, v+109, -x}, {TW_SIN, v+109, x}, \ + {TW_SIN, v+110, -x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, -x}, {TW_SIN, v+111, x}, \ + {TW_SIN, v+112, -x}, {TW_SIN, v+112, x}, {TW_SIN, v+113, -x}, {TW_SIN, v+113, x}, \ + {TW_SIN, v+114, -x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, -x}, {TW_SIN, v+115, x}, \ + {TW_SIN, v+116, -x}, {TW_SIN, v+116, x}, {TW_SIN, v+117, -x}, {TW_SIN, v+117, x}, \ + {TW_SIN, v+118, -x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, -x}, {TW_SIN, v+119, x}, \ + {TW_SIN, v+120, -x}, {TW_SIN, v+120, x}, {TW_SIN, v+121, -x}, {TW_SIN, v+121, x}, \ + {TW_SIN, v+122, -x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, -x}, {TW_SIN, v+123, x}, \ + {TW_SIN, v+124, -x}, {TW_SIN, v+124, x}, {TW_SIN, v+125, -x}, {TW_SIN, v+125, x}, \ + {TW_SIN, v+126, -x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, -x}, {TW_SIN, v+127, x} +#endif // VTW_SIZE == 256 +#endif // REQ_VTW2 +#if defined(REQ_VTWS) +#if defined(VTW_SIZE) && VTW_SIZE == 1 +#warning "using VTWS with 1" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_SIN, v+0, x} +#endif // VTW_SIZE == 1 +#if defined(VTW_SIZE) && VTW_SIZE == 2 
+#warning "using VTWS with 2" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, x} +#endif // VTW_SIZE == 2 +#if defined(VTW_SIZE) && VTW_SIZE == 4 +#warning "using VTWS with 4" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ + {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x} +#endif // VTW_SIZE == 4 +#if defined(VTW_SIZE) && VTW_SIZE == 8 +#warning "using VTWS with 8" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ + {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x} +#endif // VTW_SIZE == 8 +#if defined(VTW_SIZE) && VTW_SIZE == 16 +#warning "using VTWS with 16" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ + {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x} +#endif // VTW_SIZE == 16 +#if defined(VTW_SIZE) && VTW_SIZE == 32 +#warning "using VTWS with 32" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ + {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \ + {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \ + {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \ + {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \ + {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \ + {TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \ + {TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \ + {TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \ + {TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x} +#endif // VTW_SIZE == 32 +#if defined(VTW_SIZE) && VTW_SIZE == 64 +#warning "using VTWS with 64" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ + {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \ + {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \ + {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, 
v+27, x}, \ + {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \ + {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \ + {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \ + {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \ + {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \ + {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \ + {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \ + {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \ + {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \ + {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \ + {TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \ + {TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \ + {TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \ + {TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \ + {TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \ + {TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \ + {TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \ + {TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \ + {TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \ + {TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \ + {TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \ + {TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x} +#endif // VTW_SIZE == 64 +#if defined(VTW_SIZE) && VTW_SIZE == 128 +#warning "using VTWS with 128" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ + {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \ + {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \ + {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \ + {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \ + {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \ + {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \ + {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \ + {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \ + {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \ + {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \ + {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \ + {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \ + {TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, \ + {TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, \ + 
{TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, \ + {TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, \ + {TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, \ + {TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, \ + {TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, \ + {TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, \ + {TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, \ + {TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, \ + {TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, \ + {TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, \ + {TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, \ + {TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, \ + {TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, \ + {TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, \ + {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \ + {TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \ + {TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \ + {TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \ + {TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \ + {TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \ + {TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \ + {TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \ + {TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \ + {TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \ + {TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \ + {TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \ + {TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x}, \ + {TW_SIN, v+64, x}, {TW_SIN, v+65, x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, x}, \ + {TW_SIN, v+68, x}, {TW_SIN, v+69, x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, x}, \ + {TW_SIN, v+72, x}, {TW_SIN, v+73, x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, x}, \ + {TW_SIN, v+76, x}, {TW_SIN, v+77, x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, x}, \ + {TW_SIN, v+80, x}, {TW_SIN, v+81, x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, x}, \ + {TW_SIN, v+84, x}, {TW_SIN, v+85, x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, x}, \ + {TW_SIN, v+88, x}, {TW_SIN, v+89, x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, x}, \ + {TW_SIN, v+92, x}, {TW_SIN, v+93, x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, x}, \ + {TW_SIN, v+96, x}, {TW_SIN, v+97, x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, x}, \ + {TW_SIN, v+100, x}, {TW_SIN, v+101, x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, x}, \ + {TW_SIN, v+104, x}, {TW_SIN, v+105, x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, x}, \ + {TW_SIN, v+108, x}, {TW_SIN, v+109, x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, x}, \ + {TW_SIN, v+112, x}, {TW_SIN, v+113, x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, x}, \ + {TW_SIN, v+116, x}, {TW_SIN, v+117, x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, x}, 
\ + {TW_SIN, v+120, x}, {TW_SIN, v+121, x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, x}, \ + {TW_SIN, v+124, x}, {TW_SIN, v+125, x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, x} +#endif // VTW_SIZE == 128 +#if defined(VTW_SIZE) && VTW_SIZE == 256 +#warning "using VTWS with 256" +#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ + {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ + {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ + {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ + {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \ + {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \ + {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \ + {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \ + {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \ + {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \ + {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \ + {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \ + {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \ + {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \ + {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \ + {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \ + {TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, \ + {TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, \ + {TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, \ + {TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, \ + {TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, \ + {TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, \ + {TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, \ + {TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, \ + {TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, \ + {TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, \ + {TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, \ + {TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, \ + {TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, \ + {TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, \ + {TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, \ + {TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, \ + {TW_COS, v+128, x}, {TW_COS, v+129, x}, {TW_COS, v+130, x}, {TW_COS, v+131, x}, \ + {TW_COS, v+132, x}, {TW_COS, v+133, x}, {TW_COS, v+134, x}, {TW_COS, v+135, x}, \ + {TW_COS, v+136, x}, {TW_COS, v+137, x}, {TW_COS, v+138, x}, {TW_COS, v+139, x}, \ + {TW_COS, v+140, x}, {TW_COS, v+141, x}, {TW_COS, v+142, x}, {TW_COS, v+143, x}, \ + {TW_COS, v+144, x}, {TW_COS, v+145, x}, {TW_COS, v+146, x}, {TW_COS, v+147, x}, \ + {TW_COS, v+148, x}, {TW_COS, v+149, x}, {TW_COS, v+150, x}, {TW_COS, v+151, x}, \ + {TW_COS, v+152, x}, {TW_COS, v+153, x}, {TW_COS, v+154, x}, {TW_COS, v+155, x}, \ + {TW_COS, v+156, x}, {TW_COS, v+157, x}, {TW_COS, v+158, x}, {TW_COS, v+159, x}, \ + {TW_COS, v+160, 
x}, {TW_COS, v+161, x}, {TW_COS, v+162, x}, {TW_COS, v+163, x}, \ + {TW_COS, v+164, x}, {TW_COS, v+165, x}, {TW_COS, v+166, x}, {TW_COS, v+167, x}, \ + {TW_COS, v+168, x}, {TW_COS, v+169, x}, {TW_COS, v+170, x}, {TW_COS, v+171, x}, \ + {TW_COS, v+172, x}, {TW_COS, v+173, x}, {TW_COS, v+174, x}, {TW_COS, v+175, x}, \ + {TW_COS, v+176, x}, {TW_COS, v+177, x}, {TW_COS, v+178, x}, {TW_COS, v+179, x}, \ + {TW_COS, v+180, x}, {TW_COS, v+181, x}, {TW_COS, v+182, x}, {TW_COS, v+183, x}, \ + {TW_COS, v+184, x}, {TW_COS, v+185, x}, {TW_COS, v+186, x}, {TW_COS, v+187, x}, \ + {TW_COS, v+188, x}, {TW_COS, v+189, x}, {TW_COS, v+190, x}, {TW_COS, v+191, x}, \ + {TW_COS, v+192, x}, {TW_COS, v+193, x}, {TW_COS, v+194, x}, {TW_COS, v+195, x}, \ + {TW_COS, v+196, x}, {TW_COS, v+197, x}, {TW_COS, v+198, x}, {TW_COS, v+199, x}, \ + {TW_COS, v+200, x}, {TW_COS, v+201, x}, {TW_COS, v+202, x}, {TW_COS, v+203, x}, \ + {TW_COS, v+204, x}, {TW_COS, v+205, x}, {TW_COS, v+206, x}, {TW_COS, v+207, x}, \ + {TW_COS, v+208, x}, {TW_COS, v+209, x}, {TW_COS, v+210, x}, {TW_COS, v+211, x}, \ + {TW_COS, v+212, x}, {TW_COS, v+213, x}, {TW_COS, v+214, x}, {TW_COS, v+215, x}, \ + {TW_COS, v+216, x}, {TW_COS, v+217, x}, {TW_COS, v+218, x}, {TW_COS, v+219, x}, \ + {TW_COS, v+220, x}, {TW_COS, v+221, x}, {TW_COS, v+222, x}, {TW_COS, v+223, x}, \ + {TW_COS, v+224, x}, {TW_COS, v+225, x}, {TW_COS, v+226, x}, {TW_COS, v+227, x}, \ + {TW_COS, v+228, x}, {TW_COS, v+229, x}, {TW_COS, v+230, x}, {TW_COS, v+231, x}, \ + {TW_COS, v+232, x}, {TW_COS, v+233, x}, {TW_COS, v+234, x}, {TW_COS, v+235, x}, \ + {TW_COS, v+236, x}, {TW_COS, v+237, x}, {TW_COS, v+238, x}, {TW_COS, v+239, x}, \ + {TW_COS, v+240, x}, {TW_COS, v+241, x}, {TW_COS, v+242, x}, {TW_COS, v+243, x}, \ + {TW_COS, v+244, x}, {TW_COS, v+245, x}, {TW_COS, v+246, x}, {TW_COS, v+247, x}, \ + {TW_COS, v+248, x}, {TW_COS, v+249, x}, {TW_COS, v+250, x}, {TW_COS, v+251, x}, \ + {TW_COS, v+252, x}, {TW_COS, v+253, x}, {TW_COS, v+254, x}, {TW_COS, v+255, x}, \ + {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ + {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ + {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ + {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \ + {TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \ + {TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \ + {TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \ + {TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \ + {TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \ + {TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \ + {TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \ + {TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \ + {TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \ + {TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \ + {TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \ + {TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x}, \ + {TW_SIN, v+64, x}, {TW_SIN, v+65, x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, x}, \ + {TW_SIN, v+68, x}, {TW_SIN, v+69, x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, x}, \ + {TW_SIN, v+72, x}, {TW_SIN, v+73, x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, x}, \ + {TW_SIN, v+76, x}, {TW_SIN, v+77, x}, 
{TW_SIN, v+78, x}, {TW_SIN, v+79, x}, \ + {TW_SIN, v+80, x}, {TW_SIN, v+81, x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, x}, \ + {TW_SIN, v+84, x}, {TW_SIN, v+85, x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, x}, \ + {TW_SIN, v+88, x}, {TW_SIN, v+89, x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, x}, \ + {TW_SIN, v+92, x}, {TW_SIN, v+93, x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, x}, \ + {TW_SIN, v+96, x}, {TW_SIN, v+97, x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, x}, \ + {TW_SIN, v+100, x}, {TW_SIN, v+101, x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, x}, \ + {TW_SIN, v+104, x}, {TW_SIN, v+105, x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, x}, \ + {TW_SIN, v+108, x}, {TW_SIN, v+109, x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, x}, \ + {TW_SIN, v+112, x}, {TW_SIN, v+113, x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, x}, \ + {TW_SIN, v+116, x}, {TW_SIN, v+117, x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, x}, \ + {TW_SIN, v+120, x}, {TW_SIN, v+121, x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, x}, \ + {TW_SIN, v+124, x}, {TW_SIN, v+125, x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, x}, \ + {TW_SIN, v+128, x}, {TW_SIN, v+129, x}, {TW_SIN, v+130, x}, {TW_SIN, v+131, x}, \ + {TW_SIN, v+132, x}, {TW_SIN, v+133, x}, {TW_SIN, v+134, x}, {TW_SIN, v+135, x}, \ + {TW_SIN, v+136, x}, {TW_SIN, v+137, x}, {TW_SIN, v+138, x}, {TW_SIN, v+139, x}, \ + {TW_SIN, v+140, x}, {TW_SIN, v+141, x}, {TW_SIN, v+142, x}, {TW_SIN, v+143, x}, \ + {TW_SIN, v+144, x}, {TW_SIN, v+145, x}, {TW_SIN, v+146, x}, {TW_SIN, v+147, x}, \ + {TW_SIN, v+148, x}, {TW_SIN, v+149, x}, {TW_SIN, v+150, x}, {TW_SIN, v+151, x}, \ + {TW_SIN, v+152, x}, {TW_SIN, v+153, x}, {TW_SIN, v+154, x}, {TW_SIN, v+155, x}, \ + {TW_SIN, v+156, x}, {TW_SIN, v+157, x}, {TW_SIN, v+158, x}, {TW_SIN, v+159, x}, \ + {TW_SIN, v+160, x}, {TW_SIN, v+161, x}, {TW_SIN, v+162, x}, {TW_SIN, v+163, x}, \ + {TW_SIN, v+164, x}, {TW_SIN, v+165, x}, {TW_SIN, v+166, x}, {TW_SIN, v+167, x}, \ + {TW_SIN, v+168, x}, {TW_SIN, v+169, x}, {TW_SIN, v+170, x}, {TW_SIN, v+171, x}, \ + {TW_SIN, v+172, x}, {TW_SIN, v+173, x}, {TW_SIN, v+174, x}, {TW_SIN, v+175, x}, \ + {TW_SIN, v+176, x}, {TW_SIN, v+177, x}, {TW_SIN, v+178, x}, {TW_SIN, v+179, x}, \ + {TW_SIN, v+180, x}, {TW_SIN, v+181, x}, {TW_SIN, v+182, x}, {TW_SIN, v+183, x}, \ + {TW_SIN, v+184, x}, {TW_SIN, v+185, x}, {TW_SIN, v+186, x}, {TW_SIN, v+187, x}, \ + {TW_SIN, v+188, x}, {TW_SIN, v+189, x}, {TW_SIN, v+190, x}, {TW_SIN, v+191, x}, \ + {TW_SIN, v+192, x}, {TW_SIN, v+193, x}, {TW_SIN, v+194, x}, {TW_SIN, v+195, x}, \ + {TW_SIN, v+196, x}, {TW_SIN, v+197, x}, {TW_SIN, v+198, x}, {TW_SIN, v+199, x}, \ + {TW_SIN, v+200, x}, {TW_SIN, v+201, x}, {TW_SIN, v+202, x}, {TW_SIN, v+203, x}, \ + {TW_SIN, v+204, x}, {TW_SIN, v+205, x}, {TW_SIN, v+206, x}, {TW_SIN, v+207, x}, \ + {TW_SIN, v+208, x}, {TW_SIN, v+209, x}, {TW_SIN, v+210, x}, {TW_SIN, v+211, x}, \ + {TW_SIN, v+212, x}, {TW_SIN, v+213, x}, {TW_SIN, v+214, x}, {TW_SIN, v+215, x}, \ + {TW_SIN, v+216, x}, {TW_SIN, v+217, x}, {TW_SIN, v+218, x}, {TW_SIN, v+219, x}, \ + {TW_SIN, v+220, x}, {TW_SIN, v+221, x}, {TW_SIN, v+222, x}, {TW_SIN, v+223, x}, \ + {TW_SIN, v+224, x}, {TW_SIN, v+225, x}, {TW_SIN, v+226, x}, {TW_SIN, v+227, x}, \ + {TW_SIN, v+228, x}, {TW_SIN, v+229, x}, {TW_SIN, v+230, x}, {TW_SIN, v+231, x}, \ + {TW_SIN, v+232, x}, {TW_SIN, v+233, x}, {TW_SIN, v+234, x}, {TW_SIN, v+235, x}, \ + {TW_SIN, v+236, x}, {TW_SIN, v+237, x}, {TW_SIN, v+238, x}, {TW_SIN, v+239, x}, \ + {TW_SIN, v+240, x}, {TW_SIN, v+241, x}, {TW_SIN, v+242, x}, {TW_SIN, v+243, x}, \ + {TW_SIN, v+244, x}, {TW_SIN, v+245, x}, {TW_SIN, v+246, x}, {TW_SIN, v+247, x}, \ + 
{TW_SIN, v+248, x}, {TW_SIN, v+249, x}, {TW_SIN, v+250, x}, {TW_SIN, v+251, x}, \ + {TW_SIN, v+252, x}, {TW_SIN, v+253, x}, {TW_SIN, v+254, x}, {TW_SIN, v+255, x} +#endif // VTW_SIZE == 256 +#endif // REQ_VTWS From bfe9886c1b632ca7a25361627c364bf9b7f23983 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet <gilles@rist.or.jp> Date: Sun, 26 Jul 2020 00:02:19 +0900 Subject: [PATCH 02/13] simd-support/generate_vtw: fix include files for OSX --- simd-support/generate_vtw.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/simd-support/generate_vtw.c b/simd-support/generate_vtw.c index 505a5804c..9b70f7f16 100644 --- a/simd-support/generate_vtw.c +++ b/simd-support/generate_vtw.c @@ -1,7 +1,12 @@ #include <stdio.h> #include <string.h> #include <stdlib.h> +#ifdef HAVE_MALLOC_H #include <malloc.h> +#endif +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif unsigned int rp2(unsigned int size) { size = size | (size >> 1); From 867eaec14b15e06ef50b35e2e79f6be3dab00e04 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet <gilles@rist.or.jp> Date: Sun, 26 Jul 2020 00:03:16 +0900 Subject: [PATCH 03/13] automatically generate simd-support/vtw.h in bootstrap.sh and add simd-support/{generate_vtw.sh,vtw.h} into the dist tarball --- simd-support/Makefile.am | 18 +- simd-support/vtw.h | 729 --------------------------------------- 2 files changed, 17 insertions(+), 730 deletions(-) delete mode 100644 simd-support/vtw.h diff --git a/simd-support/Makefile.am b/simd-support/Makefile.am index 60b705377..e624cc3c2 100644 --- a/simd-support/Makefile.am +++ b/simd-support/Makefile.am @@ -1,6 +1,21 @@ AM_CPPFLAGS = -I $(top_srcdir) noinst_LTLIBRARIES = libsimd_support.la +noinst_PROGRAMS = + +if MAINTAINER_MODE +noinst_PROGRAMS += generate_vtw + +vtw.h: generate_vtw + $(top_srcdir)/simd-support/generate_vtw.sh > vtw.h + +generate_vtw_SOURCES = generate_vtw.c + +sve.c: vtw.h +endif + +libsimd_support_la: vtw.h + libsimd_support_la_SOURCES = taint.c simd-common.h \ x86-cpuid.h amd64-cpuid.h \ simd-sse2.h sse2.c \ avx.c simd-avx.h \ avx-128-fma.c simd-avx-128-fma.h \ avx2.c simd-avx2.h \ avx512.c simd-avx512.h \ kcvi.c simd-kcvi.h \ altivec.c simd-altivec.h vsx.c simd-vsx.h \ neon.c simd-neon.h \ simd-generic128.h simd-generic256.h \ -sve.c simd-maskedsve.h simd-maskedsve128.h simd-maskedsve256.h simd-maskedsve512.h simd-maskedsve1024.h simd-maskedsve2048.h +sve.c simd-maskedsve.h simd-maskedsve128.h simd-maskedsve256.h simd-maskedsve512.h simd-maskedsve1024.h simd-maskedsve2048.h vtw.h +EXTRA_DIST = generate_vtw.sh diff --git a/simd-support/vtw.h b/simd-support/vtw.h deleted file mode 100644 index 0c31a32b1..000000000 --- a/simd-support/vtw.h +++ /dev/null @@ -1,729 +0,0 @@ -/* auto-generated */ -#if defined(REQ_VTW1) -#if defined(VTW_SIZE) && VTW_SIZE == 1 -#warning "using VTW1 with 1" -#define VTW1(v,x) {TW_CEXP, v+0, x} -#endif // VTW_SIZE == 1 -#if defined(VTW_SIZE) && VTW_SIZE == 2 -#warning "using VTW1 with 2" -#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x} -#endif // VTW_SIZE == 2 -#if defined(VTW_SIZE) && VTW_SIZE == 4 -#warning "using VTW1 with 4" -#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x} -#endif // VTW_SIZE == 4 -#if defined(VTW_SIZE) && VTW_SIZE == 8 -#warning "using VTW1 with 8" -#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ - {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x} -#endif // VTW_SIZE == 8 -#if defined(VTW_SIZE) && VTW_SIZE == 16 -#warning "using VTW1 with 16" -#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x},
{TW_CEXP, v+3, x}, \ - {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ - {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ - {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x} -#endif // VTW_SIZE == 16 -#if defined(VTW_SIZE) && VTW_SIZE == 32 -#warning "using VTW1 with 32" -#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ - {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ - {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ - {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \ - {TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \ - {TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \ - {TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \ - {TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x} -#endif // VTW_SIZE == 32 -#if defined(VTW_SIZE) && VTW_SIZE == 64 -#warning "using VTW1 with 64" -#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ - {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ - {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ - {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \ - {TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \ - {TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \ - {TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \ - {TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \ - {TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \ - {TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \ - {TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \ - {TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \ - {TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \ - {TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \ - {TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \ - {TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x} -#endif // VTW_SIZE == 64 -#if defined(VTW_SIZE) && VTW_SIZE == 128 -#warning "using VTW1 with 128" -#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ - {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ - {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ - {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \ - {TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \ - {TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \ - {TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \ - {TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \ - {TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \ - {TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \ - {TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \ - {TW_CEXP, v+44, x}, {TW_CEXP, v+45, 
x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \ - {TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \ - {TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \ - {TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \ - {TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x}, \ - {TW_CEXP, v+64, x}, {TW_CEXP, v+65, x}, {TW_CEXP, v+66, x}, {TW_CEXP, v+67, x}, \ - {TW_CEXP, v+68, x}, {TW_CEXP, v+69, x}, {TW_CEXP, v+70, x}, {TW_CEXP, v+71, x}, \ - {TW_CEXP, v+72, x}, {TW_CEXP, v+73, x}, {TW_CEXP, v+74, x}, {TW_CEXP, v+75, x}, \ - {TW_CEXP, v+76, x}, {TW_CEXP, v+77, x}, {TW_CEXP, v+78, x}, {TW_CEXP, v+79, x}, \ - {TW_CEXP, v+80, x}, {TW_CEXP, v+81, x}, {TW_CEXP, v+82, x}, {TW_CEXP, v+83, x}, \ - {TW_CEXP, v+84, x}, {TW_CEXP, v+85, x}, {TW_CEXP, v+86, x}, {TW_CEXP, v+87, x}, \ - {TW_CEXP, v+88, x}, {TW_CEXP, v+89, x}, {TW_CEXP, v+90, x}, {TW_CEXP, v+91, x}, \ - {TW_CEXP, v+92, x}, {TW_CEXP, v+93, x}, {TW_CEXP, v+94, x}, {TW_CEXP, v+95, x}, \ - {TW_CEXP, v+96, x}, {TW_CEXP, v+97, x}, {TW_CEXP, v+98, x}, {TW_CEXP, v+99, x}, \ - {TW_CEXP, v+100, x}, {TW_CEXP, v+101, x}, {TW_CEXP, v+102, x}, {TW_CEXP, v+103, x}, \ - {TW_CEXP, v+104, x}, {TW_CEXP, v+105, x}, {TW_CEXP, v+106, x}, {TW_CEXP, v+107, x}, \ - {TW_CEXP, v+108, x}, {TW_CEXP, v+109, x}, {TW_CEXP, v+110, x}, {TW_CEXP, v+111, x}, \ - {TW_CEXP, v+112, x}, {TW_CEXP, v+113, x}, {TW_CEXP, v+114, x}, {TW_CEXP, v+115, x}, \ - {TW_CEXP, v+116, x}, {TW_CEXP, v+117, x}, {TW_CEXP, v+118, x}, {TW_CEXP, v+119, x}, \ - {TW_CEXP, v+120, x}, {TW_CEXP, v+121, x}, {TW_CEXP, v+122, x}, {TW_CEXP, v+123, x}, \ - {TW_CEXP, v+124, x}, {TW_CEXP, v+125, x}, {TW_CEXP, v+126, x}, {TW_CEXP, v+127, x} -#endif // VTW_SIZE == 128 -#if defined(VTW_SIZE) && VTW_SIZE == 256 -#warning "using VTW1 with 256" -#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \ - {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \ - {TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \ - {TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \ - {TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \ - {TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \ - {TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \ - {TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \ - {TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \ - {TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \ - {TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \ - {TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \ - {TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \ - {TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \ - {TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \ - {TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x}, \ - {TW_CEXP, v+64, x}, {TW_CEXP, v+65, x}, {TW_CEXP, v+66, x}, {TW_CEXP, v+67, x}, \ - {TW_CEXP, v+68, x}, {TW_CEXP, v+69, x}, {TW_CEXP, v+70, x}, {TW_CEXP, v+71, x}, \ - {TW_CEXP, v+72, x}, {TW_CEXP, v+73, x}, {TW_CEXP, v+74, x}, {TW_CEXP, v+75, x}, \ - {TW_CEXP, v+76, x}, {TW_CEXP, v+77, x}, {TW_CEXP, v+78, x}, {TW_CEXP, v+79, x}, \ - {TW_CEXP, 
v+80, x}, {TW_CEXP, v+81, x}, {TW_CEXP, v+82, x}, {TW_CEXP, v+83, x}, \ - {TW_CEXP, v+84, x}, {TW_CEXP, v+85, x}, {TW_CEXP, v+86, x}, {TW_CEXP, v+87, x}, \ - {TW_CEXP, v+88, x}, {TW_CEXP, v+89, x}, {TW_CEXP, v+90, x}, {TW_CEXP, v+91, x}, \ - {TW_CEXP, v+92, x}, {TW_CEXP, v+93, x}, {TW_CEXP, v+94, x}, {TW_CEXP, v+95, x}, \ - {TW_CEXP, v+96, x}, {TW_CEXP, v+97, x}, {TW_CEXP, v+98, x}, {TW_CEXP, v+99, x}, \ - {TW_CEXP, v+100, x}, {TW_CEXP, v+101, x}, {TW_CEXP, v+102, x}, {TW_CEXP, v+103, x}, \ - {TW_CEXP, v+104, x}, {TW_CEXP, v+105, x}, {TW_CEXP, v+106, x}, {TW_CEXP, v+107, x}, \ - {TW_CEXP, v+108, x}, {TW_CEXP, v+109, x}, {TW_CEXP, v+110, x}, {TW_CEXP, v+111, x}, \ - {TW_CEXP, v+112, x}, {TW_CEXP, v+113, x}, {TW_CEXP, v+114, x}, {TW_CEXP, v+115, x}, \ - {TW_CEXP, v+116, x}, {TW_CEXP, v+117, x}, {TW_CEXP, v+118, x}, {TW_CEXP, v+119, x}, \ - {TW_CEXP, v+120, x}, {TW_CEXP, v+121, x}, {TW_CEXP, v+122, x}, {TW_CEXP, v+123, x}, \ - {TW_CEXP, v+124, x}, {TW_CEXP, v+125, x}, {TW_CEXP, v+126, x}, {TW_CEXP, v+127, x}, \ - {TW_CEXP, v+128, x}, {TW_CEXP, v+129, x}, {TW_CEXP, v+130, x}, {TW_CEXP, v+131, x}, \ - {TW_CEXP, v+132, x}, {TW_CEXP, v+133, x}, {TW_CEXP, v+134, x}, {TW_CEXP, v+135, x}, \ - {TW_CEXP, v+136, x}, {TW_CEXP, v+137, x}, {TW_CEXP, v+138, x}, {TW_CEXP, v+139, x}, \ - {TW_CEXP, v+140, x}, {TW_CEXP, v+141, x}, {TW_CEXP, v+142, x}, {TW_CEXP, v+143, x}, \ - {TW_CEXP, v+144, x}, {TW_CEXP, v+145, x}, {TW_CEXP, v+146, x}, {TW_CEXP, v+147, x}, \ - {TW_CEXP, v+148, x}, {TW_CEXP, v+149, x}, {TW_CEXP, v+150, x}, {TW_CEXP, v+151, x}, \ - {TW_CEXP, v+152, x}, {TW_CEXP, v+153, x}, {TW_CEXP, v+154, x}, {TW_CEXP, v+155, x}, \ - {TW_CEXP, v+156, x}, {TW_CEXP, v+157, x}, {TW_CEXP, v+158, x}, {TW_CEXP, v+159, x}, \ - {TW_CEXP, v+160, x}, {TW_CEXP, v+161, x}, {TW_CEXP, v+162, x}, {TW_CEXP, v+163, x}, \ - {TW_CEXP, v+164, x}, {TW_CEXP, v+165, x}, {TW_CEXP, v+166, x}, {TW_CEXP, v+167, x}, \ - {TW_CEXP, v+168, x}, {TW_CEXP, v+169, x}, {TW_CEXP, v+170, x}, {TW_CEXP, v+171, x}, \ - {TW_CEXP, v+172, x}, {TW_CEXP, v+173, x}, {TW_CEXP, v+174, x}, {TW_CEXP, v+175, x}, \ - {TW_CEXP, v+176, x}, {TW_CEXP, v+177, x}, {TW_CEXP, v+178, x}, {TW_CEXP, v+179, x}, \ - {TW_CEXP, v+180, x}, {TW_CEXP, v+181, x}, {TW_CEXP, v+182, x}, {TW_CEXP, v+183, x}, \ - {TW_CEXP, v+184, x}, {TW_CEXP, v+185, x}, {TW_CEXP, v+186, x}, {TW_CEXP, v+187, x}, \ - {TW_CEXP, v+188, x}, {TW_CEXP, v+189, x}, {TW_CEXP, v+190, x}, {TW_CEXP, v+191, x}, \ - {TW_CEXP, v+192, x}, {TW_CEXP, v+193, x}, {TW_CEXP, v+194, x}, {TW_CEXP, v+195, x}, \ - {TW_CEXP, v+196, x}, {TW_CEXP, v+197, x}, {TW_CEXP, v+198, x}, {TW_CEXP, v+199, x}, \ - {TW_CEXP, v+200, x}, {TW_CEXP, v+201, x}, {TW_CEXP, v+202, x}, {TW_CEXP, v+203, x}, \ - {TW_CEXP, v+204, x}, {TW_CEXP, v+205, x}, {TW_CEXP, v+206, x}, {TW_CEXP, v+207, x}, \ - {TW_CEXP, v+208, x}, {TW_CEXP, v+209, x}, {TW_CEXP, v+210, x}, {TW_CEXP, v+211, x}, \ - {TW_CEXP, v+212, x}, {TW_CEXP, v+213, x}, {TW_CEXP, v+214, x}, {TW_CEXP, v+215, x}, \ - {TW_CEXP, v+216, x}, {TW_CEXP, v+217, x}, {TW_CEXP, v+218, x}, {TW_CEXP, v+219, x}, \ - {TW_CEXP, v+220, x}, {TW_CEXP, v+221, x}, {TW_CEXP, v+222, x}, {TW_CEXP, v+223, x}, \ - {TW_CEXP, v+224, x}, {TW_CEXP, v+225, x}, {TW_CEXP, v+226, x}, {TW_CEXP, v+227, x}, \ - {TW_CEXP, v+228, x}, {TW_CEXP, v+229, x}, {TW_CEXP, v+230, x}, {TW_CEXP, v+231, x}, \ - {TW_CEXP, v+232, x}, {TW_CEXP, v+233, x}, {TW_CEXP, v+234, x}, {TW_CEXP, v+235, x}, \ - {TW_CEXP, v+236, x}, {TW_CEXP, v+237, x}, {TW_CEXP, v+238, x}, {TW_CEXP, v+239, x}, \ - {TW_CEXP, v+240, x}, {TW_CEXP, v+241, x}, {TW_CEXP, v+242, x}, 
{TW_CEXP, v+243, x}, \ - {TW_CEXP, v+244, x}, {TW_CEXP, v+245, x}, {TW_CEXP, v+246, x}, {TW_CEXP, v+247, x}, \ - {TW_CEXP, v+248, x}, {TW_CEXP, v+249, x}, {TW_CEXP, v+250, x}, {TW_CEXP, v+251, x}, \ - {TW_CEXP, v+252, x}, {TW_CEXP, v+253, x}, {TW_CEXP, v+254, x}, {TW_CEXP, v+255, x} -#endif // VTW_SIZE == 256 -#endif // REQ_VTW1 -#if defined(REQ_VTW2) -#if defined(VTW_SIZE) && VTW_SIZE == 1 -#warning "using VTW2 with 1" -#define VTW2(v,x) {TW_COS, v+0, x}, {TW_SIN, v+0, -x} -#endif // VTW_SIZE == 1 -#if defined(VTW_SIZE) && VTW_SIZE == 2 -#warning "using VTW2 with 2" -#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_SIN, v+0, -x}, {TW_SIN, v+0, x} -#endif // VTW_SIZE == 2 -#if defined(VTW_SIZE) && VTW_SIZE == 4 -#warning "using VTW2 with 4" -#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ - {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x} -#endif // VTW_SIZE == 4 -#if defined(VTW_SIZE) && VTW_SIZE == 8 -#warning "using VTW2 with 8" -#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ - {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ - {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ - {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x} -#endif // VTW_SIZE == 8 -#if defined(VTW_SIZE) && VTW_SIZE == 16 -#warning "using VTW2 with 16" -#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ - {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ - {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ - {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ - {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ - {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x} -#endif // VTW_SIZE == 16 -#if defined(VTW_SIZE) && VTW_SIZE == 32 -#warning "using VTW2 with 32" -#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ - {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ - {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \ - {TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \ - {TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \ - {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ - {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ - {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \ - {TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \ - {TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x} -#endif // VTW_SIZE == 32 -#if defined(VTW_SIZE) && VTW_SIZE == 64 -#warning "using VTW2 with 64" -#define 
VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ - {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ - {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \ - {TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \ - {TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \ - {TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \ - {TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \ - {TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \ - {TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \ - {TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \ - {TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \ - {TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \ - {TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \ - {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ - {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ - {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \ - {TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \ - {TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \ - {TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \ - {TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \ - {TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \ - {TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \ - {TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \ - {TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \ - {TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \ - {TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x} -#endif // VTW_SIZE == 64 -#if defined(VTW_SIZE) && VTW_SIZE == 128 -#warning "using VTW2 with 128" -#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ - {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ - {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \ - {TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \ - {TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \ - {TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \ - {TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \ - {TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \ - {TW_COS, v+22, x}, 
{TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \ - {TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \ - {TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \ - {TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \ - {TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \ - {TW_COS, v+32, x}, {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+33, x}, \ - {TW_COS, v+34, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, {TW_COS, v+35, x}, \ - {TW_COS, v+36, x}, {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+37, x}, \ - {TW_COS, v+38, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, {TW_COS, v+39, x}, \ - {TW_COS, v+40, x}, {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+41, x}, \ - {TW_COS, v+42, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, {TW_COS, v+43, x}, \ - {TW_COS, v+44, x}, {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+45, x}, \ - {TW_COS, v+46, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, {TW_COS, v+47, x}, \ - {TW_COS, v+48, x}, {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+49, x}, \ - {TW_COS, v+50, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, {TW_COS, v+51, x}, \ - {TW_COS, v+52, x}, {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+53, x}, \ - {TW_COS, v+54, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, {TW_COS, v+55, x}, \ - {TW_COS, v+56, x}, {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+57, x}, \ - {TW_COS, v+58, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, {TW_COS, v+59, x}, \ - {TW_COS, v+60, x}, {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+61, x}, \ - {TW_COS, v+62, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, {TW_COS, v+63, x}, \ - {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ - {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ - {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \ - {TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \ - {TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \ - {TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \ - {TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \ - {TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \ - {TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \ - {TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \ - {TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \ - {TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \ - {TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x}, \ - {TW_SIN, v+32, -x}, {TW_SIN, v+32, x}, {TW_SIN, v+33, -x}, {TW_SIN, v+33, x}, \ - {TW_SIN, v+34, -x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, -x}, {TW_SIN, v+35, x}, \ - {TW_SIN, v+36, -x}, {TW_SIN, v+36, x}, {TW_SIN, v+37, -x}, {TW_SIN, v+37, x}, \ - {TW_SIN, v+38, -x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, -x}, {TW_SIN, v+39, x}, \ - {TW_SIN, v+40, -x}, {TW_SIN, v+40, x}, {TW_SIN, v+41, -x}, {TW_SIN, v+41, x}, \ - {TW_SIN, v+42, -x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, -x}, {TW_SIN, v+43, x}, \ - {TW_SIN, v+44, -x}, {TW_SIN, v+44, x}, {TW_SIN, v+45, -x}, {TW_SIN, v+45, x}, \ - {TW_SIN, v+46, -x}, 
{TW_SIN, v+46, x}, {TW_SIN, v+47, -x}, {TW_SIN, v+47, x}, \ - {TW_SIN, v+48, -x}, {TW_SIN, v+48, x}, {TW_SIN, v+49, -x}, {TW_SIN, v+49, x}, \ - {TW_SIN, v+50, -x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, -x}, {TW_SIN, v+51, x}, \ - {TW_SIN, v+52, -x}, {TW_SIN, v+52, x}, {TW_SIN, v+53, -x}, {TW_SIN, v+53, x}, \ - {TW_SIN, v+54, -x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, -x}, {TW_SIN, v+55, x}, \ - {TW_SIN, v+56, -x}, {TW_SIN, v+56, x}, {TW_SIN, v+57, -x}, {TW_SIN, v+57, x}, \ - {TW_SIN, v+58, -x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, -x}, {TW_SIN, v+59, x}, \ - {TW_SIN, v+60, -x}, {TW_SIN, v+60, x}, {TW_SIN, v+61, -x}, {TW_SIN, v+61, x}, \ - {TW_SIN, v+62, -x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, -x}, {TW_SIN, v+63, x} -#endif // VTW_SIZE == 128 -#if defined(VTW_SIZE) && VTW_SIZE == 256 -#warning "using VTW2 with 256" -#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \ - {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \ - {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \ - {TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \ - {TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \ - {TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \ - {TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \ - {TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \ - {TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \ - {TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \ - {TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \ - {TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \ - {TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \ - {TW_COS, v+32, x}, {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+33, x}, \ - {TW_COS, v+34, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, {TW_COS, v+35, x}, \ - {TW_COS, v+36, x}, {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+37, x}, \ - {TW_COS, v+38, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, {TW_COS, v+39, x}, \ - {TW_COS, v+40, x}, {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+41, x}, \ - {TW_COS, v+42, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, {TW_COS, v+43, x}, \ - {TW_COS, v+44, x}, {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+45, x}, \ - {TW_COS, v+46, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, {TW_COS, v+47, x}, \ - {TW_COS, v+48, x}, {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+49, x}, \ - {TW_COS, v+50, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, {TW_COS, v+51, x}, \ - {TW_COS, v+52, x}, {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+53, x}, \ - {TW_COS, v+54, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, {TW_COS, v+55, x}, \ - {TW_COS, v+56, x}, {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+57, x}, \ - {TW_COS, v+58, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, {TW_COS, v+59, x}, \ - {TW_COS, v+60, x}, {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+61, x}, \ - {TW_COS, v+62, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, {TW_COS, v+63, x}, \ - {TW_COS, v+64, x}, {TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+65, x}, \ - {TW_COS, v+66, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, {TW_COS, v+67, x}, \ - {TW_COS, v+68, x}, 
{TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+69, x}, \ - {TW_COS, v+70, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, {TW_COS, v+71, x}, \ - {TW_COS, v+72, x}, {TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+73, x}, \ - {TW_COS, v+74, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, {TW_COS, v+75, x}, \ - {TW_COS, v+76, x}, {TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+77, x}, \ - {TW_COS, v+78, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, {TW_COS, v+79, x}, \ - {TW_COS, v+80, x}, {TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+81, x}, \ - {TW_COS, v+82, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, {TW_COS, v+83, x}, \ - {TW_COS, v+84, x}, {TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+85, x}, \ - {TW_COS, v+86, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, {TW_COS, v+87, x}, \ - {TW_COS, v+88, x}, {TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+89, x}, \ - {TW_COS, v+90, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, {TW_COS, v+91, x}, \ - {TW_COS, v+92, x}, {TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+93, x}, \ - {TW_COS, v+94, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, {TW_COS, v+95, x}, \ - {TW_COS, v+96, x}, {TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+97, x}, \ - {TW_COS, v+98, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, {TW_COS, v+99, x}, \ - {TW_COS, v+100, x}, {TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+101, x}, \ - {TW_COS, v+102, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, {TW_COS, v+103, x}, \ - {TW_COS, v+104, x}, {TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+105, x}, \ - {TW_COS, v+106, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, {TW_COS, v+107, x}, \ - {TW_COS, v+108, x}, {TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+109, x}, \ - {TW_COS, v+110, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, {TW_COS, v+111, x}, \ - {TW_COS, v+112, x}, {TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+113, x}, \ - {TW_COS, v+114, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, {TW_COS, v+115, x}, \ - {TW_COS, v+116, x}, {TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+117, x}, \ - {TW_COS, v+118, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, {TW_COS, v+119, x}, \ - {TW_COS, v+120, x}, {TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+121, x}, \ - {TW_COS, v+122, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, {TW_COS, v+123, x}, \ - {TW_COS, v+124, x}, {TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+125, x}, \ - {TW_COS, v+126, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, {TW_COS, v+127, x}, \ - {TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \ - {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \ - {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \ - {TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \ - {TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \ - {TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \ - {TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \ - {TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \ - {TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \ - {TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \ - {TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, 
v+27, x}, \ - {TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \ - {TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x}, \ - {TW_SIN, v+32, -x}, {TW_SIN, v+32, x}, {TW_SIN, v+33, -x}, {TW_SIN, v+33, x}, \ - {TW_SIN, v+34, -x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, -x}, {TW_SIN, v+35, x}, \ - {TW_SIN, v+36, -x}, {TW_SIN, v+36, x}, {TW_SIN, v+37, -x}, {TW_SIN, v+37, x}, \ - {TW_SIN, v+38, -x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, -x}, {TW_SIN, v+39, x}, \ - {TW_SIN, v+40, -x}, {TW_SIN, v+40, x}, {TW_SIN, v+41, -x}, {TW_SIN, v+41, x}, \ - {TW_SIN, v+42, -x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, -x}, {TW_SIN, v+43, x}, \ - {TW_SIN, v+44, -x}, {TW_SIN, v+44, x}, {TW_SIN, v+45, -x}, {TW_SIN, v+45, x}, \ - {TW_SIN, v+46, -x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, -x}, {TW_SIN, v+47, x}, \ - {TW_SIN, v+48, -x}, {TW_SIN, v+48, x}, {TW_SIN, v+49, -x}, {TW_SIN, v+49, x}, \ - {TW_SIN, v+50, -x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, -x}, {TW_SIN, v+51, x}, \ - {TW_SIN, v+52, -x}, {TW_SIN, v+52, x}, {TW_SIN, v+53, -x}, {TW_SIN, v+53, x}, \ - {TW_SIN, v+54, -x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, -x}, {TW_SIN, v+55, x}, \ - {TW_SIN, v+56, -x}, {TW_SIN, v+56, x}, {TW_SIN, v+57, -x}, {TW_SIN, v+57, x}, \ - {TW_SIN, v+58, -x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, -x}, {TW_SIN, v+59, x}, \ - {TW_SIN, v+60, -x}, {TW_SIN, v+60, x}, {TW_SIN, v+61, -x}, {TW_SIN, v+61, x}, \ - {TW_SIN, v+62, -x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, -x}, {TW_SIN, v+63, x}, \ - {TW_SIN, v+64, -x}, {TW_SIN, v+64, x}, {TW_SIN, v+65, -x}, {TW_SIN, v+65, x}, \ - {TW_SIN, v+66, -x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, -x}, {TW_SIN, v+67, x}, \ - {TW_SIN, v+68, -x}, {TW_SIN, v+68, x}, {TW_SIN, v+69, -x}, {TW_SIN, v+69, x}, \ - {TW_SIN, v+70, -x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, -x}, {TW_SIN, v+71, x}, \ - {TW_SIN, v+72, -x}, {TW_SIN, v+72, x}, {TW_SIN, v+73, -x}, {TW_SIN, v+73, x}, \ - {TW_SIN, v+74, -x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, -x}, {TW_SIN, v+75, x}, \ - {TW_SIN, v+76, -x}, {TW_SIN, v+76, x}, {TW_SIN, v+77, -x}, {TW_SIN, v+77, x}, \ - {TW_SIN, v+78, -x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, -x}, {TW_SIN, v+79, x}, \ - {TW_SIN, v+80, -x}, {TW_SIN, v+80, x}, {TW_SIN, v+81, -x}, {TW_SIN, v+81, x}, \ - {TW_SIN, v+82, -x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, -x}, {TW_SIN, v+83, x}, \ - {TW_SIN, v+84, -x}, {TW_SIN, v+84, x}, {TW_SIN, v+85, -x}, {TW_SIN, v+85, x}, \ - {TW_SIN, v+86, -x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, -x}, {TW_SIN, v+87, x}, \ - {TW_SIN, v+88, -x}, {TW_SIN, v+88, x}, {TW_SIN, v+89, -x}, {TW_SIN, v+89, x}, \ - {TW_SIN, v+90, -x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, -x}, {TW_SIN, v+91, x}, \ - {TW_SIN, v+92, -x}, {TW_SIN, v+92, x}, {TW_SIN, v+93, -x}, {TW_SIN, v+93, x}, \ - {TW_SIN, v+94, -x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, -x}, {TW_SIN, v+95, x}, \ - {TW_SIN, v+96, -x}, {TW_SIN, v+96, x}, {TW_SIN, v+97, -x}, {TW_SIN, v+97, x}, \ - {TW_SIN, v+98, -x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, -x}, {TW_SIN, v+99, x}, \ - {TW_SIN, v+100, -x}, {TW_SIN, v+100, x}, {TW_SIN, v+101, -x}, {TW_SIN, v+101, x}, \ - {TW_SIN, v+102, -x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, -x}, {TW_SIN, v+103, x}, \ - {TW_SIN, v+104, -x}, {TW_SIN, v+104, x}, {TW_SIN, v+105, -x}, {TW_SIN, v+105, x}, \ - {TW_SIN, v+106, -x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, -x}, {TW_SIN, v+107, x}, \ - {TW_SIN, v+108, -x}, {TW_SIN, v+108, x}, {TW_SIN, v+109, -x}, {TW_SIN, v+109, x}, \ - {TW_SIN, v+110, -x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, -x}, {TW_SIN, v+111, x}, \ - {TW_SIN, v+112, -x}, {TW_SIN, v+112, x}, {TW_SIN, v+113, -x}, {TW_SIN, 
v+113, x}, \ - {TW_SIN, v+114, -x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, -x}, {TW_SIN, v+115, x}, \ - {TW_SIN, v+116, -x}, {TW_SIN, v+116, x}, {TW_SIN, v+117, -x}, {TW_SIN, v+117, x}, \ - {TW_SIN, v+118, -x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, -x}, {TW_SIN, v+119, x}, \ - {TW_SIN, v+120, -x}, {TW_SIN, v+120, x}, {TW_SIN, v+121, -x}, {TW_SIN, v+121, x}, \ - {TW_SIN, v+122, -x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, -x}, {TW_SIN, v+123, x}, \ - {TW_SIN, v+124, -x}, {TW_SIN, v+124, x}, {TW_SIN, v+125, -x}, {TW_SIN, v+125, x}, \ - {TW_SIN, v+126, -x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, -x}, {TW_SIN, v+127, x} -#endif // VTW_SIZE == 256 -#endif // REQ_VTW2 -#if defined(REQ_VTWS) -#if defined(VTW_SIZE) && VTW_SIZE == 1 -#warning "using VTWS with 1" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_SIN, v+0, x} -#endif // VTW_SIZE == 1 -#if defined(VTW_SIZE) && VTW_SIZE == 2 -#warning "using VTWS with 2" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, x} -#endif // VTW_SIZE == 2 -#if defined(VTW_SIZE) && VTW_SIZE == 4 -#warning "using VTWS with 4" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ - {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x} -#endif // VTW_SIZE == 4 -#if defined(VTW_SIZE) && VTW_SIZE == 8 -#warning "using VTWS with 8" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ - {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x} -#endif // VTW_SIZE == 8 -#if defined(VTW_SIZE) && VTW_SIZE == 16 -#warning "using VTWS with 16" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ - {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x} -#endif // VTW_SIZE == 16 -#if defined(VTW_SIZE) && VTW_SIZE == 32 -#warning "using VTWS with 32" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ - {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \ - {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \ - {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \ - {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \ - {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \ - {TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \ - {TW_SIN, v+20, x}, 
{TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \ - {TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \ - {TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x} -#endif // VTW_SIZE == 32 -#if defined(VTW_SIZE) && VTW_SIZE == 64 -#warning "using VTWS with 64" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ - {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \ - {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \ - {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \ - {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \ - {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \ - {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \ - {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \ - {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \ - {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \ - {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \ - {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \ - {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \ - {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \ - {TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \ - {TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \ - {TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \ - {TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \ - {TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \ - {TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \ - {TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \ - {TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \ - {TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \ - {TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \ - {TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \ - {TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x} -#endif // VTW_SIZE == 64 -#if defined(VTW_SIZE) && VTW_SIZE == 128 -#warning "using VTWS with 128" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ - {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \ - {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \ - {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \ - {TW_COS, v+28, 
x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \ - {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \ - {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \ - {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \ - {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \ - {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \ - {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \ - {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \ - {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \ - {TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, \ - {TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, \ - {TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, \ - {TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, \ - {TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, \ - {TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, \ - {TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, \ - {TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, \ - {TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, \ - {TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, \ - {TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, \ - {TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, \ - {TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, \ - {TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, \ - {TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, \ - {TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, \ - {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \ - {TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \ - {TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \ - {TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \ - {TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \ - {TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \ - {TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \ - {TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \ - {TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \ - {TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \ - {TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \ - {TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \ - {TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x}, \ - {TW_SIN, v+64, x}, {TW_SIN, v+65, x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, x}, \ - {TW_SIN, v+68, x}, {TW_SIN, v+69, x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, x}, \ - {TW_SIN, v+72, x}, {TW_SIN, v+73, x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, x}, \ - {TW_SIN, v+76, x}, {TW_SIN, 
v+77, x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, x}, \ - {TW_SIN, v+80, x}, {TW_SIN, v+81, x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, x}, \ - {TW_SIN, v+84, x}, {TW_SIN, v+85, x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, x}, \ - {TW_SIN, v+88, x}, {TW_SIN, v+89, x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, x}, \ - {TW_SIN, v+92, x}, {TW_SIN, v+93, x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, x}, \ - {TW_SIN, v+96, x}, {TW_SIN, v+97, x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, x}, \ - {TW_SIN, v+100, x}, {TW_SIN, v+101, x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, x}, \ - {TW_SIN, v+104, x}, {TW_SIN, v+105, x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, x}, \ - {TW_SIN, v+108, x}, {TW_SIN, v+109, x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, x}, \ - {TW_SIN, v+112, x}, {TW_SIN, v+113, x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, x}, \ - {TW_SIN, v+116, x}, {TW_SIN, v+117, x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, x}, \ - {TW_SIN, v+120, x}, {TW_SIN, v+121, x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, x}, \ - {TW_SIN, v+124, x}, {TW_SIN, v+125, x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, x} -#endif // VTW_SIZE == 128 -#if defined(VTW_SIZE) && VTW_SIZE == 256 -#warning "using VTWS with 256" -#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \ - {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \ - {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \ - {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \ - {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \ - {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \ - {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \ - {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \ - {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \ - {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \ - {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \ - {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \ - {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \ - {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \ - {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \ - {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \ - {TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, \ - {TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, \ - {TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, \ - {TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, \ - {TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, \ - {TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, \ - {TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, \ - {TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, \ - {TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, \ - {TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, \ - {TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, \ - {TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, \ - {TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, \ - {TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+118, x}, {TW_COS, 
v+119, x}, \ - {TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, \ - {TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, \ - {TW_COS, v+128, x}, {TW_COS, v+129, x}, {TW_COS, v+130, x}, {TW_COS, v+131, x}, \ - {TW_COS, v+132, x}, {TW_COS, v+133, x}, {TW_COS, v+134, x}, {TW_COS, v+135, x}, \ - {TW_COS, v+136, x}, {TW_COS, v+137, x}, {TW_COS, v+138, x}, {TW_COS, v+139, x}, \ - {TW_COS, v+140, x}, {TW_COS, v+141, x}, {TW_COS, v+142, x}, {TW_COS, v+143, x}, \ - {TW_COS, v+144, x}, {TW_COS, v+145, x}, {TW_COS, v+146, x}, {TW_COS, v+147, x}, \ - {TW_COS, v+148, x}, {TW_COS, v+149, x}, {TW_COS, v+150, x}, {TW_COS, v+151, x}, \ - {TW_COS, v+152, x}, {TW_COS, v+153, x}, {TW_COS, v+154, x}, {TW_COS, v+155, x}, \ - {TW_COS, v+156, x}, {TW_COS, v+157, x}, {TW_COS, v+158, x}, {TW_COS, v+159, x}, \ - {TW_COS, v+160, x}, {TW_COS, v+161, x}, {TW_COS, v+162, x}, {TW_COS, v+163, x}, \ - {TW_COS, v+164, x}, {TW_COS, v+165, x}, {TW_COS, v+166, x}, {TW_COS, v+167, x}, \ - {TW_COS, v+168, x}, {TW_COS, v+169, x}, {TW_COS, v+170, x}, {TW_COS, v+171, x}, \ - {TW_COS, v+172, x}, {TW_COS, v+173, x}, {TW_COS, v+174, x}, {TW_COS, v+175, x}, \ - {TW_COS, v+176, x}, {TW_COS, v+177, x}, {TW_COS, v+178, x}, {TW_COS, v+179, x}, \ - {TW_COS, v+180, x}, {TW_COS, v+181, x}, {TW_COS, v+182, x}, {TW_COS, v+183, x}, \ - {TW_COS, v+184, x}, {TW_COS, v+185, x}, {TW_COS, v+186, x}, {TW_COS, v+187, x}, \ - {TW_COS, v+188, x}, {TW_COS, v+189, x}, {TW_COS, v+190, x}, {TW_COS, v+191, x}, \ - {TW_COS, v+192, x}, {TW_COS, v+193, x}, {TW_COS, v+194, x}, {TW_COS, v+195, x}, \ - {TW_COS, v+196, x}, {TW_COS, v+197, x}, {TW_COS, v+198, x}, {TW_COS, v+199, x}, \ - {TW_COS, v+200, x}, {TW_COS, v+201, x}, {TW_COS, v+202, x}, {TW_COS, v+203, x}, \ - {TW_COS, v+204, x}, {TW_COS, v+205, x}, {TW_COS, v+206, x}, {TW_COS, v+207, x}, \ - {TW_COS, v+208, x}, {TW_COS, v+209, x}, {TW_COS, v+210, x}, {TW_COS, v+211, x}, \ - {TW_COS, v+212, x}, {TW_COS, v+213, x}, {TW_COS, v+214, x}, {TW_COS, v+215, x}, \ - {TW_COS, v+216, x}, {TW_COS, v+217, x}, {TW_COS, v+218, x}, {TW_COS, v+219, x}, \ - {TW_COS, v+220, x}, {TW_COS, v+221, x}, {TW_COS, v+222, x}, {TW_COS, v+223, x}, \ - {TW_COS, v+224, x}, {TW_COS, v+225, x}, {TW_COS, v+226, x}, {TW_COS, v+227, x}, \ - {TW_COS, v+228, x}, {TW_COS, v+229, x}, {TW_COS, v+230, x}, {TW_COS, v+231, x}, \ - {TW_COS, v+232, x}, {TW_COS, v+233, x}, {TW_COS, v+234, x}, {TW_COS, v+235, x}, \ - {TW_COS, v+236, x}, {TW_COS, v+237, x}, {TW_COS, v+238, x}, {TW_COS, v+239, x}, \ - {TW_COS, v+240, x}, {TW_COS, v+241, x}, {TW_COS, v+242, x}, {TW_COS, v+243, x}, \ - {TW_COS, v+244, x}, {TW_COS, v+245, x}, {TW_COS, v+246, x}, {TW_COS, v+247, x}, \ - {TW_COS, v+248, x}, {TW_COS, v+249, x}, {TW_COS, v+250, x}, {TW_COS, v+251, x}, \ - {TW_COS, v+252, x}, {TW_COS, v+253, x}, {TW_COS, v+254, x}, {TW_COS, v+255, x}, \ - {TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \ - {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \ - {TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \ - {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \ - {TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \ - {TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \ - {TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \ - {TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \ - {TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, 
x}, {TW_SIN, v+35, x}, \ - {TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \ - {TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \ - {TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \ - {TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \ - {TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \ - {TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \ - {TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x}, \ - {TW_SIN, v+64, x}, {TW_SIN, v+65, x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, x}, \ - {TW_SIN, v+68, x}, {TW_SIN, v+69, x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, x}, \ - {TW_SIN, v+72, x}, {TW_SIN, v+73, x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, x}, \ - {TW_SIN, v+76, x}, {TW_SIN, v+77, x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, x}, \ - {TW_SIN, v+80, x}, {TW_SIN, v+81, x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, x}, \ - {TW_SIN, v+84, x}, {TW_SIN, v+85, x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, x}, \ - {TW_SIN, v+88, x}, {TW_SIN, v+89, x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, x}, \ - {TW_SIN, v+92, x}, {TW_SIN, v+93, x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, x}, \ - {TW_SIN, v+96, x}, {TW_SIN, v+97, x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, x}, \ - {TW_SIN, v+100, x}, {TW_SIN, v+101, x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, x}, \ - {TW_SIN, v+104, x}, {TW_SIN, v+105, x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, x}, \ - {TW_SIN, v+108, x}, {TW_SIN, v+109, x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, x}, \ - {TW_SIN, v+112, x}, {TW_SIN, v+113, x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, x}, \ - {TW_SIN, v+116, x}, {TW_SIN, v+117, x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, x}, \ - {TW_SIN, v+120, x}, {TW_SIN, v+121, x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, x}, \ - {TW_SIN, v+124, x}, {TW_SIN, v+125, x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, x}, \ - {TW_SIN, v+128, x}, {TW_SIN, v+129, x}, {TW_SIN, v+130, x}, {TW_SIN, v+131, x}, \ - {TW_SIN, v+132, x}, {TW_SIN, v+133, x}, {TW_SIN, v+134, x}, {TW_SIN, v+135, x}, \ - {TW_SIN, v+136, x}, {TW_SIN, v+137, x}, {TW_SIN, v+138, x}, {TW_SIN, v+139, x}, \ - {TW_SIN, v+140, x}, {TW_SIN, v+141, x}, {TW_SIN, v+142, x}, {TW_SIN, v+143, x}, \ - {TW_SIN, v+144, x}, {TW_SIN, v+145, x}, {TW_SIN, v+146, x}, {TW_SIN, v+147, x}, \ - {TW_SIN, v+148, x}, {TW_SIN, v+149, x}, {TW_SIN, v+150, x}, {TW_SIN, v+151, x}, \ - {TW_SIN, v+152, x}, {TW_SIN, v+153, x}, {TW_SIN, v+154, x}, {TW_SIN, v+155, x}, \ - {TW_SIN, v+156, x}, {TW_SIN, v+157, x}, {TW_SIN, v+158, x}, {TW_SIN, v+159, x}, \ - {TW_SIN, v+160, x}, {TW_SIN, v+161, x}, {TW_SIN, v+162, x}, {TW_SIN, v+163, x}, \ - {TW_SIN, v+164, x}, {TW_SIN, v+165, x}, {TW_SIN, v+166, x}, {TW_SIN, v+167, x}, \ - {TW_SIN, v+168, x}, {TW_SIN, v+169, x}, {TW_SIN, v+170, x}, {TW_SIN, v+171, x}, \ - {TW_SIN, v+172, x}, {TW_SIN, v+173, x}, {TW_SIN, v+174, x}, {TW_SIN, v+175, x}, \ - {TW_SIN, v+176, x}, {TW_SIN, v+177, x}, {TW_SIN, v+178, x}, {TW_SIN, v+179, x}, \ - {TW_SIN, v+180, x}, {TW_SIN, v+181, x}, {TW_SIN, v+182, x}, {TW_SIN, v+183, x}, \ - {TW_SIN, v+184, x}, {TW_SIN, v+185, x}, {TW_SIN, v+186, x}, {TW_SIN, v+187, x}, \ - {TW_SIN, v+188, x}, {TW_SIN, v+189, x}, {TW_SIN, v+190, x}, {TW_SIN, v+191, x}, \ - {TW_SIN, v+192, x}, {TW_SIN, v+193, x}, {TW_SIN, v+194, x}, {TW_SIN, v+195, x}, \ - {TW_SIN, v+196, x}, {TW_SIN, v+197, x}, {TW_SIN, v+198, x}, {TW_SIN, v+199, x}, \ - {TW_SIN, v+200, x}, {TW_SIN, v+201, x}, {TW_SIN, v+202, x}, {TW_SIN, v+203, x}, \ - {TW_SIN, v+204, x}, {TW_SIN, v+205, x}, {TW_SIN, v+206, x}, 
{TW_SIN, v+207, x}, \ - {TW_SIN, v+208, x}, {TW_SIN, v+209, x}, {TW_SIN, v+210, x}, {TW_SIN, v+211, x}, \ - {TW_SIN, v+212, x}, {TW_SIN, v+213, x}, {TW_SIN, v+214, x}, {TW_SIN, v+215, x}, \ - {TW_SIN, v+216, x}, {TW_SIN, v+217, x}, {TW_SIN, v+218, x}, {TW_SIN, v+219, x}, \ - {TW_SIN, v+220, x}, {TW_SIN, v+221, x}, {TW_SIN, v+222, x}, {TW_SIN, v+223, x}, \ - {TW_SIN, v+224, x}, {TW_SIN, v+225, x}, {TW_SIN, v+226, x}, {TW_SIN, v+227, x}, \ - {TW_SIN, v+228, x}, {TW_SIN, v+229, x}, {TW_SIN, v+230, x}, {TW_SIN, v+231, x}, \ - {TW_SIN, v+232, x}, {TW_SIN, v+233, x}, {TW_SIN, v+234, x}, {TW_SIN, v+235, x}, \ - {TW_SIN, v+236, x}, {TW_SIN, v+237, x}, {TW_SIN, v+238, x}, {TW_SIN, v+239, x}, \ - {TW_SIN, v+240, x}, {TW_SIN, v+241, x}, {TW_SIN, v+242, x}, {TW_SIN, v+243, x}, \ - {TW_SIN, v+244, x}, {TW_SIN, v+245, x}, {TW_SIN, v+246, x}, {TW_SIN, v+247, x}, \ - {TW_SIN, v+248, x}, {TW_SIN, v+249, x}, {TW_SIN, v+250, x}, {TW_SIN, v+251, x}, \ - {TW_SIN, v+252, x}, {TW_SIN, v+253, x}, {TW_SIN, v+254, x}, {TW_SIN, v+255, x} -#endif // VTW_SIZE == 256 -#endif // REQ_VTWS From 14084d09a8d9023bcfa366c91c238ee201572697 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet <gilles@rist.or.jp> Date: Sun, 26 Jul 2020 00:11:10 +0900 Subject: [PATCH 04/13] update .gitignore ignore all files automatically generated for SVE support --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 355ca76ef..5d1dc4cab 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ rdft/simd/common/*.c rdft/simd/kcvi/*.c rdft/simd/neon/*.c rdft/simd/sse2/*.c +rdft/simd/sve*/*.c rdft/simd/vsx/*.c rdft/scalar/r2cb/*.c rdft/scalar/r2cf/*.c @@ -45,6 +46,7 @@ dft/simd/common/*.c dft/simd/kcvi/*.c dft/simd/neon/*.c dft/simd/sse2/*.c +dft/simd/sve*/*.c dft/simd/vsx/*.c # other generated files @@ -54,6 +56,8 @@ api/fftw3*.f* *.cmake mpi/f03-wrap.c mpi/fftw3*-mpi.f* +simd-support/vtw.h +simd-support/generate_vtw # other build products tests/bench From 1348189b56a031f455c8db96a963170fb0375755 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet <gilles@rist.or.jp> Date: Sat, 18 Jul 2020 15:47:08 +0900 Subject: [PATCH 05/13] sve: correctly support negative offsets --- simd-support/simd-maskedsve.h | 52 +++++++++++++++++------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h index 459d2bb8b..606854739 100644 --- a/simd-support/simd-maskedsve.h +++ b/simd-support/simd-maskedsve.h @@ -161,12 +161,12 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) { static inline V LDu(const R *x, INT ivs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ - svuint32_t gvvl = svindex_u32(0, 1); - gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ivs); - gvvl = svzip1_u32(gvvl, gvvl); - gvvl = svadd_u32_x(svptrue_b32(), gvvl, svdupq_n_u32(0,sizeof(R),0,sizeof(R))); + svint32_t gvvl = svindex_s32(0, 1); + gvvl = svmul_n_s32_x(MASKA, gvvl, sizeof(R)*ivs); + gvvl = svzip1_s32(gvvl, gvvl); + gvvl = svadd_s32_x(MASKA, gvvl, svdupq_n_s32(0,sizeof(R),0,sizeof(R))); - return svld1_gather_u32offset_f32(MASKA, x, gvvl); + return svld1_gather_s32offset_f32(MASKA, x, gvvl); } static inline void STu(R *x, V v, INT ovs, const R *aligned_like) @@ -175,12 +175,12 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like) if (ovs==0) { // FIXME: hack for extra_iter hack support v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0)); } - svuint32_t gvvl = svindex_u32(0, 1); - gvvl = 
svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ovs); - gvvl = svzip1_u32(gvvl, gvvl); - gvvl = svadd_u32_x(svptrue_b32(), gvvl, svdupq_n_u32(0,sizeof(R),0,sizeof(R))); + svint32_t gvvl = svindex_s32(0, 1); + gvvl = svmul_n_s32_x(MASKA, gvvl, sizeof(R)*ovs); + gvvl = svzip1_s32(gvvl, gvvl); + gvvl = svadd_s32_x(MASKA, gvvl, svdupq_n_s32(0,sizeof(R),0,sizeof(R))); - svst1_scatter_u32offset_f32(MASKA, x, gvvl, v); + svst1_scatter_s32offset_f32(MASKA, x, gvvl, v); } #else /* !FFTW_SINGLE */ @@ -189,12 +189,12 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ (void)aligned_like; /* UNUSED */ - svuint64_t gvvl = svindex_u64(0, 1); - gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ivs); - gvvl = svzip1_u64(gvvl, gvvl); - gvvl = svadd_u64_x(svptrue_b64(), gvvl, svdupq_n_u64(0,sizeof(R))); + svint64_t gvvl = svindex_s64(0, 1); + gvvl = svmul_n_s64_x(MASKA, gvvl, sizeof(R)*ivs); + gvvl = svzip1_s64(gvvl, gvvl); + gvvl = svadd_s64_x(MASKA, gvvl, svdupq_n_s64(0,sizeof(R))); - return svld1_gather_u64offset_f64(MASKA, x, gvvl); + return svld1_gather_s64offset_f64(MASKA, x, gvvl); } static inline void STu(R *x, V v, INT ovs, const R *aligned_like) @@ -203,12 +203,12 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like) if (ovs==0) { // FIXME: hack for extra_iter hack support v = svdupq_lane_f64(v,0); } - svuint64_t gvvl = svindex_u64(0, 1); - gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ovs); - gvvl = svzip1_u64(gvvl, gvvl); - gvvl = svadd_u64_x(svptrue_b64(), gvvl, svdupq_n_u64(0,sizeof(R))); + svint64_t gvvl = svindex_s64(0, 1); + gvvl = svmul_n_s64_x(MASKA, gvvl, sizeof(R)*ovs); + gvvl = svzip1_s64(gvvl, gvvl); + gvvl = svadd_s64_x(MASKA, gvvl, svdupq_n_s64(0,sizeof(R))); - svst1_scatter_u64offset_f64(MASKA, x, gvvl, v); + svst1_scatter_s64offset_f64(MASKA, x, gvvl, v); } #endif /* FFTW_SINGLE */ @@ -224,10 +224,10 @@ static inline void STM4(R *x, V v, INT ovs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ (void)aligned_like; /* UNUSED */ - svuint32_t gvvl = svindex_u32(0, 1); - gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ovs); + svint32_t gvvl = svindex_s32(0, 1); + gvvl = svmul_n_s32_x(svptrue_b32(), gvvl, sizeof(R)*ovs); - svst1_scatter_u32offset_f32(MASKA, x, gvvl, v); + svst1_scatter_s32offset_f32(MASKA, x, gvvl, v); } #define STN4(x, v0, v1, v2, v3, ovs) /* no-op */ #else /* !FFTW_SINGLE */ @@ -238,10 +238,10 @@ static inline void STM4(R *x, V v, INT ovs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ (void)aligned_like; /* UNUSED */ - svuint64_t gvvl = svindex_u64(0, 1); - gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ovs); + svint64_t gvvl = svindex_s64(0, 1); + gvvl = svmul_n_s64_x(svptrue_b64(), gvvl, sizeof(R)*ovs); - svst1_scatter_u64offset_f64(MASKA, x, gvvl, v); + svst1_scatter_s64offset_f64(MASKA, x, gvvl, v); } #define STN4(x, v0, v1, v2, v3, ovs) /* no-op */ #endif /* FFTW_SINGLE */ From 38ca2c6ebf5ee26d94a373c970d71e458b34a7ed Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet <gilles@rist.or.jp> Date: Sat, 18 Jul 2020 16:02:37 +0900 Subject: [PATCH 06/13] sve: go brrr --- simd-support/simd-maskedsve.h | 42 +++++++++++++---------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h index 606854739..e4b67590f 100644 --- a/simd-support/simd-maskedsve.h +++ b/simd-support/simd-maskedsve.h @@ -161,12 +161,9 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) { static inline V 
LDu(const R *x, INT ivs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ - svint32_t gvvl = svindex_s32(0, 1); - gvvl = svmul_n_s32_x(MASKA, gvvl, sizeof(R)*ivs); - gvvl = svzip1_s32(gvvl, gvvl); - gvvl = svadd_s32_x(MASKA, gvvl, svdupq_n_s32(0,sizeof(R),0,sizeof(R))); - - return svld1_gather_s32offset_f32(MASKA, x, gvvl); + svint64_t gvvl = svindex_s64(0, ivs/2); + + return svreinterpret_f32_f64(svld1_gather_s64index_f64(MASKA, (const double *)x, gvvl)); } static inline void STu(R *x, V v, INT ovs, const R *aligned_like) @@ -175,12 +172,9 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like) if (ovs==0) { // FIXME: hack for extra_iter hack support v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0)); } - svint32_t gvvl = svindex_s32(0, 1); - gvvl = svmul_n_s32_x(MASKA, gvvl, sizeof(R)*ovs); - gvvl = svzip1_s32(gvvl, gvvl); - gvvl = svadd_s32_x(MASKA, gvvl, svdupq_n_s32(0,sizeof(R),0,sizeof(R))); + svint64_t gvvl = svindex_s64(0, ovs/2); - svst1_scatter_s32offset_f32(MASKA, x, gvvl, v); + svst1_scatter_s64index_f64(MASKA, (double *)x, gvvl, svreinterpret_f64_f32(v)); } #else /* !FFTW_SINGLE */ @@ -189,12 +183,10 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ (void)aligned_like; /* UNUSED */ - svint64_t gvvl = svindex_s64(0, 1); - gvvl = svmul_n_s64_x(MASKA, gvvl, sizeof(R)*ivs); - gvvl = svzip1_s64(gvvl, gvvl); - gvvl = svadd_s64_x(MASKA, gvvl, svdupq_n_s64(0,sizeof(R))); + svint64_t gvvl = svindex_s64(0, ivs); + gvvl = svzip1_s64(gvvl, svadd_n_s64_x(MASKA, gvvl, 1)); - return svld1_gather_s64offset_f64(MASKA, x, gvvl); + return svld1_gather_s64index_f64(MASKA, x, gvvl); } static inline void STu(R *x, V v, INT ovs, const R *aligned_like) @@ -203,12 +195,10 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like) if (ovs==0) { // FIXME: hack for extra_iter hack support v = svdupq_lane_f64(v,0); } - svint64_t gvvl = svindex_s64(0, 1); - gvvl = svmul_n_s64_x(MASKA, gvvl, sizeof(R)*ovs); - gvvl = svzip1_s64(gvvl, gvvl); - gvvl = svadd_s64_x(MASKA, gvvl, svdupq_n_s64(0,sizeof(R))); + svint64_t gvvl = svindex_s64(0, ovs); + gvvl = svzip1_s64(gvvl, svadd_n_s64_x(MASKA, gvvl, 1)); - svst1_scatter_s64offset_f64(MASKA, x, gvvl, v); + svst1_scatter_s64index_f64(MASKA, x, gvvl, v); } #endif /* FFTW_SINGLE */ @@ -224,10 +214,9 @@ static inline void STM4(R *x, V v, INT ovs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ (void)aligned_like; /* UNUSED */ - svint32_t gvvl = svindex_s32(0, 1); - gvvl = svmul_n_s32_x(svptrue_b32(), gvvl, sizeof(R)*ovs); + svint32_t gvvl = svindex_s32(0, ovs); - svst1_scatter_s32offset_f32(MASKA, x, gvvl, v); + svst1_scatter_s32index_f32(MASKA, x, gvvl, v); } #define STN4(x, v0, v1, v2, v3, ovs) /* no-op */ #else /* !FFTW_SINGLE */ @@ -238,10 +227,9 @@ static inline void STM4(R *x, V v, INT ovs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ (void)aligned_like; /* UNUSED */ - svint64_t gvvl = svindex_s64(0, 1); - gvvl = svmul_n_s64_x(svptrue_b64(), gvvl, sizeof(R)*ovs); + svint64_t gvvl = svindex_s64(0, ovs); - svst1_scatter_s64offset_f64(MASKA, x, gvvl, v); + svst1_scatter_s64index_f64(MASKA, x, gvvl, v); } #define STN4(x, v0, v1, v2, v3, ovs) /* no-op */ #endif /* FFTW_SINGLE */ From f85c7f7e4d3a043d0a4c41f962292df8d1f78639 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau <romain@dolbeau.org> Date: Wed, 2 Sep 2020 09:19:01 -0400 Subject: [PATCH 07/13] Remove test&branch (ovs==0) in STu, replace by masking. Also, improve VBYI. 
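[Illustration, not part of the patch: the predicate-selection idea used below can be shown in isolation. In this hypothetical sketch (helper name and standalone form are invented for clarity), a predicate built from the scalar condition ovs != 0 lets svsel_b pick, lane by lane, between the full store mask and a one-element mask, so the extra_iter case needs no conditional branch:]

#include <arm_sve.h>
#include <stdint.h>

/* Hypothetical helper, for illustration only: choose the store
 * predicate without branching on ovs.  svdupq_n_b64 broadcasts the
 * scalar condition into a predicate; svsel_b then selects, per lane,
 * between the full mask and a one-64-bit-element mask (SV_VL1). */
static inline svbool_t pick_store_mask(int64_t ovs, svbool_t full)
{
    const svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0);
    return svsel_b(which, full, svptrue_pat_b64(SV_VL1));
}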
---
 simd-support/simd-maskedsve.h | 40 +++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index e4b67590f..0a1c9454b 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -94,7 +94,11 @@ typedef DS(svfloat64_t, svfloat32_t) V;
 /* FXIME: there is a better way, surely */
 /* #define VCONJ(x) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VRONE,0),x,VRONE,270) */
 #define VCONJ(x) TYPESUF(svmul,_x)(MASKA,x,VCONEMI)
+#if 0
 #define VBYI(x) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VCI,0),x,VCI,90)
+#else
+#define VBYI(x) TYPESUF(svcadd,_x)(MASKA,VZERO,x,90)
+#endif
 #define VNEG(a) TYPESUF(svneg,_x)(MASKA,a)
 #define VADD(a,b) TYPESUF(svadd,_x)(MASKA,a,b)
@@ -169,12 +173,19 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
     (void)aligned_like; /* UNUSED */
-    if (ovs==0) { // FIXME: hack for extra_iter hack support
-      v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0));
-    }
-    svint64_t gvvl = svindex_s64(0, ovs/2);
-
-    svst1_scatter_s64index_f64(MASKA, (double *)x, gvvl, svreinterpret_f64_f32(v));
+/*     if (ovs==0) { // FIXME: hack for extra_iter hack support */
+/*       v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0)); */
+/*     } */
+    const svint64_t gvvl = svindex_s64(0, ovs/2);
+
+    /* no-branch implementation of extra_iter hack support
+     * if ovs is non-zero, keep the original MASKA;
+     * if not, only store one 64 bits element (two 32 bits consecutive)
+     */
+    const svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0);
+    const svbool_t mask = svsel_b(which, MASKA, svptrue_pat_b64(SV_VL1));
+
+    svst1_scatter_s64index_f64(mask, (double *)x, gvvl, svreinterpret_f64_f32(v));
 }
 #else /* !FFTW_SINGLE */
@@ -192,13 +203,20 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
     (void)aligned_like; /* UNUSED */
-    if (ovs==0) { // FIXME: hack for extra_iter hack support
-      v = svdupq_lane_f64(v,0);
-    }
-    svint64_t gvvl = svindex_s64(0, ovs);
+/*     if (ovs==0) { // FIXME: hack for extra_iter hack support */
+/*       v = svdupq_lane_f64(v,0); */
+/*     } */
+    svint64_t gvvl = svindex_s64(0, ovs);
     gvvl = svzip1_s64(gvvl, svadd_n_s64_x(MASKA, gvvl, 1));
-    svst1_scatter_s64index_f64(MASKA, x, gvvl, v);
+    /* no-branch implementation of extra_iter hack support
+     * if ovs is non-zero, keep the original MASKA;
+     * if not, only store two 64 bits elements
+     */
+    const svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0);
+    const svbool_t mask = svsel_b(which, MASKA, svptrue_pat_b64(SV_VL2));
+    svst1_scatter_s64index_f64(mask, x, gvvl, v);
 }
 #endif /* FFTW_SINGLE */

From 332007f250c4ef583d50c38f7535c027c372136f Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Wed, 9 Sep 2020 08:36:58 -0400
Subject: [PATCH 08/13] Experimental change for performance - non-masked
 ADD/SUB/MUL

ADD/SUB/MUL are three-address in SVE, but the masked form is only
two-address. And there's a lot of reuse in FFTW3 (and complex
arithmetic). But ACLE/SVE (i.e. intrinsics) don't have the non-masked
form :-(
So we use inline ASM to force the non-masked version. Masked-out
lanes should be mostly zero, and are never stored anyway, so computing
on them should be fine.
This one will be reverted if it's not a performance win.
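[Illustration, not part of the patch: a minimal sketch of the two-address problem. The mnemonics in the comments are the architectural instruction forms; actual compiler output varies, as a later commit in this series notes.]

#include <arm_sve.h>

/* The predicated FADD is destructive:
 *     FADD Zdn.D, Pg/M, Zdn.D, Zm.D
 * overwrites its first source, so if 'a' is still live the compiler
 * must copy it first (typically via MOVPRFX).  The unpredicated
 *     FADD Zd.D, Zn.D, Zm.D
 * is three-address and needs no copy, but ACLE has no intrinsic that
 * guarantees it; svadd_f64_x merely allows it. */
svfloat64_t add_dont_care(svbool_t pg, svfloat64_t a, svfloat64_t b)
{
    return svadd_f64_x(pg, a, b); /* "_x": inactive lanes are don't-care */
}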
--- simd-support/simd-maskedsve.h | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h index 0a1c9454b..415bf0c70 100644 --- a/simd-support/simd-maskedsve.h +++ b/simd-support/simd-maskedsve.h @@ -40,7 +40,6 @@ # define ALLA svptrue_b64() #endif /* FFTW_SINGLE */ -//#define SIMD_SUFFIX _sve /* for renaming */ #if SVE_SIZE == 2048 #define VL DS(16, 32) /* SIMD complex vector length */ #define MASKA DS(svptrue_pat_b64(SV_VL32),svptrue_pat_b32(SV_VL64)) @@ -70,8 +69,19 @@ typedef DS(svfloat64_t, svfloat32_t) V; +/* The goal is to limit to the required width by using masking. + * However, some SVE instructions are limited to two-addresses + * rather than three adresses when masked. + * (i.e. they do X op= Y, not X = Z op X) + * Loads will put zero in masked-out value. + * For performance reason, we want to use non-masked for the instructions + * with a two-addresses masked form: add & sub. + * But ACLE doesn't have the non-masked form... + */ + +/* do we need to mask VLIT somehow ?*/ #define VLIT(re, im) DS(svdupq_n_f64(re,im),svdupq_n_f32(re,im,re,im)) -#define VLIT1(val) DS(svdup_n_f64(val), svdup_n_f32(val)) +#define VLIT1(val) TYPESUF(svdup_n,_z)(MASKA,val) #define LDK(x) x #define DVK(var, val) V var = VLIT1(val) #define VZERO VLIT1(DS(0.,0.f)) @@ -91,7 +101,7 @@ typedef DS(svfloat64_t, svfloat32_t) V; #define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x) #endif -/* FXIME: there is a better way, surely */ +/* FIXME: there is a better way, surely */ /* #define VCONJ(x) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VRONE,0),x,VRONE,270) */ #define VCONJ(x) TYPESUF(svmul,_x)(MASKA,x,VCONEMI) #if 0 @@ -101,9 +111,27 @@ typedef DS(svfloat64_t, svfloat32_t) V; #endif #define VNEG(a) TYPESUF(svneg,_x)(MASKA,a) +#if 0 #define VADD(a,b) TYPESUF(svadd,_x)(MASKA,a,b) #define VSUB(a,b) TYPESUF(svsub,_x)(MASKA,a,b) #define VMUL(a,b) TYPESUF(svmul,_x)(MASKA,a,b) +#else +static inline V VADD(const V a, const V b) { + V r; + asm("fadd %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b)); + return r; +} +static inline V VSUB(const V a, const V b) { + V r; + asm("fsub %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b)); + return r; +} +static inline V VMUL(const V a, const V b) { + V r; + asm("fmul %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b)); + return r; +} +#endif #define VFMA(a, b, c) TYPESUF(svmad,_x)(MASKA,b,a,c) #define VFMS(a, b, c) TYPESUF(svnmsb,_x)(MASKA,b,a,c) #define VFNMS(a, b, c) TYPESUF(svmsb,_x)(MASKA,b,a,c) From b0bf4c6c20e417486355061e2390577e3055834b Mon Sep 17 00:00:00 2001 From: Romain Dolbeau <romain@dolbeau.org> Date: Wed, 9 Sep 2020 09:14:04 -0400 Subject: [PATCH 09/13] Make some variants based on #define, as their behavior seems to be compiler/hardware dependent, and more tests are needed before settling on some defaults. --- simd-support/simd-maskedsve.h | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h index 415bf0c70..51bcaa397 100644 --- a/simd-support/simd-maskedsve.h +++ b/simd-support/simd-maskedsve.h @@ -77,7 +77,13 @@ typedef DS(svfloat64_t, svfloat32_t) V; * For performance reason, we want to use non-masked for the instructions * with a two-addresses masked form: add & sub. * But ACLE doesn't have the non-masked form... 
+ * clang 11 & armclang 20.2 used masked form in assembly and lots of copies + * gcc 10 uses the non-masked form (!) and no copies */ +#define USE_UNMASKED_ASSEMBLY +/* Define below to use masking instead of branching in STu + */ +//#define BRANCHLESS_STU /* do we need to mask VLIT somehow ?*/ #define VLIT(re, im) DS(svdupq_n_f64(re,im),svdupq_n_f32(re,im,re,im)) @@ -111,7 +117,7 @@ typedef DS(svfloat64_t, svfloat32_t) V; #endif #define VNEG(a) TYPESUF(svneg,_x)(MASKA,a) -#if 0 +#if !defined(USE_UNMASKED_ASSEMBLY) #define VADD(a,b) TYPESUF(svadd,_x)(MASKA,a,b) #define VSUB(a,b) TYPESUF(svsub,_x)(MASKA,a,b) #define VMUL(a,b) TYPESUF(svmul,_x)(MASKA,a,b) @@ -201,19 +207,21 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like) static inline void STu(R *x, V v, INT ovs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ -/* if (ovs==0) { // FIXME: hack for extra_iter hack support */ -/* v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0)); */ -/* } */ const svint64_t gvvl = svindex_s64(0, ovs/2); - +#if !defined(BRANCHLESS_STU) + if (ovs==0) { // FIXME: hack for extra_iter hack support + v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0)); + } + svst1_scatter_s64index_f64(MASKA, (double *)x, gvvl, svreinterpret_f64_f32(v)); +#else /* no-branch implementation of extra_iter hack support * if ovs is non-zero, keep the original MASKA; * if not, only store one 64 bits element (two 32 bits consecutive) */ const svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0); const svbool_t mask = svsel_b(which, MASKA, svptrue_pat_b64(SV_VL1)); - svst1_scatter_s64index_f64(mask, (double *)x, gvvl, svreinterpret_f64_f32(v)); +#endif } #else /* !FFTW_SINGLE */ @@ -231,12 +239,14 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like) static inline void STu(R *x, V v, INT ovs, const R *aligned_like) { (void)aligned_like; /* UNUSED */ -/* if (ovs==0) { // FIXME: hack for extra_iter hack support */ -/* v = svdupq_lane_f64(v,0); */ -/* } */ svint64_t gvvl = svindex_s64(0, ovs); gvvl = svzip1_s64(gvvl, svadd_n_s64_x(MASKA, gvvl, 1)); - +#if !defined(BRANCHLESS_STU) + if (ovs==0) { // FIXME: hack for extra_iter hack support + v = svdupq_lane_f64(v,0); + } + svst1_scatter_s64index_f64(MASKA, x, gvvl, v); +#else /* no-branch implementation of extra_iter hack support * if ovs is non-zero, keep the original MASKA; * if not, only store two 64 bits elements @@ -245,6 +255,7 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like) const svbool_t mask = svsel_b(which, MASKA, svptrue_pat_b64(SV_VL2)); svst1_scatter_s64index_f64(mask, x, gvvl, v); +#endif } #endif /* FFTW_SINGLE */ From e3150025ade7e4bc5d6af821ab8e849e48ff6a3e Mon Sep 17 00:00:00 2001 From: Romain Dolbeau <romain@dolbeau.org> Date: Thu, 4 Mar 2021 13:26:09 +0100 Subject: [PATCH 10/13] oups, missing ASM for SP --- simd-support/simd-maskedsve.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h index 51bcaa397..9c5be42ce 100644 --- a/simd-support/simd-maskedsve.h +++ b/simd-support/simd-maskedsve.h @@ -124,17 +124,29 @@ typedef DS(svfloat64_t, svfloat32_t) V; #else static inline V VADD(const V a, const V b) { V r; +#ifdef FFTW_SINGLE + asm("fadd %[r].s, %[a].s, %[b].s\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b)); +#else asm("fadd %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b)); +#endif return r; } static inline V VSUB(const V a, const V b) { V r; +#ifdef FFTW_SINGLE + asm("fsub %[r].s, %[a].s, %[b].s\n" : 
[r]"=w"(r) : [a]"w"(a), [b]"w"(b)); +#else asm("fsub %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b)); +#endif return r; } static inline V VMUL(const V a, const V b) { V r; +#ifdef FFTW_SINGLE + asm("fmul %[r].s, %[a].s, %[b].s\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b)); +#else asm("fmul %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b)); +#endif return r; } #endif From eefa1b44b279100e7e994d047c4c102131e31cc1 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau <romain@dolbeau.org> Date: Fri, 5 Mar 2021 09:10:15 +0100 Subject: [PATCH 11/13] disable USE_UNMASKED_ASSEMBLY by default so it can be reenabled from the command line --- simd-support/simd-maskedsve.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h index 9c5be42ce..ca7cccc00 100644 --- a/simd-support/simd-maskedsve.h +++ b/simd-support/simd-maskedsve.h @@ -80,7 +80,7 @@ typedef DS(svfloat64_t, svfloat32_t) V; * clang 11 & armclang 20.2 used masked form in assembly and lots of copies * gcc 10 uses the non-masked form (!) and no copies */ -#define USE_UNMASKED_ASSEMBLY +//#define USE_UNMASKED_ASSEMBLY /* Define below to use masking instead of branching in STu */ //#define BRANCHLESS_STU From 8cb2fbd8f4623d6c9d66a03f7684781ee9158622 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau <romain@dolbeau.org> Date: Mon, 4 Mar 2024 10:10:14 +0000 Subject: [PATCH 12/13] improve VZMULI[j], clean-up old code --- simd-support/simd-maskedsve.h | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h index ca7cccc00..d21ff6cdb 100644 --- a/simd-support/simd-maskedsve.h +++ b/simd-support/simd-maskedsve.h @@ -101,20 +101,14 @@ typedef DS(svfloat64_t, svfloat32_t) V; #define VDUPH(x) TYPE(svtrn2)(x,x) #ifdef FFTW_SINGLE -//#define FLIP_RI(x) svreinterpret_f32_u64(svrevw_u64_x(MASKA,svreinterpret_u64_f32(x))) #define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x) #else #define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x) #endif -/* FIXME: there is a better way, surely */ -/* #define VCONJ(x) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VRONE,0),x,VRONE,270) */ +/* there might be a better way */ #define VCONJ(x) TYPESUF(svmul,_x)(MASKA,x,VCONEMI) -#if 0 -#define VBYI(x) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VCI,0),x,VCI,90) -#else #define VBYI(x) TYPESUF(svcadd,_x)(MASKA,VZERO,x,90) -#endif #define VNEG(a) TYPESUF(svneg,_x)(MASKA,a) #if !defined(USE_UNMASKED_ASSEMBLY) @@ -155,26 +149,14 @@ static inline V VMUL(const V a, const V b) { #define VFNMS(a, b, c) TYPESUF(svmsb,_x)(MASKA,b,a,c) #define VFMAI(b, c) TYPESUF(svcadd,_x)(MASKA,c,b,90) #define VFNMSI(b, c) TYPESUF(svcadd,_x)(MASKA,c,b,270) -/* FIXME: next 3 overkill ? 
 */
-#if 0
-#define VFMACONJ(b,c) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,c,b,VRONE,0),b,VRONE,270)
-#else
-/* Use inline functions instead of macros to avoid replicating inputs */
+
 static inline V VFMACONJ(V b, V c) {
     V m = TYPESUF(svcmla,_x)(MASKA,c,b,VRONE,0);
     return TYPESUF(svcmla,_x)(MASKA,m,b,VRONE,270);
 }
-#endif
 #define VFMSCONJ(b,c) VFMACONJ(b,VNEG(c))
 #define VFNMSCONJ(b,c) VNEG(VFMSCONJ(b,c))

-#if 0
-#define VZMUL(a,b) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0),a,b,90)
-#define VZMULJ(a,b) TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0),a,b,270)
-#define VZMULI(a,b) VZMUL(VCI,VZMUL(a,b))
-#define VZMULIJ(a,b) VZMUL(VCI,VZMULJ(a,b))
-#else
-/* Use inline functions instead of macros to avoid replicating inputs */
 static inline V VZMUL(V a, V b) {
     V m = TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0);
     return TYPESUF(svcmla,_x)(MASKA,m,a,b,90);
@@ -183,17 +165,16 @@ static inline V VZMULJ(V a, V b) {
     V m = TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0);
     return TYPESUF(svcmla,_x)(MASKA,m,a,b,270);
 }
-/* FIXME: there's probably a better way */
+/* there might be a better way */
 static inline V VZMULI(V a, V b) {
     V m = VZMUL(a,b);
-    return VZMUL(VCI,m);
+    return VFMAI(m, VZERO);
 }
-/* FIXME: there's probably a better way */
+/* there might be a better way */
 static inline V VZMULIJ(V a, V b) {
     V m = VZMULJ(a,b);
-    return VZMUL(VCI,m);
+    return VFMAI(m, VZERO);
 }
-#endif

 static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
     (void)aligned_like; /* UNUSED */

From ff3dfb01f038a83ec274ae3a88902ef0fa4de1c5 Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet <gilles@rist.or.jp>
Date: Tue, 16 Apr 2024 15:09:01 +0900
Subject: [PATCH 13/13] try building a sample SVE program

When configured with --enable-sve, try to build a sample SVE program
and abort on failure; otherwise configure succeeds but make will fail.
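[Usage note, not part of the patch: with this check in place, a toolchain without SVE support now fails at configure time with "Cannot build a SVE program, aborting" rather than later during make. A typical invocation would be something like ./configure --enable-sve CFLAGS="-O3 -march=armv8-a+sve"; the -march spelling is an assumption for illustration (it varies by compiler and target), as the patch itself sets no flags.]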
---
 configure.ac  | 10 ++++++++--
 m4/acx_sve.m4 | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 m4/acx_sve.m4

diff --git a/configure.ac b/configure.ac
index e6fd591bf..24f9735a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -237,11 +237,10 @@ AM_CONDITIONAL(HAVE_GENERIC_SIMD256, test "$have_generic_simd256" = "yes")
 AC_ARG_ENABLE(sve, [AC_HELP_STRING([--enable-sve],[enable ARM SVE optimizations])], have_sve=$enableval, have_sve=no)
 if test "$have_sve" = "yes"; then
-	AC_DEFINE(HAVE_SVE,1,[Define to enable ARM SVE optimizations.])
+	AC_DEFINE(HAVE_SVE,1,[Define to enable ARM SVE optimizations])
 fi
 AM_CONDITIONAL(HAVE_SVE, test "$have_sve" = "yes")
-
 dnl FIXME:
 dnl AC_ARG_ENABLE(mips-ps, [AS_HELP_STRING([--enable-mips-ps],[enable MIPS pair-single optimizations])], have_mips_ps=$enableval, have_mips_ps=no)
 dnl if test "$have_mips_ps" = "yes"; then
@@ -683,6 +682,13 @@ if test "$enable_openmp" = "yes"; then
    AX_OPENMP([], [AC_MSG_ERROR([don't know how to enable OpenMP])])
 fi

+if test "$have_sve" = "yes"; then
+   ACX_SVE([sve_ok=yes], [sve_ok=no])
+   if test "$sve_ok" != "yes"; then
+      AC_MSG_ERROR([Cannot build a SVE program, aborting])
+   fi
+fi
+
 AC_ARG_ENABLE(threads, [AS_HELP_STRING([--enable-threads],[compile FFTW SMP threads library])], enable_threads=$enableval, enable_threads=no)
 if test "$enable_threads" = "yes"; then
diff --git a/m4/acx_sve.m4 b/m4/acx_sve.m4
new file mode 100644
index 000000000..6a981fe8e
--- /dev/null
+++ b/m4/acx_sve.m4
@@ -0,0 +1,26 @@
+dnl @synopsis ACX_SVE([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+dnl @summary figure out whether a simple SVE program can be compiled
+dnl @category InstalledPackages
+dnl
+dnl This macro tries to compile a simple SVE program that uses
+dnl the ACLE SVE extensions.
+dnl
+dnl ACTION-IF-FOUND is a list of shell commands to run if a SVE
+dnl program can be compiled, and ACTION-IF-NOT-FOUND is a list of commands
+dnl to run if it cannot.
+dnl
+dnl @version 2024-04-15
+dnl @license GPLWithACException
+dnl @author Gilles Gouaillardet <gilles@rist.or.jp>
+
+AC_DEFUN([ACX_SVE], [
+
+ AC_MSG_CHECKING([whether a SVE program can be compiled])
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>]],
+   [[#if defined(__GNUC__) && !defined(__ARM_FEATURE_SVE)
+#error compiling without SVE support
+#endif]])],[AC_MSG_RESULT([yes])
+   $1],
+   [AC_MSG_RESULT([no])
+   $2])
+])dnl ACX_SVE
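[Reference, not part of the patch: AC_LANG_PROGRAM wraps its two arguments into a main function, so the conftest program this macro asks the compiler to build is roughly the following. This is a sketch; the exact scaffolding autoconf emits varies by version.]

#include <arm_sve.h>

int
main (void)
{
/* With GCC-compatible compilers, fail unless SVE support is enabled. */
#if defined(__GNUC__) && !defined(__ARM_FEATURE_SVE)
#error compiling without SVE support
#endif
  ;
  return 0;
}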