From 0a3490c382ba63accf4fe4abc1744c63ec6717cf Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Sat, 25 Jul 2020 13:05:43 +0200
Subject: [PATCH 01/13] Clean rebuild of the arm-sve-alt branch (at
 3b1a5c7468af05f1ce20c3b48a82e0948d093dfe)

---
 Makefile.am                       |  17 +-
 api/version.c                     |   4 +
 configure.ac                      |  16 +
 dft/codelet-dft.h                 |   5 +
 dft/conf.c                        |  12 +
 dft/simd/Makefile.am              |   2 +-
 dft/simd/sve1024/Makefile.am      |  13 +
 dft/simd/sve128/Makefile.am       |  13 +
 dft/simd/sve2048/Makefile.am      |  13 +
 dft/simd/sve256/Makefile.am       |  13 +
 dft/simd/sve512/Makefile.am       |  13 +
 kernel/ifftw.h                    |   1 +
 rdft/codelet-rdft.h               |   5 +
 rdft/conf.c                       |  12 +
 rdft/simd/Makefile.am             |   2 +-
 rdft/simd/sve1024/Makefile.am     |  13 +
 rdft/simd/sve128/Makefile.am      |  13 +
 rdft/simd/sve2048/Makefile.am     |  13 +
 rdft/simd/sve256/Makefile.am      |  13 +
 rdft/simd/sve512/Makefile.am      |  13 +
 simd-support/Makefile.am          |   3 +-
 simd-support/generate_vtw.c       |  79 ++++
 simd-support/generate_vtw.sh      |  13 +
 simd-support/simd-common.h        |   2 +-
 simd-support/simd-maskedsve.h     | 305 +++++++++++++
 simd-support/simd-maskedsve1024.h |  31 ++
 simd-support/simd-maskedsve128.h  |  31 ++
 simd-support/simd-maskedsve2048.h |  31 ++
 simd-support/simd-maskedsve256.h  |  31 ++
 simd-support/simd-maskedsve512.h  |  31 ++
 simd-support/sve.c                |  49 ++
 simd-support/vtw.h                | 729 ++++++++++++++++++++++++++++++
 32 files changed, 1536 insertions(+), 5 deletions(-)
 create mode 100644 dft/simd/sve1024/Makefile.am
 create mode 100644 dft/simd/sve128/Makefile.am
 create mode 100644 dft/simd/sve2048/Makefile.am
 create mode 100644 dft/simd/sve256/Makefile.am
 create mode 100644 dft/simd/sve512/Makefile.am
 create mode 100644 rdft/simd/sve1024/Makefile.am
 create mode 100644 rdft/simd/sve128/Makefile.am
 create mode 100644 rdft/simd/sve2048/Makefile.am
 create mode 100644 rdft/simd/sve256/Makefile.am
 create mode 100644 rdft/simd/sve512/Makefile.am
 create mode 100644 simd-support/generate_vtw.c
 create mode 100755 simd-support/generate_vtw.sh
 create mode 100644 simd-support/simd-maskedsve.h
 create mode 100644 simd-support/simd-maskedsve1024.h
 create mode 100644 simd-support/simd-maskedsve128.h
 create mode 100644 simd-support/simd-maskedsve2048.h
 create mode 100644 simd-support/simd-maskedsve256.h
 create mode 100644 simd-support/simd-maskedsve512.h
 create mode 100644 simd-support/sve.c
 create mode 100644 simd-support/vtw.h

diff --git a/Makefile.am b/Makefile.am
index eaf131cca..1704670d3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -94,6 +94,21 @@ NEON_LIBS = dft/simd/neon/libdft_neon_codelets.la	\
 rdft/simd/neon/librdft_neon_codelets.la
 endif
 
+if HAVE_SVE
+SVE_LIBS = \
+dft/simd/sve128/libdft_sve128_codelets.la \
+rdft/simd/sve128/librdft_sve128_codelets.la \
+dft/simd/sve256/libdft_sve256_codelets.la \
+rdft/simd/sve256/librdft_sve256_codelets.la \
+dft/simd/sve512/libdft_sve512_codelets.la \
+rdft/simd/sve512/librdft_sve512_codelets.la \
+dft/simd/sve1024/libdft_sve1024_codelets.la \
+rdft/simd/sve1024/librdft_sve1024_codelets.la \
+dft/simd/sve2048/libdft_sve2048_codelets.la \
+rdft/simd/sve2048/librdft_sve2048_codelets.la
+endif
+
 if HAVE_GENERIC_SIMD128
 GENERIC_SIMD128_LIBS = dft/simd/generic-simd128/libdft_generic_simd128_codelets.la \
 rdft/simd/generic-simd128/librdft_generic_simd128_codelets.la
@@ -126,7 +141,7 @@ libfftw3@PREC_SUFFIX@_la_LIBADD =			\
 	api/libapi.la					\
         $(SIMD_LIBS) $(SSE2_LIBS) $(AVX_LIBS) $(AVX_128_FMA_LIBS) \
         $(AVX2_LIBS) $(ALTIVEC_LIBS) \
-        $(VSX_LIBS) $(NEON_LIBS) $(KCVI_LIBS) $(AVX512_LIBS) \
+        $(VSX_LIBS) $(NEON_LIBS) $(SVE_LIBS) $(KCVI_LIBS) $(AVX512_LIBS) \
         $(GENERIC_SIMD128_LIBS) $(GENERIC_SIMD256_LIBS) \
 	$(COMBINED_THREADLIBS)
 
diff --git a/api/version.c b/api/version.c
index 4f14de157..0f79ce759 100644
--- a/api/version.c
+++ b/api/version.c
@@ -77,6 +77,10 @@ const char X(version)[] = PACKAGE "-" PACKAGE_VERSION
    "-neon"
 #endif
 
+#if HAVE_SVE
+   "-sve"
+#endif
+
 #if defined(HAVE_GENERIC_SIMD128)
    "-generic_simd128"
 #endif
diff --git a/configure.ac b/configure.ac
index 5e33b1c2b..e6fd591bf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -235,6 +235,12 @@ if test "$have_generic_simd256" = "yes"; then
 fi
 AM_CONDITIONAL(HAVE_GENERIC_SIMD256, test "$have_generic_simd256" = "yes")
 
+AC_ARG_ENABLE(sve, [AS_HELP_STRING([--enable-sve],[enable ARM SVE optimizations])], have_sve=$enableval, have_sve=no)
+if test "$have_sve" = "yes"; then
+        AC_DEFINE(HAVE_SVE,1,[Define to enable ARM SVE optimizations.])
+fi
+AM_CONDITIONAL(HAVE_SVE, test "$have_sve" = "yes")
+
 
 dnl FIXME:
 dnl AC_ARG_ENABLE(mips-ps, [AS_HELP_STRING([--enable-mips-ps],[enable MIPS pair-single optimizations])], have_mips_ps=$enableval, have_mips_ps=no)
@@ -766,6 +772,11 @@ AC_CONFIG_FILES([
    dft/simd/altivec/Makefile
    dft/simd/vsx/Makefile
    dft/simd/neon/Makefile
+   dft/simd/sve128/Makefile
+   dft/simd/sve256/Makefile
+   dft/simd/sve512/Makefile
+   dft/simd/sve1024/Makefile
+   dft/simd/sve2048/Makefile
    dft/simd/generic-simd128/Makefile
    dft/simd/generic-simd256/Makefile
 
@@ -786,6 +797,11 @@ AC_CONFIG_FILES([
    rdft/simd/altivec/Makefile
    rdft/simd/vsx/Makefile
    rdft/simd/neon/Makefile
+   rdft/simd/sve128/Makefile
+   rdft/simd/sve256/Makefile
+   rdft/simd/sve512/Makefile
+   rdft/simd/sve1024/Makefile
+   rdft/simd/sve2048/Makefile
    rdft/simd/generic-simd128/Makefile
    rdft/simd/generic-simd256/Makefile
 
diff --git a/dft/codelet-dft.h b/dft/codelet-dft.h
index b78e135c8..2ba9bcb66 100644
--- a/dft/codelet-dft.h
+++ b/dft/codelet-dft.h
@@ -106,6 +106,11 @@ extern const solvtab X(solvtab_dft_kcvi);
 extern const solvtab X(solvtab_dft_altivec);
 extern const solvtab X(solvtab_dft_vsx);
 extern const solvtab X(solvtab_dft_neon);
+extern const solvtab X(solvtab_dft_sve128);
+extern const solvtab X(solvtab_dft_sve256);
+extern const solvtab X(solvtab_dft_sve512);
+extern const solvtab X(solvtab_dft_sve1024);
+extern const solvtab X(solvtab_dft_sve2048);
 extern const solvtab X(solvtab_dft_generic_simd128);
 extern const solvtab X(solvtab_dft_generic_simd256);
 
diff --git a/dft/conf.c b/dft/conf.c
index d0951de5d..2cad5c0cd 100644
--- a/dft/conf.c
+++ b/dft/conf.c
@@ -79,6 +79,18 @@ void X(dft_conf_standard)(planner *p)
      if (X(have_simd_neon)())
 	  X(solvtab_exec)(X(solvtab_dft_neon), p);
 #endif
+#if HAVE_SVE
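+     /* register each fixed-width SVE solvtab the hardware is wide enough for */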
+     if (X(have_simd_sve)(128))
+          X(solvtab_exec)(X(solvtab_dft_sve128), p);
+     if (X(have_simd_sve)(256))
+          X(solvtab_exec)(X(solvtab_dft_sve256), p);
+     if (X(have_simd_sve)(512))
+          X(solvtab_exec)(X(solvtab_dft_sve512), p);
+     if (X(have_simd_sve)(1024))
+          X(solvtab_exec)(X(solvtab_dft_sve1024), p);
+     if (X(have_simd_sve)(2048))
+          X(solvtab_exec)(X(solvtab_dft_sve2048), p);
+#endif
 #if HAVE_GENERIC_SIMD128
      X(solvtab_exec)(X(solvtab_dft_generic_simd128), p);
 #endif
diff --git a/dft/simd/Makefile.am b/dft/simd/Makefile.am
index 315d74474..7b5f28b1b 100644
--- a/dft/simd/Makefile.am
+++ b/dft/simd/Makefile.am
@@ -1,4 +1,4 @@
 AM_CPPFLAGS = -I $(top_srcdir)
-SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256
+SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon sve128 sve256 sve512 sve1024 sve2048 generic-simd128 generic-simd256
 EXTRA_DIST = n1b.h n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h	\
 t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk
diff --git a/dft/simd/sve1024/Makefile.am b/dft/simd/sve1024/Makefile.am
new file mode 100644
index 000000000..89b996197
--- /dev/null
+++ b/dft/simd/sve1024/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve1024.h
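+# the header above pins SVE_SIZE to 1024 bits for this codelet set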
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_SVE
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_sve1024_codelets.la
+libdft_sve1024_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/dft/simd/sve128/Makefile.am b/dft/simd/sve128/Makefile.am
new file mode 100644
index 000000000..9609917bf
--- /dev/null
+++ b/dft/simd/sve128/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve128.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_SVE
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_sve128_codelets.la
+libdft_sve128_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/dft/simd/sve2048/Makefile.am b/dft/simd/sve2048/Makefile.am
new file mode 100644
index 000000000..f633df29b
--- /dev/null
+++ b/dft/simd/sve2048/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve2048.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_SVE
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_sve2048_codelets.la
+libdft_sve2048_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/dft/simd/sve256/Makefile.am b/dft/simd/sve256/Makefile.am
new file mode 100644
index 000000000..6f21f3a3b
--- /dev/null
+++ b/dft/simd/sve256/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve256.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_SVE
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_sve256_codelets.la
+libdft_sve256_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/dft/simd/sve512/Makefile.am b/dft/simd/sve512/Makefile.am
new file mode 100644
index 000000000..dc57ffb5e
--- /dev/null
+++ b/dft/simd/sve512/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve512.h
+
+include $(top_srcdir)/dft/simd/codlist.mk
+include $(top_srcdir)/dft/simd/simd.mk
+
+if HAVE_SVE
+
+BUILT_SOURCES = $(EXTRA_DIST)
+noinst_LTLIBRARIES = libdft_sve512_codelets.la
+libdft_sve512_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/kernel/ifftw.h b/kernel/ifftw.h
index b4705ba8d..bd6efacc7 100644
--- a/kernel/ifftw.h
+++ b/kernel/ifftw.h
@@ -119,6 +119,7 @@ extern int X(have_simd_avx512)(void);
 extern int X(have_simd_altivec)(void);
 extern int X(have_simd_vsx)(void);
 extern int X(have_simd_neon)(void);
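+/* SVE vector width is implementation-defined; probe for at least minwidth bits */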
+extern int X(have_simd_sve)(int minwidth);
 
 /* forward declarations */
 typedef struct problem_s problem;
diff --git a/rdft/codelet-rdft.h b/rdft/codelet-rdft.h
index 789040f65..07b62312b 100644
--- a/rdft/codelet-rdft.h
+++ b/rdft/codelet-rdft.h
@@ -145,6 +145,11 @@ extern const solvtab X(solvtab_rdft_kcvi);
 extern const solvtab X(solvtab_rdft_altivec);
 extern const solvtab X(solvtab_rdft_vsx);
 extern const solvtab X(solvtab_rdft_neon);
+extern const solvtab X(solvtab_rdft_sve128);
+extern const solvtab X(solvtab_rdft_sve256);
+extern const solvtab X(solvtab_rdft_sve512);
+extern const solvtab X(solvtab_rdft_sve1024);
+extern const solvtab X(solvtab_rdft_sve2048);
 extern const solvtab X(solvtab_rdft_generic_simd128);
 extern const solvtab X(solvtab_rdft_generic_simd256);
 
diff --git a/rdft/conf.c b/rdft/conf.c
index 5fe8d665f..752d25820 100644
--- a/rdft/conf.c
+++ b/rdft/conf.c
@@ -96,6 +96,18 @@ void X(rdft_conf_standard)(planner *p)
      if (X(have_simd_neon)())
 	  X(solvtab_exec)(X(solvtab_rdft_neon), p);
 #endif
+#if HAVE_SVE
+     if (X(have_simd_sve)(128))
+          X(solvtab_exec)(X(solvtab_rdft_sve128), p);
+     if (X(have_simd_sve)(256))
+          X(solvtab_exec)(X(solvtab_rdft_sve256), p);
+     if (X(have_simd_sve)(512))
+          X(solvtab_exec)(X(solvtab_rdft_sve512), p);
+     if (X(have_simd_sve)(1024))
+          X(solvtab_exec)(X(solvtab_rdft_sve1024), p);
+     if (X(have_simd_sve)(2048))
+          X(solvtab_exec)(X(solvtab_rdft_sve2048), p);
+#endif
 #if HAVE_GENERIC_SIMD128
      X(solvtab_exec)(X(solvtab_rdft_generic_simd128), p);
 #endif
diff --git a/rdft/simd/Makefile.am b/rdft/simd/Makefile.am
index 53de164f0..42dc4d74f 100644
--- a/rdft/simd/Makefile.am
+++ b/rdft/simd/Makefile.am
@@ -1,4 +1,4 @@
 
 AM_CPPFLAGS = -I $(top_srcdir)
-SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256
+SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon sve128 sve256 sve512 sve1024 sve2048 generic-simd128 generic-simd256
 EXTRA_DIST = hc2cbv.h hc2cfv.h codlist.mk simd.mk
diff --git a/rdft/simd/sve1024/Makefile.am b/rdft/simd/sve1024/Makefile.am
new file mode 100644
index 000000000..e02438d1a
--- /dev/null
+++ b/rdft/simd/sve1024/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve1024.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_SVE
+
+noinst_LTLIBRARIES = librdft_sve1024_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_sve1024_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/rdft/simd/sve128/Makefile.am b/rdft/simd/sve128/Makefile.am
new file mode 100644
index 000000000..3bc5216d4
--- /dev/null
+++ b/rdft/simd/sve128/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve128.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_SVE
+
+noinst_LTLIBRARIES = librdft_sve128_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_sve128_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/rdft/simd/sve2048/Makefile.am b/rdft/simd/sve2048/Makefile.am
new file mode 100644
index 000000000..025aa4de4
--- /dev/null
+++ b/rdft/simd/sve2048/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve2048.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_SVE
+
+noinst_LTLIBRARIES = librdft_sve2048_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_sve2048_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/rdft/simd/sve256/Makefile.am b/rdft/simd/sve256/Makefile.am
new file mode 100644
index 000000000..c58adb578
--- /dev/null
+++ b/rdft/simd/sve256/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve256.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_SVE
+
+noinst_LTLIBRARIES = librdft_sve256_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_sve256_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/rdft/simd/sve512/Makefile.am b/rdft/simd/sve512/Makefile.am
new file mode 100644
index 000000000..db9c030cb
--- /dev/null
+++ b/rdft/simd/sve512/Makefile.am
@@ -0,0 +1,13 @@
+AM_CFLAGS = $(SVE_CFLAGS)
+SIMD_HEADER=simd-support/simd-maskedsve512.h
+
+include $(top_srcdir)/rdft/simd/codlist.mk
+include $(top_srcdir)/rdft/simd/simd.mk
+
+if HAVE_SVE
+
+noinst_LTLIBRARIES = librdft_sve512_codelets.la
+BUILT_SOURCES = $(EXTRA_DIST)
+librdft_sve512_codelets_la_SOURCES = $(BUILT_SOURCES)
+
+endif
diff --git a/simd-support/Makefile.am b/simd-support/Makefile.am
index 26db46e93..60b705377 100644
--- a/simd-support/Makefile.am
+++ b/simd-support/Makefile.am
@@ -11,5 +11,6 @@ avx512.c simd-avx512.h \
 kcvi.c simd-kcvi.h \
 altivec.c simd-altivec.h vsx.c simd-vsx.h \
 neon.c simd-neon.h \
-simd-generic128.h simd-generic256.h
+simd-generic128.h simd-generic256.h \
+sve.c simd-maskedsve.h simd-maskedsve128.h simd-maskedsve256.h simd-maskedsve512.h simd-maskedsve1024.h simd-maskedsve2048.h
 
diff --git a/simd-support/generate_vtw.c b/simd-support/generate_vtw.c
new file mode 100644
index 000000000..505a5804c
--- /dev/null
+++ b/simd-support/generate_vtw.c
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
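+/* round size down to the largest power of two <= size (nonzero 32-bit input) */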
+unsigned int rp2(unsigned int size) {
+  size = size | (size >> 1);
+  size = size | (size >> 2);
+  size = size | (size >> 4);
+  size = size | (size >> 8);
+  size = size | (size >> 16);
+//  size = size | (size >> 32);
+  size = size - (size >> 1);
+  return size;
+}
+
+int main(int argc, char **argv) {
+	if (argc < 3) {
+		printf("usage: %s <array_name> <width>\n", argv[0]);
+		exit(-1);
+	}
+	if (strncmp(argv[1], "VTW1", 4) == 0) {
+		unsigned int osize = atoi(argv[2]);
+		unsigned int size = rp2(osize);
+		if (osize != size)
+			exit(-4);
+		if (size < 1)
+			exit(-2);
+		if (size > 256)
+			exit(-3);
+		printf("#define VTW1(v,x) ");
+		for (unsigned int i = 0 ; i < size ; i++) {
+			printf("{TW_CEXP, v+%u, x}%s%s", i, (i == size-1?"":","), ((i%4==3 && i!=size-1)?" \\\n\t":" "));
+		}
+		printf("\n");
+	}
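+	/* VTW2 stores each twiddle twice, once per real slot of a complex lane */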
+	if (strncmp(argv[1], "VTW2", 4) == 0) {
+		unsigned int osize = atoi(argv[2]);
+		unsigned int size = rp2(osize);
+		if (osize != size)
+			exit(-4);
+		if (size < 1)
+			exit(-2);
+		if (size > 256)
+			exit(-3);
+		printf("#define VTW2(v,x) ");
+		for (unsigned int i = 0 ; i < size ; i++) {
+			printf("{TW_COS, v+%u, x}%s%s", i/2, ",", ((i%4==3)?" \\\n\t":" "));
+		}
+		for (unsigned int i = 0 ; i < size ; i++) {
+			printf("{TW_SIN, v+%u, %sx}%s%s", i/2, (i%2==0?"-":""), (i == size-1?"":","), ((i%4==3 && i!=size-1)?" \\\n\t":" "));
+		}
+		printf("\n");
+	}
+	if (strncmp(argv[1], "VTWS", 4) == 0) {
+		unsigned int osize = atoi(argv[2]);
+		unsigned int size = rp2(osize);
+		if (osize != size)
+			exit(-4);
+		if (size < 1)
+			exit(-2);
+		if (size > 256)
+			exit(-3);
+		printf("#define VTWS(v,x) ");
+		for (unsigned int i = 0 ; i < size ; i++) {
+			printf("{TW_COS, v+%u, x}%s%s", i, ",", ((i%4==3)?" \\\n\t":" "));
+		}
+		for (unsigned int i = 0 ; i < size ; i++) {
+			printf("{TW_SIN, v+%u, x}%s%s", i, (i == size-1?"":","), ((i%4==3 && i!=size-1)?" \\\n\t":" "));
+		}
+		printf("\n");
+	}
+
+	return 0;
+}
diff --git a/simd-support/generate_vtw.sh b/simd-support/generate_vtw.sh
new file mode 100755
index 000000000..f4a1cfa15
--- /dev/null
+++ b/simd-support/generate_vtw.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
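+# Emits the VTW* macro tables; presumably used as:
+#   cc -o generate_vtw generate_vtw.c && ./generate_vtw.sh > vtw.h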
+
+echo "/* auto-generated */"
+for A in VTW1 VTW2 VTWS; do
+	echo "#if defined(REQ_$A)"
+	for X in 1 2 4 8 16 32 64 128 256; do
+		echo "#if defined(VTW_SIZE) && VTW_SIZE == $X"
+		echo "#warning \"using $A with $X\""
+		./generate_vtw "$A" "$X"
+		echo "#endif // VTW_SIZE == $X"
+	done
+	echo "#endif // REQ_$A"
+done
diff --git a/simd-support/simd-common.h b/simd-support/simd-common.h
index ad2c96fa1..147d9f692 100644
--- a/simd-support/simd-common.h
+++ b/simd-support/simd-common.h
@@ -34,7 +34,7 @@
 #elif defined(HAVE_ALTIVEC)
 #  define ALIGNMENT 8     /* Alignment for the LD/ST macros */
 #  define ALIGNMENTA 16   /* Alignment for the LDA/STA macros */
-#elif defined(HAVE_NEON) || defined(HAVE_VSX)
+#elif defined(HAVE_NEON) || defined(HAVE_VSX) || defined(HAVE_SVE)
 #  define ALIGNMENT 8     /* Alignment for the LD/ST macros */
 #  define ALIGNMENTA 8    /* Alignment for the LDA/STA macros */
 #elif defined(HAVE_KCVI)
diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
new file mode 100644
index 000000000..459d2bb8b
--- /dev/null
+++ b/simd-support/simd-maskedsve.h
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * ARM SVE support implemented by Romain Dolbeau. (c) 2017 Romain Dolbeau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
+#error "SVE vector instructions only work in single or double precision"
+#endif
+
+#ifdef FFTW_SINGLE
+#  define DS(d,s) s /* single-precision option */
+#  define TYPE(name) name ## _f32
+#  define TYPESUF(name,suf) name ## _f32 ## suf
+#  define ALLA	svptrue_b32()
+#else /* !FFTW_SINGLE */
+#  define DS(d,s) d /* double-precision option */
+#  define TYPE(name) name ## _f64
+#  define TYPESUF(name,suf) name ## _f64 ## suf
+#  define ALLA  svptrue_b64()
+#endif /* FFTW_SINGLE */
+
+//#define SIMD_SUFFIX  _sve  /* for renaming */
+#if SVE_SIZE == 2048
+#define VL DS(16, 32)        /* SIMD complex vector length */
+#define MASKA DS(svptrue_pat_b64(SV_VL32),svptrue_pat_b32(SV_VL64))
+#elif SVE_SIZE == 1024
+#define VL DS(8, 16)        /* SIMD complex vector length */
+#define MASKA DS(svptrue_pat_b64(SV_VL16),svptrue_pat_b32(SV_VL32))
+#elif SVE_SIZE == 512
+#define VL DS(4, 8)        /* SIMD complex vector length */
+#define MASKA DS(svptrue_pat_b64(SV_VL8),svptrue_pat_b32(SV_VL16))
+#elif SVE_SIZE == 256
+#define VL DS(2, 4)        /* SIMD complex vector length */
+#define MASKA DS(svptrue_pat_b64(SV_VL4),svptrue_pat_b32(SV_VL8))
+#elif SVE_SIZE == 128
+#define VL DS(1, 2)        /* SIMD complex vector length */
+#define MASKA DS(svptrue_pat_b64(SV_VL2),svptrue_pat_b32(SV_VL4))
+#else /* SVE_SIZE */
+#error "SVE_SIZE must be 128, 256, 512, 1024, or 2048 bits"
+#endif /* SVE_SIZE */
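+/* VL counts complex elements per vector: SVE_SIZE bits / (2 reals of 64 or
+   32 bits).  MASKA is a predicate enabling exactly that many lanes, so a
+   fixed logical width runs unchanged on any hardware at least that wide. */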
+#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
+#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
+
+#if defined(__GNUC__) && !defined(__ARM_FEATURE_SVE) /* sanity check */
+#error "compiling simd-maskedsve.h without SVE support"
+#endif
+
+#include <arm_sve.h>
+
+typedef DS(svfloat64_t, svfloat32_t) V;
+
+#define VLIT(re, im) DS(svdupq_n_f64(re,im),svdupq_n_f32(re,im,re,im))
+#define VLIT1(val) DS(svdup_n_f64(val), svdup_n_f32(val))
+#define LDK(x) x
+#define DVK(var, val) V var = VLIT1(val)
+#define VZERO VLIT1(DS(0.,0.f))
+#define VRONE VLIT(DS(1.,1.f),DS(0.,0.f))
+#define VCI VLIT(DS(0.,0.f),DS(1.,1.f))
+#define VCONEMI VLIT(DS(1.,1.f),DS(-1.,-1.f))
+#define VONE  VLIT1(DS(1.,1.f))
+#define VMINUSONE VLIT1(DS(-1.,-1.f))
+
+#define VDUPL(x) TYPE(svtrn1)(x,x)
+#define VDUPH(x) TYPE(svtrn2)(x,x)
+
+/* in single precision this could also be
+   svreinterpret_f32_u64(svrevw_u64_x(MASKA,svreinterpret_u64_f32(x))) */
+#define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x)
+
+/* FIXME: there is a better way, surely */
+/* #define VCONJ(x)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VRONE,0),x,VRONE,270) */
+#define VCONJ(x) TYPESUF(svmul,_x)(MASKA,x,VCONEMI)
+#define VBYI(x)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VCI,0),x,VCI,90)
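+/* VBYI is a full complex multiply by VCI == i, via the usual two-svcmla
+   idiom (rotations of 0 and 90 degrees) */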
+
+#define VNEG(a)   TYPESUF(svneg,_x)(MASKA,a)
+#define VADD(a,b) TYPESUF(svadd,_x)(MASKA,a,b)
+#define VSUB(a,b) TYPESUF(svsub,_x)(MASKA,a,b)
+#define VMUL(a,b) TYPESUF(svmul,_x)(MASKA,a,b)
+#define VFMA(a, b, c)  TYPESUF(svmad,_x)(MASKA,b,a,c)
+#define VFMS(a, b, c)  TYPESUF(svnmsb,_x)(MASKA,b,a,c)
+#define VFNMS(a, b, c) TYPESUF(svmsb,_x)(MASKA,b,a,c)
+#define VFMAI(b, c)    TYPESUF(svcadd,_x)(MASKA,c,b,90)
+#define VFNMSI(b, c)   TYPESUF(svcadd,_x)(MASKA,c,b,270)
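+/* operand order: ACLE svmad(pg,a,b,c) computes a*b+c, svnmsb a*b-c and
+   svmsb c-a*b, hence the swapped first two arguments above */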
+/* FIXME: are the next 3 overkill? */
+#if 0
+#define VFMACONJ(b,c)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,c,b,VRONE,0),b,VRONE,270)
+#else
+/* Use inline functions instead of macros to avoid replicating inputs */
+static inline V VFMACONJ(V b, V c) {
+	V m = TYPESUF(svcmla,_x)(MASKA,c,b,VRONE,0);
+	return TYPESUF(svcmla,_x)(MASKA,m,b,VRONE,270);
+}
+#endif
+#define VFMSCONJ(b,c)  VFMACONJ(b,VNEG(c))
+#define VFNMSCONJ(b,c) VNEG(VFMSCONJ(b,c))
+
+#if 0
+#define VZMUL(a,b)    TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0),a,b,90)
+#define VZMULJ(a,b)   TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0),a,b,270)
+#define VZMULI(a,b)   VZMUL(VCI,VZMUL(a,b))
+#define VZMULIJ(a,b)   VZMUL(VCI,VZMULJ(a,b))
+#else
+/* Use inline functions instead of macros to avoid replicating inputs */
+static inline V VZMUL(V a, V b) {
+	V m = TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0);
+	return TYPESUF(svcmla,_x)(MASKA,m,a,b,90);
+}
+static inline V VZMULJ(V a, V b) {
+	V m = TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0);
+	return TYPESUF(svcmla,_x)(MASKA,m,a,b,270);
+}
+/* FIXME: there's probably a better way */
+static inline V VZMULI(V a, V b) {
+	V m = VZMUL(a,b);
+	return VZMUL(VCI,m);
+}
+/* FIXME: there's probably a better way */
+static inline V VZMULIJ(V a, V b) {
+	V m = VZMULJ(a,b);
+	return VZMUL(VCI,m);
+}
+#endif
+
+static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
+  (void)aligned_like; /* UNUSED */
+  (void)ivs; /* UNUSED */
+  return TYPE(svld1)(MASKA,x);
+}
+static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
+  (void)aligned_like; /* UNUSED */
+  (void)ovs; /* UNUSED */
+  TYPE(svst1)(MASKA,x,v);
+}
+
+#ifdef FFTW_SINGLE
+
+static inline V LDu(const R *x, INT ivs, const R *aligned_like)
+{
+  (void)aligned_like; /* UNUSED */
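+  /* build byte offsets {0,4, s,s+4, 2s,2s+4, ...} with s = ivs*sizeof(R),
+     so each (re,im) pair is gathered from consecutive reals at stride ivs */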
+  svuint32_t  gvvl = svindex_u32(0, 1);
+  gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ivs);
+  gvvl = svzip1_u32(gvvl, gvvl);
+  gvvl = svadd_u32_x(svptrue_b32(), gvvl, svdupq_n_u32(0,sizeof(R),0,sizeof(R)));
+  
+  return svld1_gather_u32offset_f32(MASKA, x, gvvl);
+}
+
+static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
+{
+  (void)aligned_like; /* UNUSED */
+  if (ovs==0) { // FIXME: hack to support the extra_iter hack
+    v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0));
+  }
+  svuint32_t  gvvl = svindex_u32(0, 1);
+  gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ovs);
+  gvvl = svzip1_u32(gvvl, gvvl);
+  gvvl = svadd_u32_x(svptrue_b32(), gvvl, svdupq_n_u32(0,sizeof(R),0,sizeof(R)));
+
+  svst1_scatter_u32offset_f32(MASKA, x, gvvl, v);
+}
+
+#else /* !FFTW_SINGLE */
+
+static inline V LDu(const R *x, INT ivs, const R *aligned_like)
+{
+  (void)aligned_like; /* UNUSED */
+  svuint64_t  gvvl = svindex_u64(0, 1);
+  gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ivs);
+  gvvl = svzip1_u64(gvvl, gvvl);
+  gvvl = svadd_u64_x(svptrue_b64(), gvvl, svdupq_n_u64(0,sizeof(R)));
+
+  return svld1_gather_u64offset_f64(MASKA, x, gvvl);
+}
+
+static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
+{
+  (void)aligned_like; /* UNUSED */
+  if (ovs==0) { // FIXME: hack to support the extra_iter hack
+    v = svdupq_lane_f64(v,0);
+  }
+  svuint64_t  gvvl = svindex_u64(0, 1);
+  gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ovs);
+  gvvl = svzip1_u64(gvvl, gvvl);
+  gvvl = svadd_u64_x(svptrue_b64(), gvvl, svdupq_n_u64(0,sizeof(R)));
+
+  svst1_scatter_u64offset_f64(MASKA, x, gvvl, v);
+}
+
+#endif /* FFTW_SINGLE */
+
+#define LD LDu
+#define ST STu
+
+#ifdef FFTW_SINGLE
+#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
+#define STN2(x, v0, v1, ovs) /* nop */
+
+static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
+{
+  (void)aligned_like; /* UNUSED */
+  svuint32_t  gvvl = svindex_u32(0, 1);
+  gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ovs);
+
+  svst1_scatter_u32offset_f32(MASKA, x, gvvl, v);
+}
+#define STN4(x, v0, v1, v2, v3, ovs)  /* no-op */
+#else /* !FFTW_SINGLE */
+#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
+#define STN2(x, v0, v1, ovs) /* nop */
+
+static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
+{
+  (void)aligned_like; /* UNUSED */
+  svuint64_t  gvvl = svindex_u64(0, 1);
+  gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ovs);
+
+  svst1_scatter_u64offset_f64(MASKA, x, gvvl, v);
+}
+#define STN4(x, v0, v1, v2, v3, ovs)  /* no-op */
+#endif /* FFTW_SINGLE */
+
+/* twiddle storage #1: compact, slower */
+#define REQ_VTW1
+#define VTW_SIZE VL
+#include "vtw.h"
+#define TWVL1 (VL)
+#undef VTW_SIZE
+#undef REQ_VTW1
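+/* e.g. with VL == 4, vtw.h expands VTW1(v,x) to
+   {TW_CEXP, v+0, x}, ..., {TW_CEXP, v+3, x} */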
+
+static inline V BYTW1(const R *t, V sr)
+{
+     return VZMUL(LDA(t, 2, t), sr);
+}
+
+static inline V BYTWJ1(const R *t, V sr)
+{
+     return VZMULJ(LDA(t, 2, t), sr);
+}
+
+/* twiddle storage #2: twice the space, faster (when in cache) */
+#define REQ_VTW2
+#define VTW_SIZE (2*VL)
+#include "vtw.h"
+#define TWVL2 (2*VL)
+#undef VTW_SIZE
+#undef REQ_VTW2
+
+static inline V BYTW2(const R *t, V sr)
+{
+     V si = FLIP_RI(sr);
+     V ti = LDA(t + 2*VL, 2, t + 4*VL);
+     V tr = LDA(t, 2, t);
+     return VFMA(tr, sr, VMUL(ti, si));
+}
+
+static inline V BYTWJ2(const R *t, V sr)
+{
+     V si = FLIP_RI(sr);
+     V ti = LDA(t + 2*VL, 2, t + 4*VL);
+     V tr = LDA(t, 2, t);
+     return VFNMS(ti, si, VMUL(tr, sr));
+}
+
+/* twiddle storage #3 */
+#define VTW3(v,x) VTW1(v,x)
+#define TWVL3 TWVL1
+
+/* twiddle storage for split arrays */
+#define REQ_VTWS
+#define VTW_SIZE (2*VL)
+#include "vtw.h"
+#define TWVLS (2*VL)
+#undef VTW_SIZE
+#undef REQ_VTWS
+
+#define VLEAVE() /* nothing */
+
+#include "simd-common.h"
diff --git a/simd-support/simd-maskedsve1024.h b/simd-support/simd-maskedsve1024.h
new file mode 100644
index 000000000..736eaf135
--- /dev/null
+++ b/simd-support/simd-maskedsve1024.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+
+#define SIMD_SUFFIX  _sve1024  /* for renaming */
+#define SVE_SIZE 1024
+#include "simd-maskedsve.h"
+
diff --git a/simd-support/simd-maskedsve128.h b/simd-support/simd-maskedsve128.h
new file mode 100644
index 000000000..a97ffe400
--- /dev/null
+++ b/simd-support/simd-maskedsve128.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+
+#define SIMD_SUFFIX  _sve128  /* for renaming */
+#define SVE_SIZE 128
+#include "simd-maskedsve.h"
+
diff --git a/simd-support/simd-maskedsve2048.h b/simd-support/simd-maskedsve2048.h
new file mode 100644
index 000000000..966a46614
--- /dev/null
+++ b/simd-support/simd-maskedsve2048.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+
+#define SIMD_SUFFIX  _sve2048  /* for renaming */
+#define SVE_SIZE 2048
+#include "simd-maskedsve.h"
+
diff --git a/simd-support/simd-maskedsve256.h b/simd-support/simd-maskedsve256.h
new file mode 100644
index 000000000..e36be395b
--- /dev/null
+++ b/simd-support/simd-maskedsve256.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+
+#define SIMD_SUFFIX  _sve256  /* for renaming */
+#define SVE_SIZE 256
+#include "simd-maskedsve.h"
+
diff --git a/simd-support/simd-maskedsve512.h b/simd-support/simd-maskedsve512.h
new file mode 100644
index 000000000..0fc09b944
--- /dev/null
+++ b/simd-support/simd-maskedsve512.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ *
+ * ARM SVE support implemented by Romain Dolbeau. (c) 2017-2019 Romain Dolbeau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+
+#define SIMD_SUFFIX  _sve512  /* for renaming */
+#define SVE_SIZE 512
+#include "simd-maskedsve.h"
+
diff --git a/simd-support/sve.c b/simd-support/sve.c
new file mode 100644
index 000000000..9efc8df5e
--- /dev/null
+++ b/simd-support/sve.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "kernel/ifftw.h"
+
+#if HAVE_SVE
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#endif
+
+  static int sve_getwidth(void) {
+#if defined(__GNUC__) && !defined(__ARM_FEATURE_SVE)
+#warning "SVE not supported"
+    return -1;
+#else
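+    /* svcntb() is the SVE register width in bytes; scale to bits */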
+    return svcntb()*8;
+#endif
+  }
+
+  int X(have_simd_sve)(int minwidth)
+  {
+       static int init = 0;
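+       /* init caches the probed width in bits, or -1 when SVE is absent */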
+
+       if (!init) {
+	    init = sve_getwidth();
+       }
+       return ((init > 0) ? (minwidth <= init ? 1 : 0) : 0);
+  }
+
+#endif
diff --git a/simd-support/vtw.h b/simd-support/vtw.h
new file mode 100644
index 000000000..0c31a32b1
--- /dev/null
+++ b/simd-support/vtw.h
@@ -0,0 +1,729 @@
+/* auto-generated */
+#if defined(REQ_VTW1)
+#if defined(VTW_SIZE) && VTW_SIZE == 1
+#warning "using VTW1 with 1"
+#define VTW1(v,x) {TW_CEXP, v+0, x} 
+#endif // VTW_SIZE == 1
+#if defined(VTW_SIZE) && VTW_SIZE == 2
+#warning "using VTW1 with 2"
+#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x} 
+#endif // VTW_SIZE == 2
+#if defined(VTW_SIZE) && VTW_SIZE == 4
+#warning "using VTW1 with 4"
+#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x} 
+#endif // VTW_SIZE == 4
+#if defined(VTW_SIZE) && VTW_SIZE == 8
+#warning "using VTW1 with 8"
+#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
+	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x} 
+#endif // VTW_SIZE == 8
+#if defined(VTW_SIZE) && VTW_SIZE == 16
+#warning "using VTW1 with 16"
+#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
+	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
+	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
+	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x} 
+#endif // VTW_SIZE == 16
+#if defined(VTW_SIZE) && VTW_SIZE == 32
+#warning "using VTW1 with 32"
+#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
+	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
+	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
+	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \
+	{TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \
+	{TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \
+	{TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \
+	{TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x} 
+#endif // VTW_SIZE == 32
+#if defined(VTW_SIZE) && VTW_SIZE == 64
+#warning "using VTW1 with 64"
+#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
+	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
+	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
+	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \
+	{TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \
+	{TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \
+	{TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \
+	{TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \
+	{TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \
+	{TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \
+	{TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \
+	{TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \
+	{TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \
+	{TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \
+	{TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \
+	{TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x} 
+#endif // VTW_SIZE == 64
+#if defined(VTW_SIZE) && VTW_SIZE == 128
+#warning "using VTW1 with 128"
+#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
+	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
+	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
+	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \
+	{TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \
+	{TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \
+	{TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \
+	{TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \
+	{TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \
+	{TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \
+	{TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \
+	{TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \
+	{TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \
+	{TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \
+	{TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \
+	{TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x}, \
+	{TW_CEXP, v+64, x}, {TW_CEXP, v+65, x}, {TW_CEXP, v+66, x}, {TW_CEXP, v+67, x}, \
+	{TW_CEXP, v+68, x}, {TW_CEXP, v+69, x}, {TW_CEXP, v+70, x}, {TW_CEXP, v+71, x}, \
+	{TW_CEXP, v+72, x}, {TW_CEXP, v+73, x}, {TW_CEXP, v+74, x}, {TW_CEXP, v+75, x}, \
+	{TW_CEXP, v+76, x}, {TW_CEXP, v+77, x}, {TW_CEXP, v+78, x}, {TW_CEXP, v+79, x}, \
+	{TW_CEXP, v+80, x}, {TW_CEXP, v+81, x}, {TW_CEXP, v+82, x}, {TW_CEXP, v+83, x}, \
+	{TW_CEXP, v+84, x}, {TW_CEXP, v+85, x}, {TW_CEXP, v+86, x}, {TW_CEXP, v+87, x}, \
+	{TW_CEXP, v+88, x}, {TW_CEXP, v+89, x}, {TW_CEXP, v+90, x}, {TW_CEXP, v+91, x}, \
+	{TW_CEXP, v+92, x}, {TW_CEXP, v+93, x}, {TW_CEXP, v+94, x}, {TW_CEXP, v+95, x}, \
+	{TW_CEXP, v+96, x}, {TW_CEXP, v+97, x}, {TW_CEXP, v+98, x}, {TW_CEXP, v+99, x}, \
+	{TW_CEXP, v+100, x}, {TW_CEXP, v+101, x}, {TW_CEXP, v+102, x}, {TW_CEXP, v+103, x}, \
+	{TW_CEXP, v+104, x}, {TW_CEXP, v+105, x}, {TW_CEXP, v+106, x}, {TW_CEXP, v+107, x}, \
+	{TW_CEXP, v+108, x}, {TW_CEXP, v+109, x}, {TW_CEXP, v+110, x}, {TW_CEXP, v+111, x}, \
+	{TW_CEXP, v+112, x}, {TW_CEXP, v+113, x}, {TW_CEXP, v+114, x}, {TW_CEXP, v+115, x}, \
+	{TW_CEXP, v+116, x}, {TW_CEXP, v+117, x}, {TW_CEXP, v+118, x}, {TW_CEXP, v+119, x}, \
+	{TW_CEXP, v+120, x}, {TW_CEXP, v+121, x}, {TW_CEXP, v+122, x}, {TW_CEXP, v+123, x}, \
+	{TW_CEXP, v+124, x}, {TW_CEXP, v+125, x}, {TW_CEXP, v+126, x}, {TW_CEXP, v+127, x} 
+#endif // VTW_SIZE == 128
+#if defined(VTW_SIZE) && VTW_SIZE == 256
+#warning "using VTW1 with 256"
+#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
+	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
+	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
+	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \
+	{TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \
+	{TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \
+	{TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \
+	{TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \
+	{TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \
+	{TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \
+	{TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \
+	{TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \
+	{TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \
+	{TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \
+	{TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \
+	{TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x}, \
+	{TW_CEXP, v+64, x}, {TW_CEXP, v+65, x}, {TW_CEXP, v+66, x}, {TW_CEXP, v+67, x}, \
+	{TW_CEXP, v+68, x}, {TW_CEXP, v+69, x}, {TW_CEXP, v+70, x}, {TW_CEXP, v+71, x}, \
+	{TW_CEXP, v+72, x}, {TW_CEXP, v+73, x}, {TW_CEXP, v+74, x}, {TW_CEXP, v+75, x}, \
+	{TW_CEXP, v+76, x}, {TW_CEXP, v+77, x}, {TW_CEXP, v+78, x}, {TW_CEXP, v+79, x}, \
+	{TW_CEXP, v+80, x}, {TW_CEXP, v+81, x}, {TW_CEXP, v+82, x}, {TW_CEXP, v+83, x}, \
+	{TW_CEXP, v+84, x}, {TW_CEXP, v+85, x}, {TW_CEXP, v+86, x}, {TW_CEXP, v+87, x}, \
+	{TW_CEXP, v+88, x}, {TW_CEXP, v+89, x}, {TW_CEXP, v+90, x}, {TW_CEXP, v+91, x}, \
+	{TW_CEXP, v+92, x}, {TW_CEXP, v+93, x}, {TW_CEXP, v+94, x}, {TW_CEXP, v+95, x}, \
+	{TW_CEXP, v+96, x}, {TW_CEXP, v+97, x}, {TW_CEXP, v+98, x}, {TW_CEXP, v+99, x}, \
+	{TW_CEXP, v+100, x}, {TW_CEXP, v+101, x}, {TW_CEXP, v+102, x}, {TW_CEXP, v+103, x}, \
+	{TW_CEXP, v+104, x}, {TW_CEXP, v+105, x}, {TW_CEXP, v+106, x}, {TW_CEXP, v+107, x}, \
+	{TW_CEXP, v+108, x}, {TW_CEXP, v+109, x}, {TW_CEXP, v+110, x}, {TW_CEXP, v+111, x}, \
+	{TW_CEXP, v+112, x}, {TW_CEXP, v+113, x}, {TW_CEXP, v+114, x}, {TW_CEXP, v+115, x}, \
+	{TW_CEXP, v+116, x}, {TW_CEXP, v+117, x}, {TW_CEXP, v+118, x}, {TW_CEXP, v+119, x}, \
+	{TW_CEXP, v+120, x}, {TW_CEXP, v+121, x}, {TW_CEXP, v+122, x}, {TW_CEXP, v+123, x}, \
+	{TW_CEXP, v+124, x}, {TW_CEXP, v+125, x}, {TW_CEXP, v+126, x}, {TW_CEXP, v+127, x}, \
+	{TW_CEXP, v+128, x}, {TW_CEXP, v+129, x}, {TW_CEXP, v+130, x}, {TW_CEXP, v+131, x}, \
+	{TW_CEXP, v+132, x}, {TW_CEXP, v+133, x}, {TW_CEXP, v+134, x}, {TW_CEXP, v+135, x}, \
+	{TW_CEXP, v+136, x}, {TW_CEXP, v+137, x}, {TW_CEXP, v+138, x}, {TW_CEXP, v+139, x}, \
+	{TW_CEXP, v+140, x}, {TW_CEXP, v+141, x}, {TW_CEXP, v+142, x}, {TW_CEXP, v+143, x}, \
+	{TW_CEXP, v+144, x}, {TW_CEXP, v+145, x}, {TW_CEXP, v+146, x}, {TW_CEXP, v+147, x}, \
+	{TW_CEXP, v+148, x}, {TW_CEXP, v+149, x}, {TW_CEXP, v+150, x}, {TW_CEXP, v+151, x}, \
+	{TW_CEXP, v+152, x}, {TW_CEXP, v+153, x}, {TW_CEXP, v+154, x}, {TW_CEXP, v+155, x}, \
+	{TW_CEXP, v+156, x}, {TW_CEXP, v+157, x}, {TW_CEXP, v+158, x}, {TW_CEXP, v+159, x}, \
+	{TW_CEXP, v+160, x}, {TW_CEXP, v+161, x}, {TW_CEXP, v+162, x}, {TW_CEXP, v+163, x}, \
+	{TW_CEXP, v+164, x}, {TW_CEXP, v+165, x}, {TW_CEXP, v+166, x}, {TW_CEXP, v+167, x}, \
+	{TW_CEXP, v+168, x}, {TW_CEXP, v+169, x}, {TW_CEXP, v+170, x}, {TW_CEXP, v+171, x}, \
+	{TW_CEXP, v+172, x}, {TW_CEXP, v+173, x}, {TW_CEXP, v+174, x}, {TW_CEXP, v+175, x}, \
+	{TW_CEXP, v+176, x}, {TW_CEXP, v+177, x}, {TW_CEXP, v+178, x}, {TW_CEXP, v+179, x}, \
+	{TW_CEXP, v+180, x}, {TW_CEXP, v+181, x}, {TW_CEXP, v+182, x}, {TW_CEXP, v+183, x}, \
+	{TW_CEXP, v+184, x}, {TW_CEXP, v+185, x}, {TW_CEXP, v+186, x}, {TW_CEXP, v+187, x}, \
+	{TW_CEXP, v+188, x}, {TW_CEXP, v+189, x}, {TW_CEXP, v+190, x}, {TW_CEXP, v+191, x}, \
+	{TW_CEXP, v+192, x}, {TW_CEXP, v+193, x}, {TW_CEXP, v+194, x}, {TW_CEXP, v+195, x}, \
+	{TW_CEXP, v+196, x}, {TW_CEXP, v+197, x}, {TW_CEXP, v+198, x}, {TW_CEXP, v+199, x}, \
+	{TW_CEXP, v+200, x}, {TW_CEXP, v+201, x}, {TW_CEXP, v+202, x}, {TW_CEXP, v+203, x}, \
+	{TW_CEXP, v+204, x}, {TW_CEXP, v+205, x}, {TW_CEXP, v+206, x}, {TW_CEXP, v+207, x}, \
+	{TW_CEXP, v+208, x}, {TW_CEXP, v+209, x}, {TW_CEXP, v+210, x}, {TW_CEXP, v+211, x}, \
+	{TW_CEXP, v+212, x}, {TW_CEXP, v+213, x}, {TW_CEXP, v+214, x}, {TW_CEXP, v+215, x}, \
+	{TW_CEXP, v+216, x}, {TW_CEXP, v+217, x}, {TW_CEXP, v+218, x}, {TW_CEXP, v+219, x}, \
+	{TW_CEXP, v+220, x}, {TW_CEXP, v+221, x}, {TW_CEXP, v+222, x}, {TW_CEXP, v+223, x}, \
+	{TW_CEXP, v+224, x}, {TW_CEXP, v+225, x}, {TW_CEXP, v+226, x}, {TW_CEXP, v+227, x}, \
+	{TW_CEXP, v+228, x}, {TW_CEXP, v+229, x}, {TW_CEXP, v+230, x}, {TW_CEXP, v+231, x}, \
+	{TW_CEXP, v+232, x}, {TW_CEXP, v+233, x}, {TW_CEXP, v+234, x}, {TW_CEXP, v+235, x}, \
+	{TW_CEXP, v+236, x}, {TW_CEXP, v+237, x}, {TW_CEXP, v+238, x}, {TW_CEXP, v+239, x}, \
+	{TW_CEXP, v+240, x}, {TW_CEXP, v+241, x}, {TW_CEXP, v+242, x}, {TW_CEXP, v+243, x}, \
+	{TW_CEXP, v+244, x}, {TW_CEXP, v+245, x}, {TW_CEXP, v+246, x}, {TW_CEXP, v+247, x}, \
+	{TW_CEXP, v+248, x}, {TW_CEXP, v+249, x}, {TW_CEXP, v+250, x}, {TW_CEXP, v+251, x}, \
+	{TW_CEXP, v+252, x}, {TW_CEXP, v+253, x}, {TW_CEXP, v+254, x}, {TW_CEXP, v+255, x} 
+#endif // VTW_SIZE == 256
+#endif // REQ_VTW1
+#if defined(REQ_VTW2)
+#if defined(VTW_SIZE) && VTW_SIZE == 1
+#warning "using VTW2 with 1"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_SIN, v+0, -x} 
+#endif // VTW_SIZE == 1
+#if defined(VTW_SIZE) && VTW_SIZE == 2
+#warning "using VTW2 with 2"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_SIN, v+0, -x}, {TW_SIN, v+0, x} 
+#endif // VTW_SIZE == 2
+#if defined(VTW_SIZE) && VTW_SIZE == 4
+#warning "using VTW2 with 4"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
+	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x} 
+#endif // VTW_SIZE == 4
+#if defined(VTW_SIZE) && VTW_SIZE == 8
+#warning "using VTW2 with 8"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
+	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
+	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
+	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x} 
+#endif // VTW_SIZE == 8
+#if defined(VTW_SIZE) && VTW_SIZE == 16
+#warning "using VTW2 with 16"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
+	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
+	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
+	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
+	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
+	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x} 
+#endif // VTW_SIZE == 16
+#if defined(VTW_SIZE) && VTW_SIZE == 32
+#warning "using VTW2 with 32"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
+	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
+	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \
+	{TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \
+	{TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \
+	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
+	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
+	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \
+	{TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \
+	{TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x} 
+#endif // VTW_SIZE == 32
+#if defined(VTW_SIZE) && VTW_SIZE == 64
+#warning "using VTW2 with 64"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
+	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
+	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \
+	{TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \
+	{TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \
+	{TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \
+	{TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \
+	{TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \
+	{TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \
+	{TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \
+	{TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \
+	{TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \
+	{TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \
+	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
+	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
+	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \
+	{TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \
+	{TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \
+	{TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \
+	{TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \
+	{TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \
+	{TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \
+	{TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \
+	{TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \
+	{TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \
+	{TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x} 
+#endif // VTW_SIZE == 64
+#if defined(VTW_SIZE) && VTW_SIZE == 128
+#warning "using VTW2 with 128"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
+	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
+	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \
+	{TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \
+	{TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \
+	{TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \
+	{TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \
+	{TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \
+	{TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \
+	{TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \
+	{TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \
+	{TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \
+	{TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \
+	{TW_COS, v+32, x}, {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+33, x}, \
+	{TW_COS, v+34, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, {TW_COS, v+35, x}, \
+	{TW_COS, v+36, x}, {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+37, x}, \
+	{TW_COS, v+38, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, {TW_COS, v+39, x}, \
+	{TW_COS, v+40, x}, {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+41, x}, \
+	{TW_COS, v+42, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, {TW_COS, v+43, x}, \
+	{TW_COS, v+44, x}, {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+45, x}, \
+	{TW_COS, v+46, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, {TW_COS, v+47, x}, \
+	{TW_COS, v+48, x}, {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+49, x}, \
+	{TW_COS, v+50, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, {TW_COS, v+51, x}, \
+	{TW_COS, v+52, x}, {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+53, x}, \
+	{TW_COS, v+54, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, {TW_COS, v+55, x}, \
+	{TW_COS, v+56, x}, {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+57, x}, \
+	{TW_COS, v+58, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, {TW_COS, v+59, x}, \
+	{TW_COS, v+60, x}, {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+61, x}, \
+	{TW_COS, v+62, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, {TW_COS, v+63, x}, \
+	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
+	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
+	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \
+	{TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \
+	{TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \
+	{TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \
+	{TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \
+	{TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \
+	{TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \
+	{TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \
+	{TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \
+	{TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \
+	{TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x}, \
+	{TW_SIN, v+32, -x}, {TW_SIN, v+32, x}, {TW_SIN, v+33, -x}, {TW_SIN, v+33, x}, \
+	{TW_SIN, v+34, -x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, -x}, {TW_SIN, v+35, x}, \
+	{TW_SIN, v+36, -x}, {TW_SIN, v+36, x}, {TW_SIN, v+37, -x}, {TW_SIN, v+37, x}, \
+	{TW_SIN, v+38, -x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, -x}, {TW_SIN, v+39, x}, \
+	{TW_SIN, v+40, -x}, {TW_SIN, v+40, x}, {TW_SIN, v+41, -x}, {TW_SIN, v+41, x}, \
+	{TW_SIN, v+42, -x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, -x}, {TW_SIN, v+43, x}, \
+	{TW_SIN, v+44, -x}, {TW_SIN, v+44, x}, {TW_SIN, v+45, -x}, {TW_SIN, v+45, x}, \
+	{TW_SIN, v+46, -x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, -x}, {TW_SIN, v+47, x}, \
+	{TW_SIN, v+48, -x}, {TW_SIN, v+48, x}, {TW_SIN, v+49, -x}, {TW_SIN, v+49, x}, \
+	{TW_SIN, v+50, -x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, -x}, {TW_SIN, v+51, x}, \
+	{TW_SIN, v+52, -x}, {TW_SIN, v+52, x}, {TW_SIN, v+53, -x}, {TW_SIN, v+53, x}, \
+	{TW_SIN, v+54, -x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, -x}, {TW_SIN, v+55, x}, \
+	{TW_SIN, v+56, -x}, {TW_SIN, v+56, x}, {TW_SIN, v+57, -x}, {TW_SIN, v+57, x}, \
+	{TW_SIN, v+58, -x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, -x}, {TW_SIN, v+59, x}, \
+	{TW_SIN, v+60, -x}, {TW_SIN, v+60, x}, {TW_SIN, v+61, -x}, {TW_SIN, v+61, x}, \
+	{TW_SIN, v+62, -x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, -x}, {TW_SIN, v+63, x} 
+#endif // VTW_SIZE == 128
+#if defined(VTW_SIZE) && VTW_SIZE == 256
+#warning "using VTW2 with 256"
+#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
+	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
+	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \
+	{TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \
+	{TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \
+	{TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \
+	{TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \
+	{TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \
+	{TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \
+	{TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \
+	{TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \
+	{TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \
+	{TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \
+	{TW_COS, v+32, x}, {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+33, x}, \
+	{TW_COS, v+34, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, {TW_COS, v+35, x}, \
+	{TW_COS, v+36, x}, {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+37, x}, \
+	{TW_COS, v+38, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, {TW_COS, v+39, x}, \
+	{TW_COS, v+40, x}, {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+41, x}, \
+	{TW_COS, v+42, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, {TW_COS, v+43, x}, \
+	{TW_COS, v+44, x}, {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+45, x}, \
+	{TW_COS, v+46, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, {TW_COS, v+47, x}, \
+	{TW_COS, v+48, x}, {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+49, x}, \
+	{TW_COS, v+50, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, {TW_COS, v+51, x}, \
+	{TW_COS, v+52, x}, {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+53, x}, \
+	{TW_COS, v+54, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, {TW_COS, v+55, x}, \
+	{TW_COS, v+56, x}, {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+57, x}, \
+	{TW_COS, v+58, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, {TW_COS, v+59, x}, \
+	{TW_COS, v+60, x}, {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+61, x}, \
+	{TW_COS, v+62, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, {TW_COS, v+63, x}, \
+	{TW_COS, v+64, x}, {TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+65, x}, \
+	{TW_COS, v+66, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, {TW_COS, v+67, x}, \
+	{TW_COS, v+68, x}, {TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+69, x}, \
+	{TW_COS, v+70, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, {TW_COS, v+71, x}, \
+	{TW_COS, v+72, x}, {TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+73, x}, \
+	{TW_COS, v+74, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, {TW_COS, v+75, x}, \
+	{TW_COS, v+76, x}, {TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+77, x}, \
+	{TW_COS, v+78, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, {TW_COS, v+79, x}, \
+	{TW_COS, v+80, x}, {TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+81, x}, \
+	{TW_COS, v+82, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, {TW_COS, v+83, x}, \
+	{TW_COS, v+84, x}, {TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+85, x}, \
+	{TW_COS, v+86, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, {TW_COS, v+87, x}, \
+	{TW_COS, v+88, x}, {TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+89, x}, \
+	{TW_COS, v+90, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, {TW_COS, v+91, x}, \
+	{TW_COS, v+92, x}, {TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+93, x}, \
+	{TW_COS, v+94, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, {TW_COS, v+95, x}, \
+	{TW_COS, v+96, x}, {TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+97, x}, \
+	{TW_COS, v+98, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, {TW_COS, v+99, x}, \
+	{TW_COS, v+100, x}, {TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+101, x}, \
+	{TW_COS, v+102, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, {TW_COS, v+103, x}, \
+	{TW_COS, v+104, x}, {TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+105, x}, \
+	{TW_COS, v+106, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, {TW_COS, v+107, x}, \
+	{TW_COS, v+108, x}, {TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+109, x}, \
+	{TW_COS, v+110, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, {TW_COS, v+111, x}, \
+	{TW_COS, v+112, x}, {TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+113, x}, \
+	{TW_COS, v+114, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, {TW_COS, v+115, x}, \
+	{TW_COS, v+116, x}, {TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+117, x}, \
+	{TW_COS, v+118, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, {TW_COS, v+119, x}, \
+	{TW_COS, v+120, x}, {TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+121, x}, \
+	{TW_COS, v+122, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, {TW_COS, v+123, x}, \
+	{TW_COS, v+124, x}, {TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+125, x}, \
+	{TW_COS, v+126, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, {TW_COS, v+127, x}, \
+	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
+	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
+	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \
+	{TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \
+	{TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \
+	{TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \
+	{TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \
+	{TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \
+	{TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \
+	{TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \
+	{TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \
+	{TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \
+	{TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x}, \
+	{TW_SIN, v+32, -x}, {TW_SIN, v+32, x}, {TW_SIN, v+33, -x}, {TW_SIN, v+33, x}, \
+	{TW_SIN, v+34, -x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, -x}, {TW_SIN, v+35, x}, \
+	{TW_SIN, v+36, -x}, {TW_SIN, v+36, x}, {TW_SIN, v+37, -x}, {TW_SIN, v+37, x}, \
+	{TW_SIN, v+38, -x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, -x}, {TW_SIN, v+39, x}, \
+	{TW_SIN, v+40, -x}, {TW_SIN, v+40, x}, {TW_SIN, v+41, -x}, {TW_SIN, v+41, x}, \
+	{TW_SIN, v+42, -x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, -x}, {TW_SIN, v+43, x}, \
+	{TW_SIN, v+44, -x}, {TW_SIN, v+44, x}, {TW_SIN, v+45, -x}, {TW_SIN, v+45, x}, \
+	{TW_SIN, v+46, -x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, -x}, {TW_SIN, v+47, x}, \
+	{TW_SIN, v+48, -x}, {TW_SIN, v+48, x}, {TW_SIN, v+49, -x}, {TW_SIN, v+49, x}, \
+	{TW_SIN, v+50, -x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, -x}, {TW_SIN, v+51, x}, \
+	{TW_SIN, v+52, -x}, {TW_SIN, v+52, x}, {TW_SIN, v+53, -x}, {TW_SIN, v+53, x}, \
+	{TW_SIN, v+54, -x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, -x}, {TW_SIN, v+55, x}, \
+	{TW_SIN, v+56, -x}, {TW_SIN, v+56, x}, {TW_SIN, v+57, -x}, {TW_SIN, v+57, x}, \
+	{TW_SIN, v+58, -x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, -x}, {TW_SIN, v+59, x}, \
+	{TW_SIN, v+60, -x}, {TW_SIN, v+60, x}, {TW_SIN, v+61, -x}, {TW_SIN, v+61, x}, \
+	{TW_SIN, v+62, -x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, -x}, {TW_SIN, v+63, x}, \
+	{TW_SIN, v+64, -x}, {TW_SIN, v+64, x}, {TW_SIN, v+65, -x}, {TW_SIN, v+65, x}, \
+	{TW_SIN, v+66, -x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, -x}, {TW_SIN, v+67, x}, \
+	{TW_SIN, v+68, -x}, {TW_SIN, v+68, x}, {TW_SIN, v+69, -x}, {TW_SIN, v+69, x}, \
+	{TW_SIN, v+70, -x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, -x}, {TW_SIN, v+71, x}, \
+	{TW_SIN, v+72, -x}, {TW_SIN, v+72, x}, {TW_SIN, v+73, -x}, {TW_SIN, v+73, x}, \
+	{TW_SIN, v+74, -x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, -x}, {TW_SIN, v+75, x}, \
+	{TW_SIN, v+76, -x}, {TW_SIN, v+76, x}, {TW_SIN, v+77, -x}, {TW_SIN, v+77, x}, \
+	{TW_SIN, v+78, -x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, -x}, {TW_SIN, v+79, x}, \
+	{TW_SIN, v+80, -x}, {TW_SIN, v+80, x}, {TW_SIN, v+81, -x}, {TW_SIN, v+81, x}, \
+	{TW_SIN, v+82, -x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, -x}, {TW_SIN, v+83, x}, \
+	{TW_SIN, v+84, -x}, {TW_SIN, v+84, x}, {TW_SIN, v+85, -x}, {TW_SIN, v+85, x}, \
+	{TW_SIN, v+86, -x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, -x}, {TW_SIN, v+87, x}, \
+	{TW_SIN, v+88, -x}, {TW_SIN, v+88, x}, {TW_SIN, v+89, -x}, {TW_SIN, v+89, x}, \
+	{TW_SIN, v+90, -x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, -x}, {TW_SIN, v+91, x}, \
+	{TW_SIN, v+92, -x}, {TW_SIN, v+92, x}, {TW_SIN, v+93, -x}, {TW_SIN, v+93, x}, \
+	{TW_SIN, v+94, -x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, -x}, {TW_SIN, v+95, x}, \
+	{TW_SIN, v+96, -x}, {TW_SIN, v+96, x}, {TW_SIN, v+97, -x}, {TW_SIN, v+97, x}, \
+	{TW_SIN, v+98, -x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, -x}, {TW_SIN, v+99, x}, \
+	{TW_SIN, v+100, -x}, {TW_SIN, v+100, x}, {TW_SIN, v+101, -x}, {TW_SIN, v+101, x}, \
+	{TW_SIN, v+102, -x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, -x}, {TW_SIN, v+103, x}, \
+	{TW_SIN, v+104, -x}, {TW_SIN, v+104, x}, {TW_SIN, v+105, -x}, {TW_SIN, v+105, x}, \
+	{TW_SIN, v+106, -x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, -x}, {TW_SIN, v+107, x}, \
+	{TW_SIN, v+108, -x}, {TW_SIN, v+108, x}, {TW_SIN, v+109, -x}, {TW_SIN, v+109, x}, \
+	{TW_SIN, v+110, -x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, -x}, {TW_SIN, v+111, x}, \
+	{TW_SIN, v+112, -x}, {TW_SIN, v+112, x}, {TW_SIN, v+113, -x}, {TW_SIN, v+113, x}, \
+	{TW_SIN, v+114, -x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, -x}, {TW_SIN, v+115, x}, \
+	{TW_SIN, v+116, -x}, {TW_SIN, v+116, x}, {TW_SIN, v+117, -x}, {TW_SIN, v+117, x}, \
+	{TW_SIN, v+118, -x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, -x}, {TW_SIN, v+119, x}, \
+	{TW_SIN, v+120, -x}, {TW_SIN, v+120, x}, {TW_SIN, v+121, -x}, {TW_SIN, v+121, x}, \
+	{TW_SIN, v+122, -x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, -x}, {TW_SIN, v+123, x}, \
+	{TW_SIN, v+124, -x}, {TW_SIN, v+124, x}, {TW_SIN, v+125, -x}, {TW_SIN, v+125, x}, \
+	{TW_SIN, v+126, -x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, -x}, {TW_SIN, v+127, x} 
+#endif // VTW_SIZE == 256
+#endif // REQ_VTW2
+#if defined(REQ_VTWS)
+#if defined(VTW_SIZE) && VTW_SIZE == 1
+#warning "using VTWS with 1"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_SIN, v+0, x} 
+#endif // VTW_SIZE == 1
+#if defined(VTW_SIZE) && VTW_SIZE == 2
+#warning "using VTWS with 2"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, x} 
+#endif // VTW_SIZE == 2
+#if defined(VTW_SIZE) && VTW_SIZE == 4
+#warning "using VTWS with 4"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x} 
+#endif // VTW_SIZE == 4
+#if defined(VTW_SIZE) && VTW_SIZE == 8
+#warning "using VTWS with 8"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
+	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x} 
+#endif // VTW_SIZE == 8
+#if defined(VTW_SIZE) && VTW_SIZE == 16
+#warning "using VTWS with 16"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
+	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x} 
+#endif // VTW_SIZE == 16
+#if defined(VTW_SIZE) && VTW_SIZE == 32
+#warning "using VTWS with 32"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
+	{TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \
+	{TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \
+	{TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \
+	{TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \
+	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \
+	{TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \
+	{TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \
+	{TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \
+	{TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x} 
+#endif // VTW_SIZE == 32
+#if defined(VTW_SIZE) && VTW_SIZE == 64
+#warning "using VTWS with 64"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
+	{TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \
+	{TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \
+	{TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \
+	{TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \
+	{TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \
+	{TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \
+	{TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \
+	{TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \
+	{TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \
+	{TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \
+	{TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \
+	{TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \
+	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \
+	{TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \
+	{TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \
+	{TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \
+	{TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \
+	{TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \
+	{TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \
+	{TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \
+	{TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \
+	{TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \
+	{TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \
+	{TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \
+	{TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x} 
+#endif // VTW_SIZE == 64
+#if defined(VTW_SIZE) && VTW_SIZE == 128
+#warning "using VTWS with 128"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
+	{TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \
+	{TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \
+	{TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \
+	{TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \
+	{TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \
+	{TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \
+	{TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \
+	{TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \
+	{TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \
+	{TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \
+	{TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \
+	{TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \
+	{TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, \
+	{TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, \
+	{TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, \
+	{TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, \
+	{TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, \
+	{TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, \
+	{TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, \
+	{TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, \
+	{TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, \
+	{TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, \
+	{TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, \
+	{TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, \
+	{TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, \
+	{TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, \
+	{TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, \
+	{TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, \
+	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \
+	{TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \
+	{TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \
+	{TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \
+	{TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \
+	{TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \
+	{TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \
+	{TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \
+	{TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \
+	{TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \
+	{TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \
+	{TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \
+	{TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x}, \
+	{TW_SIN, v+64, x}, {TW_SIN, v+65, x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, x}, \
+	{TW_SIN, v+68, x}, {TW_SIN, v+69, x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, x}, \
+	{TW_SIN, v+72, x}, {TW_SIN, v+73, x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, x}, \
+	{TW_SIN, v+76, x}, {TW_SIN, v+77, x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, x}, \
+	{TW_SIN, v+80, x}, {TW_SIN, v+81, x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, x}, \
+	{TW_SIN, v+84, x}, {TW_SIN, v+85, x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, x}, \
+	{TW_SIN, v+88, x}, {TW_SIN, v+89, x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, x}, \
+	{TW_SIN, v+92, x}, {TW_SIN, v+93, x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, x}, \
+	{TW_SIN, v+96, x}, {TW_SIN, v+97, x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, x}, \
+	{TW_SIN, v+100, x}, {TW_SIN, v+101, x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, x}, \
+	{TW_SIN, v+104, x}, {TW_SIN, v+105, x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, x}, \
+	{TW_SIN, v+108, x}, {TW_SIN, v+109, x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, x}, \
+	{TW_SIN, v+112, x}, {TW_SIN, v+113, x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, x}, \
+	{TW_SIN, v+116, x}, {TW_SIN, v+117, x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, x}, \
+	{TW_SIN, v+120, x}, {TW_SIN, v+121, x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, x}, \
+	{TW_SIN, v+124, x}, {TW_SIN, v+125, x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, x} 
+#endif // VTW_SIZE == 128
+#if defined(VTW_SIZE) && VTW_SIZE == 256
+#warning "using VTWS with 256"
+#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
+	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
+	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
+	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
+	{TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \
+	{TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \
+	{TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \
+	{TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \
+	{TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \
+	{TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \
+	{TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \
+	{TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \
+	{TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \
+	{TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \
+	{TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \
+	{TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \
+	{TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, \
+	{TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, \
+	{TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, \
+	{TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, \
+	{TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, \
+	{TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, \
+	{TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, \
+	{TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, \
+	{TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, \
+	{TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, \
+	{TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, \
+	{TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, \
+	{TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, \
+	{TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, \
+	{TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, \
+	{TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, \
+	{TW_COS, v+128, x}, {TW_COS, v+129, x}, {TW_COS, v+130, x}, {TW_COS, v+131, x}, \
+	{TW_COS, v+132, x}, {TW_COS, v+133, x}, {TW_COS, v+134, x}, {TW_COS, v+135, x}, \
+	{TW_COS, v+136, x}, {TW_COS, v+137, x}, {TW_COS, v+138, x}, {TW_COS, v+139, x}, \
+	{TW_COS, v+140, x}, {TW_COS, v+141, x}, {TW_COS, v+142, x}, {TW_COS, v+143, x}, \
+	{TW_COS, v+144, x}, {TW_COS, v+145, x}, {TW_COS, v+146, x}, {TW_COS, v+147, x}, \
+	{TW_COS, v+148, x}, {TW_COS, v+149, x}, {TW_COS, v+150, x}, {TW_COS, v+151, x}, \
+	{TW_COS, v+152, x}, {TW_COS, v+153, x}, {TW_COS, v+154, x}, {TW_COS, v+155, x}, \
+	{TW_COS, v+156, x}, {TW_COS, v+157, x}, {TW_COS, v+158, x}, {TW_COS, v+159, x}, \
+	{TW_COS, v+160, x}, {TW_COS, v+161, x}, {TW_COS, v+162, x}, {TW_COS, v+163, x}, \
+	{TW_COS, v+164, x}, {TW_COS, v+165, x}, {TW_COS, v+166, x}, {TW_COS, v+167, x}, \
+	{TW_COS, v+168, x}, {TW_COS, v+169, x}, {TW_COS, v+170, x}, {TW_COS, v+171, x}, \
+	{TW_COS, v+172, x}, {TW_COS, v+173, x}, {TW_COS, v+174, x}, {TW_COS, v+175, x}, \
+	{TW_COS, v+176, x}, {TW_COS, v+177, x}, {TW_COS, v+178, x}, {TW_COS, v+179, x}, \
+	{TW_COS, v+180, x}, {TW_COS, v+181, x}, {TW_COS, v+182, x}, {TW_COS, v+183, x}, \
+	{TW_COS, v+184, x}, {TW_COS, v+185, x}, {TW_COS, v+186, x}, {TW_COS, v+187, x}, \
+	{TW_COS, v+188, x}, {TW_COS, v+189, x}, {TW_COS, v+190, x}, {TW_COS, v+191, x}, \
+	{TW_COS, v+192, x}, {TW_COS, v+193, x}, {TW_COS, v+194, x}, {TW_COS, v+195, x}, \
+	{TW_COS, v+196, x}, {TW_COS, v+197, x}, {TW_COS, v+198, x}, {TW_COS, v+199, x}, \
+	{TW_COS, v+200, x}, {TW_COS, v+201, x}, {TW_COS, v+202, x}, {TW_COS, v+203, x}, \
+	{TW_COS, v+204, x}, {TW_COS, v+205, x}, {TW_COS, v+206, x}, {TW_COS, v+207, x}, \
+	{TW_COS, v+208, x}, {TW_COS, v+209, x}, {TW_COS, v+210, x}, {TW_COS, v+211, x}, \
+	{TW_COS, v+212, x}, {TW_COS, v+213, x}, {TW_COS, v+214, x}, {TW_COS, v+215, x}, \
+	{TW_COS, v+216, x}, {TW_COS, v+217, x}, {TW_COS, v+218, x}, {TW_COS, v+219, x}, \
+	{TW_COS, v+220, x}, {TW_COS, v+221, x}, {TW_COS, v+222, x}, {TW_COS, v+223, x}, \
+	{TW_COS, v+224, x}, {TW_COS, v+225, x}, {TW_COS, v+226, x}, {TW_COS, v+227, x}, \
+	{TW_COS, v+228, x}, {TW_COS, v+229, x}, {TW_COS, v+230, x}, {TW_COS, v+231, x}, \
+	{TW_COS, v+232, x}, {TW_COS, v+233, x}, {TW_COS, v+234, x}, {TW_COS, v+235, x}, \
+	{TW_COS, v+236, x}, {TW_COS, v+237, x}, {TW_COS, v+238, x}, {TW_COS, v+239, x}, \
+	{TW_COS, v+240, x}, {TW_COS, v+241, x}, {TW_COS, v+242, x}, {TW_COS, v+243, x}, \
+	{TW_COS, v+244, x}, {TW_COS, v+245, x}, {TW_COS, v+246, x}, {TW_COS, v+247, x}, \
+	{TW_COS, v+248, x}, {TW_COS, v+249, x}, {TW_COS, v+250, x}, {TW_COS, v+251, x}, \
+	{TW_COS, v+252, x}, {TW_COS, v+253, x}, {TW_COS, v+254, x}, {TW_COS, v+255, x}, \
+	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
+	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
+	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
+	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \
+	{TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \
+	{TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \
+	{TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \
+	{TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \
+	{TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \
+	{TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \
+	{TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \
+	{TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \
+	{TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \
+	{TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \
+	{TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \
+	{TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x}, \
+	{TW_SIN, v+64, x}, {TW_SIN, v+65, x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, x}, \
+	{TW_SIN, v+68, x}, {TW_SIN, v+69, x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, x}, \
+	{TW_SIN, v+72, x}, {TW_SIN, v+73, x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, x}, \
+	{TW_SIN, v+76, x}, {TW_SIN, v+77, x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, x}, \
+	{TW_SIN, v+80, x}, {TW_SIN, v+81, x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, x}, \
+	{TW_SIN, v+84, x}, {TW_SIN, v+85, x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, x}, \
+	{TW_SIN, v+88, x}, {TW_SIN, v+89, x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, x}, \
+	{TW_SIN, v+92, x}, {TW_SIN, v+93, x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, x}, \
+	{TW_SIN, v+96, x}, {TW_SIN, v+97, x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, x}, \
+	{TW_SIN, v+100, x}, {TW_SIN, v+101, x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, x}, \
+	{TW_SIN, v+104, x}, {TW_SIN, v+105, x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, x}, \
+	{TW_SIN, v+108, x}, {TW_SIN, v+109, x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, x}, \
+	{TW_SIN, v+112, x}, {TW_SIN, v+113, x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, x}, \
+	{TW_SIN, v+116, x}, {TW_SIN, v+117, x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, x}, \
+	{TW_SIN, v+120, x}, {TW_SIN, v+121, x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, x}, \
+	{TW_SIN, v+124, x}, {TW_SIN, v+125, x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, x}, \
+	{TW_SIN, v+128, x}, {TW_SIN, v+129, x}, {TW_SIN, v+130, x}, {TW_SIN, v+131, x}, \
+	{TW_SIN, v+132, x}, {TW_SIN, v+133, x}, {TW_SIN, v+134, x}, {TW_SIN, v+135, x}, \
+	{TW_SIN, v+136, x}, {TW_SIN, v+137, x}, {TW_SIN, v+138, x}, {TW_SIN, v+139, x}, \
+	{TW_SIN, v+140, x}, {TW_SIN, v+141, x}, {TW_SIN, v+142, x}, {TW_SIN, v+143, x}, \
+	{TW_SIN, v+144, x}, {TW_SIN, v+145, x}, {TW_SIN, v+146, x}, {TW_SIN, v+147, x}, \
+	{TW_SIN, v+148, x}, {TW_SIN, v+149, x}, {TW_SIN, v+150, x}, {TW_SIN, v+151, x}, \
+	{TW_SIN, v+152, x}, {TW_SIN, v+153, x}, {TW_SIN, v+154, x}, {TW_SIN, v+155, x}, \
+	{TW_SIN, v+156, x}, {TW_SIN, v+157, x}, {TW_SIN, v+158, x}, {TW_SIN, v+159, x}, \
+	{TW_SIN, v+160, x}, {TW_SIN, v+161, x}, {TW_SIN, v+162, x}, {TW_SIN, v+163, x}, \
+	{TW_SIN, v+164, x}, {TW_SIN, v+165, x}, {TW_SIN, v+166, x}, {TW_SIN, v+167, x}, \
+	{TW_SIN, v+168, x}, {TW_SIN, v+169, x}, {TW_SIN, v+170, x}, {TW_SIN, v+171, x}, \
+	{TW_SIN, v+172, x}, {TW_SIN, v+173, x}, {TW_SIN, v+174, x}, {TW_SIN, v+175, x}, \
+	{TW_SIN, v+176, x}, {TW_SIN, v+177, x}, {TW_SIN, v+178, x}, {TW_SIN, v+179, x}, \
+	{TW_SIN, v+180, x}, {TW_SIN, v+181, x}, {TW_SIN, v+182, x}, {TW_SIN, v+183, x}, \
+	{TW_SIN, v+184, x}, {TW_SIN, v+185, x}, {TW_SIN, v+186, x}, {TW_SIN, v+187, x}, \
+	{TW_SIN, v+188, x}, {TW_SIN, v+189, x}, {TW_SIN, v+190, x}, {TW_SIN, v+191, x}, \
+	{TW_SIN, v+192, x}, {TW_SIN, v+193, x}, {TW_SIN, v+194, x}, {TW_SIN, v+195, x}, \
+	{TW_SIN, v+196, x}, {TW_SIN, v+197, x}, {TW_SIN, v+198, x}, {TW_SIN, v+199, x}, \
+	{TW_SIN, v+200, x}, {TW_SIN, v+201, x}, {TW_SIN, v+202, x}, {TW_SIN, v+203, x}, \
+	{TW_SIN, v+204, x}, {TW_SIN, v+205, x}, {TW_SIN, v+206, x}, {TW_SIN, v+207, x}, \
+	{TW_SIN, v+208, x}, {TW_SIN, v+209, x}, {TW_SIN, v+210, x}, {TW_SIN, v+211, x}, \
+	{TW_SIN, v+212, x}, {TW_SIN, v+213, x}, {TW_SIN, v+214, x}, {TW_SIN, v+215, x}, \
+	{TW_SIN, v+216, x}, {TW_SIN, v+217, x}, {TW_SIN, v+218, x}, {TW_SIN, v+219, x}, \
+	{TW_SIN, v+220, x}, {TW_SIN, v+221, x}, {TW_SIN, v+222, x}, {TW_SIN, v+223, x}, \
+	{TW_SIN, v+224, x}, {TW_SIN, v+225, x}, {TW_SIN, v+226, x}, {TW_SIN, v+227, x}, \
+	{TW_SIN, v+228, x}, {TW_SIN, v+229, x}, {TW_SIN, v+230, x}, {TW_SIN, v+231, x}, \
+	{TW_SIN, v+232, x}, {TW_SIN, v+233, x}, {TW_SIN, v+234, x}, {TW_SIN, v+235, x}, \
+	{TW_SIN, v+236, x}, {TW_SIN, v+237, x}, {TW_SIN, v+238, x}, {TW_SIN, v+239, x}, \
+	{TW_SIN, v+240, x}, {TW_SIN, v+241, x}, {TW_SIN, v+242, x}, {TW_SIN, v+243, x}, \
+	{TW_SIN, v+244, x}, {TW_SIN, v+245, x}, {TW_SIN, v+246, x}, {TW_SIN, v+247, x}, \
+	{TW_SIN, v+248, x}, {TW_SIN, v+249, x}, {TW_SIN, v+250, x}, {TW_SIN, v+251, x}, \
+	{TW_SIN, v+252, x}, {TW_SIN, v+253, x}, {TW_SIN, v+254, x}, {TW_SIN, v+255, x} 
+#endif // VTW_SIZE == 256
+#endif // REQ_VTWS

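A note on the generated listing above: the VTWS(v,x) bodies lay out VTW_SIZE cosine descriptors followed by VTW_SIZE sine descriptors (a planar split), while the VTW2(v,x) bodies duplicate each cosine index and alternate the sine arguments between -x and +x, matching an interleaved real/imaginary vector layout. Below is a minimal sketch, not the shipped generate_vtw.c, of loops that reproduce those bodies for the even sizes listed (VTW_SIZE >= 2); only the TW_COS/TW_SIN names and the {op, v+i, x} triple shape come from the listing, the rest is illustrative.

    #include <stdio.h>

    /* Sketch of the emission loops behind the generated bodies above;
     * not the shipped generate_vtw.c. */
    static void emit_vtw2(unsigned n)   /* interleaved family, n >= 2 */
    {
        unsigned i;
        printf("#define VTW2(v,x) ");
        /* each cosine index appears twice: one interleaved complex element */
        for (i = 0; i < n / 2; i++)
            printf("{TW_COS, v+%u, x}, {TW_COS, v+%u, x}, ", i, i);
        /* sine entries: -x for the real slot, +x for the imaginary slot */
        for (i = 0; i < n / 2; i++)
            printf("{TW_SIN, v+%u, -x}, {TW_SIN, v+%u, x}%s", i, i,
                   i + 1 == n / 2 ? "\n" : ", ");
    }

    static void emit_vtws(unsigned n)   /* planar family: cosines, then sines */
    {
        unsigned i;
        printf("#define VTWS(v,x) ");
        for (i = 0; i < n; i++)
            printf("{TW_COS, v+%u, x}, ", i);
        for (i = 0; i < n; i++)
            printf("{TW_SIN, v+%u, x}%s", i, i + 1 == n ? "\n" : ", ");
    }

    int main(void)
    {
        emit_vtw2(16);  /* matches the VTW_SIZE == 16 body, modulo line breaks */
        emit_vtws(16);
        return 0;
    }

Expanding every size at generation time keeps vtw.h free of preprocessor arithmetic; each codelet build then selects exactly one body through VTW_SIZE.
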
From bfe9886c1b632ca7a25361627c364bf9b7f23983 Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet <gilles@rist.or.jp>
Date: Sun, 26 Jul 2020 00:02:19 +0900
Subject: [PATCH 02/13] simd-support/generate_vtw: fix include files for OSX

---
 simd-support/generate_vtw.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/simd-support/generate_vtw.c b/simd-support/generate_vtw.c
index 505a5804c..9b70f7f16 100644
--- a/simd-support/generate_vtw.c
+++ b/simd-support/generate_vtw.c
@@ -1,7 +1,12 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#ifdef HAVE_MALLOC_H
 #include <malloc.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
 
 unsigned int rp2(unsigned int size) {
   size = size | (size >> 1);

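The hunk above stops inside rp2(), showing only its first OR-shift. That line opens the classic bit-smearing idiom for rounding an unsigned value up to a power of two; the completion below is an assumption about the unshown remainder of the function (only `size = size | (size >> 1);` is visible in the diff), included so the visible fragment has context.

    #include <assert.h>

    /* Hypothetical completion of rp2(); only the first shift is visible
     * in the hunk above, the rest is the standard 32-bit idiom. */
    static unsigned int rp2_sketch(unsigned int size)
    {
        size |= size >> 1;    /* smear the highest set bit downward ... */
        size |= size >> 2;
        size |= size >> 4;
        size |= size >> 8;
        size |= size >> 16;   /* ... until every lower bit is set */
        return size + 1;      /* 2^k - 1  ->  2^k */
    }

    int main(void)
    {
        assert(rp2_sketch(5) == 8);    /* 0b101 -> 0b111 -> 8 */
        assert(rp2_sketch(12) == 16);  /* 0b1100 -> 0b1111 -> 16 */
        return 0;
    }
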
From 867eaec14b15e06ef50b35e2e79f6be3dab00e04 Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet <gilles@rist.or.jp>
Date: Sun, 26 Jul 2020 00:03:16 +0900
Subject: [PATCH 03/13] automatically generate simd-support/vtw.h in
 bootstrap.sh

and add simd-support/{generate_vtw.sh,vtw.h} to the dist tarball
---
 simd-support/Makefile.am |  18 +-
 simd-support/vtw.h       | 729 ---------------------------------------
 2 files changed, 17 insertions(+), 730 deletions(-)
 delete mode 100644 simd-support/vtw.h

diff --git a/simd-support/Makefile.am b/simd-support/Makefile.am
index 60b705377..e624cc3c2 100644
--- a/simd-support/Makefile.am
+++ b/simd-support/Makefile.am
@@ -1,6 +1,21 @@
 AM_CPPFLAGS = -I $(top_srcdir)
 noinst_LTLIBRARIES = libsimd_support.la 
 
+noinst_PROGRAMS =
+
+if MAINTAINER_MODE
+noinst_PROGRAMS += generate_vtw
+
+vtw.h: generate_vtw
+	$(top_srcdir)/simd-support/generate_vtw.sh > vtw.h
+
+generate_vtw_SOURCES = generate_vtw.c
+
+sve.c: vtw.h
+endif
+
+libsimd_support_la: vtw.h
+
 libsimd_support_la_SOURCES = taint.c simd-common.h \
 x86-cpuid.h amd64-cpuid.h \
 simd-sse2.h sse2.c \
@@ -12,5 +27,6 @@ kcvi.c simd-kcvi.h \
 altivec.c simd-altivec.h vsx.c simd-vsx.h \
 neon.c simd-neon.h \
 simd-generic128.h simd-generic256.h \
-sve.c simd-maskedsve.h simd-maskedsve128.h simd-maskedsve256.h simd-maskedsve512.h simd-maskedsve1024.h simd-maskedsve2048.h
+sve.c simd-maskedsve.h simd-maskedsve128.h simd-maskedsve256.h simd-maskedsve512.h simd-maskedsve1024.h simd-maskedsve2048.h vtw.h
 
+EXTRA_DIST = generate_vtw.sh
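
These rules regenerate vtw.h only in maintainer mode; a release tarball ships the pre-generated header (listed in libsimd_support_la_SOURCES) alongside generate_vtw.sh (via EXTRA_DIST), so ordinary builds never run the generator. The `sve.c: vtw.h` dependency reflects that the SVE translation unit ultimately includes the generated macros. A hypothetical consumer is sketched below; the stand-in struct and TW_* values are assumptions, and only the REQ_VTWS/VTW_SIZE guard names come from the header itself.

    /* Illustrative consumer of the generated header; the stand-in
     * struct and TW_* values are assumptions, not FFTW's definitions. */
    #define TW_COS 0
    #define TW_SIN 1

    struct tw_demo { int op; int idx; int arg; };

    #define REQ_VTWS        /* request the planar cos/sin family ... */
    #define VTW_SIZE 4      /* ... sized for a 4-wide vector */
    #include "vtw.h"        /* defines exactly one VTWS(v,x) body */

    /* VTWS(0, 1) expands to 8 initializers: 4 cosines, then 4 sines */
    static const struct tw_demo demo[] = { VTWS(0, 1) };

    int main(void) { return demo[0].op; }   /* TW_COS, i.e. 0 */
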
diff --git a/simd-support/vtw.h b/simd-support/vtw.h
deleted file mode 100644
index 0c31a32b1..000000000
--- a/simd-support/vtw.h
+++ /dev/null
@@ -1,729 +0,0 @@
-/* auto-generated */
-#if defined(REQ_VTW1)
-#if defined(VTW_SIZE) && VTW_SIZE == 1
-#warning "using VTW1 with 1"
-#define VTW1(v,x) {TW_CEXP, v+0, x} 
-#endif // VTW_SIZE == 1
-#if defined(VTW_SIZE) && VTW_SIZE == 2
-#warning "using VTW1 with 2"
-#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x} 
-#endif // VTW_SIZE == 2
-#if defined(VTW_SIZE) && VTW_SIZE == 4
-#warning "using VTW1 with 4"
-#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x} 
-#endif // VTW_SIZE == 4
-#if defined(VTW_SIZE) && VTW_SIZE == 8
-#warning "using VTW1 with 8"
-#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
-	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x} 
-#endif // VTW_SIZE == 8
-#if defined(VTW_SIZE) && VTW_SIZE == 16
-#warning "using VTW1 with 16"
-#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
-	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
-	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
-	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x} 
-#endif // VTW_SIZE == 16
-#if defined(VTW_SIZE) && VTW_SIZE == 32
-#warning "using VTW1 with 32"
-#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
-	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
-	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
-	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \
-	{TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \
-	{TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \
-	{TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \
-	{TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x} 
-#endif // VTW_SIZE == 32
-#if defined(VTW_SIZE) && VTW_SIZE == 64
-#warning "using VTW1 with 64"
-#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
-	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
-	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
-	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \
-	{TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \
-	{TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \
-	{TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \
-	{TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \
-	{TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \
-	{TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \
-	{TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \
-	{TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \
-	{TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \
-	{TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \
-	{TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \
-	{TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x} 
-#endif // VTW_SIZE == 64
-#if defined(VTW_SIZE) && VTW_SIZE == 128
-#warning "using VTW1 with 128"
-#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
-	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
-	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
-	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \
-	{TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \
-	{TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \
-	{TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \
-	{TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \
-	{TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \
-	{TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \
-	{TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \
-	{TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \
-	{TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \
-	{TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \
-	{TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \
-	{TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x}, \
-	{TW_CEXP, v+64, x}, {TW_CEXP, v+65, x}, {TW_CEXP, v+66, x}, {TW_CEXP, v+67, x}, \
-	{TW_CEXP, v+68, x}, {TW_CEXP, v+69, x}, {TW_CEXP, v+70, x}, {TW_CEXP, v+71, x}, \
-	{TW_CEXP, v+72, x}, {TW_CEXP, v+73, x}, {TW_CEXP, v+74, x}, {TW_CEXP, v+75, x}, \
-	{TW_CEXP, v+76, x}, {TW_CEXP, v+77, x}, {TW_CEXP, v+78, x}, {TW_CEXP, v+79, x}, \
-	{TW_CEXP, v+80, x}, {TW_CEXP, v+81, x}, {TW_CEXP, v+82, x}, {TW_CEXP, v+83, x}, \
-	{TW_CEXP, v+84, x}, {TW_CEXP, v+85, x}, {TW_CEXP, v+86, x}, {TW_CEXP, v+87, x}, \
-	{TW_CEXP, v+88, x}, {TW_CEXP, v+89, x}, {TW_CEXP, v+90, x}, {TW_CEXP, v+91, x}, \
-	{TW_CEXP, v+92, x}, {TW_CEXP, v+93, x}, {TW_CEXP, v+94, x}, {TW_CEXP, v+95, x}, \
-	{TW_CEXP, v+96, x}, {TW_CEXP, v+97, x}, {TW_CEXP, v+98, x}, {TW_CEXP, v+99, x}, \
-	{TW_CEXP, v+100, x}, {TW_CEXP, v+101, x}, {TW_CEXP, v+102, x}, {TW_CEXP, v+103, x}, \
-	{TW_CEXP, v+104, x}, {TW_CEXP, v+105, x}, {TW_CEXP, v+106, x}, {TW_CEXP, v+107, x}, \
-	{TW_CEXP, v+108, x}, {TW_CEXP, v+109, x}, {TW_CEXP, v+110, x}, {TW_CEXP, v+111, x}, \
-	{TW_CEXP, v+112, x}, {TW_CEXP, v+113, x}, {TW_CEXP, v+114, x}, {TW_CEXP, v+115, x}, \
-	{TW_CEXP, v+116, x}, {TW_CEXP, v+117, x}, {TW_CEXP, v+118, x}, {TW_CEXP, v+119, x}, \
-	{TW_CEXP, v+120, x}, {TW_CEXP, v+121, x}, {TW_CEXP, v+122, x}, {TW_CEXP, v+123, x}, \
-	{TW_CEXP, v+124, x}, {TW_CEXP, v+125, x}, {TW_CEXP, v+126, x}, {TW_CEXP, v+127, x} 
-#endif // VTW_SIZE == 128
-#if defined(VTW_SIZE) && VTW_SIZE == 256
-#warning "using VTW1 with 256"
-#define VTW1(v,x) {TW_CEXP, v+0, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, \
-	{TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}, \
-	{TW_CEXP, v+8, x}, {TW_CEXP, v+9, x}, {TW_CEXP, v+10, x}, {TW_CEXP, v+11, x}, \
-	{TW_CEXP, v+12, x}, {TW_CEXP, v+13, x}, {TW_CEXP, v+14, x}, {TW_CEXP, v+15, x}, \
-	{TW_CEXP, v+16, x}, {TW_CEXP, v+17, x}, {TW_CEXP, v+18, x}, {TW_CEXP, v+19, x}, \
-	{TW_CEXP, v+20, x}, {TW_CEXP, v+21, x}, {TW_CEXP, v+22, x}, {TW_CEXP, v+23, x}, \
-	{TW_CEXP, v+24, x}, {TW_CEXP, v+25, x}, {TW_CEXP, v+26, x}, {TW_CEXP, v+27, x}, \
-	{TW_CEXP, v+28, x}, {TW_CEXP, v+29, x}, {TW_CEXP, v+30, x}, {TW_CEXP, v+31, x}, \
-	{TW_CEXP, v+32, x}, {TW_CEXP, v+33, x}, {TW_CEXP, v+34, x}, {TW_CEXP, v+35, x}, \
-	{TW_CEXP, v+36, x}, {TW_CEXP, v+37, x}, {TW_CEXP, v+38, x}, {TW_CEXP, v+39, x}, \
-	{TW_CEXP, v+40, x}, {TW_CEXP, v+41, x}, {TW_CEXP, v+42, x}, {TW_CEXP, v+43, x}, \
-	{TW_CEXP, v+44, x}, {TW_CEXP, v+45, x}, {TW_CEXP, v+46, x}, {TW_CEXP, v+47, x}, \
-	{TW_CEXP, v+48, x}, {TW_CEXP, v+49, x}, {TW_CEXP, v+50, x}, {TW_CEXP, v+51, x}, \
-	{TW_CEXP, v+52, x}, {TW_CEXP, v+53, x}, {TW_CEXP, v+54, x}, {TW_CEXP, v+55, x}, \
-	{TW_CEXP, v+56, x}, {TW_CEXP, v+57, x}, {TW_CEXP, v+58, x}, {TW_CEXP, v+59, x}, \
-	{TW_CEXP, v+60, x}, {TW_CEXP, v+61, x}, {TW_CEXP, v+62, x}, {TW_CEXP, v+63, x}, \
-	{TW_CEXP, v+64, x}, {TW_CEXP, v+65, x}, {TW_CEXP, v+66, x}, {TW_CEXP, v+67, x}, \
-	{TW_CEXP, v+68, x}, {TW_CEXP, v+69, x}, {TW_CEXP, v+70, x}, {TW_CEXP, v+71, x}, \
-	{TW_CEXP, v+72, x}, {TW_CEXP, v+73, x}, {TW_CEXP, v+74, x}, {TW_CEXP, v+75, x}, \
-	{TW_CEXP, v+76, x}, {TW_CEXP, v+77, x}, {TW_CEXP, v+78, x}, {TW_CEXP, v+79, x}, \
-	{TW_CEXP, v+80, x}, {TW_CEXP, v+81, x}, {TW_CEXP, v+82, x}, {TW_CEXP, v+83, x}, \
-	{TW_CEXP, v+84, x}, {TW_CEXP, v+85, x}, {TW_CEXP, v+86, x}, {TW_CEXP, v+87, x}, \
-	{TW_CEXP, v+88, x}, {TW_CEXP, v+89, x}, {TW_CEXP, v+90, x}, {TW_CEXP, v+91, x}, \
-	{TW_CEXP, v+92, x}, {TW_CEXP, v+93, x}, {TW_CEXP, v+94, x}, {TW_CEXP, v+95, x}, \
-	{TW_CEXP, v+96, x}, {TW_CEXP, v+97, x}, {TW_CEXP, v+98, x}, {TW_CEXP, v+99, x}, \
-	{TW_CEXP, v+100, x}, {TW_CEXP, v+101, x}, {TW_CEXP, v+102, x}, {TW_CEXP, v+103, x}, \
-	{TW_CEXP, v+104, x}, {TW_CEXP, v+105, x}, {TW_CEXP, v+106, x}, {TW_CEXP, v+107, x}, \
-	{TW_CEXP, v+108, x}, {TW_CEXP, v+109, x}, {TW_CEXP, v+110, x}, {TW_CEXP, v+111, x}, \
-	{TW_CEXP, v+112, x}, {TW_CEXP, v+113, x}, {TW_CEXP, v+114, x}, {TW_CEXP, v+115, x}, \
-	{TW_CEXP, v+116, x}, {TW_CEXP, v+117, x}, {TW_CEXP, v+118, x}, {TW_CEXP, v+119, x}, \
-	{TW_CEXP, v+120, x}, {TW_CEXP, v+121, x}, {TW_CEXP, v+122, x}, {TW_CEXP, v+123, x}, \
-	{TW_CEXP, v+124, x}, {TW_CEXP, v+125, x}, {TW_CEXP, v+126, x}, {TW_CEXP, v+127, x}, \
-	{TW_CEXP, v+128, x}, {TW_CEXP, v+129, x}, {TW_CEXP, v+130, x}, {TW_CEXP, v+131, x}, \
-	{TW_CEXP, v+132, x}, {TW_CEXP, v+133, x}, {TW_CEXP, v+134, x}, {TW_CEXP, v+135, x}, \
-	{TW_CEXP, v+136, x}, {TW_CEXP, v+137, x}, {TW_CEXP, v+138, x}, {TW_CEXP, v+139, x}, \
-	{TW_CEXP, v+140, x}, {TW_CEXP, v+141, x}, {TW_CEXP, v+142, x}, {TW_CEXP, v+143, x}, \
-	{TW_CEXP, v+144, x}, {TW_CEXP, v+145, x}, {TW_CEXP, v+146, x}, {TW_CEXP, v+147, x}, \
-	{TW_CEXP, v+148, x}, {TW_CEXP, v+149, x}, {TW_CEXP, v+150, x}, {TW_CEXP, v+151, x}, \
-	{TW_CEXP, v+152, x}, {TW_CEXP, v+153, x}, {TW_CEXP, v+154, x}, {TW_CEXP, v+155, x}, \
-	{TW_CEXP, v+156, x}, {TW_CEXP, v+157, x}, {TW_CEXP, v+158, x}, {TW_CEXP, v+159, x}, \
-	{TW_CEXP, v+160, x}, {TW_CEXP, v+161, x}, {TW_CEXP, v+162, x}, {TW_CEXP, v+163, x}, \
-	{TW_CEXP, v+164, x}, {TW_CEXP, v+165, x}, {TW_CEXP, v+166, x}, {TW_CEXP, v+167, x}, \
-	{TW_CEXP, v+168, x}, {TW_CEXP, v+169, x}, {TW_CEXP, v+170, x}, {TW_CEXP, v+171, x}, \
-	{TW_CEXP, v+172, x}, {TW_CEXP, v+173, x}, {TW_CEXP, v+174, x}, {TW_CEXP, v+175, x}, \
-	{TW_CEXP, v+176, x}, {TW_CEXP, v+177, x}, {TW_CEXP, v+178, x}, {TW_CEXP, v+179, x}, \
-	{TW_CEXP, v+180, x}, {TW_CEXP, v+181, x}, {TW_CEXP, v+182, x}, {TW_CEXP, v+183, x}, \
-	{TW_CEXP, v+184, x}, {TW_CEXP, v+185, x}, {TW_CEXP, v+186, x}, {TW_CEXP, v+187, x}, \
-	{TW_CEXP, v+188, x}, {TW_CEXP, v+189, x}, {TW_CEXP, v+190, x}, {TW_CEXP, v+191, x}, \
-	{TW_CEXP, v+192, x}, {TW_CEXP, v+193, x}, {TW_CEXP, v+194, x}, {TW_CEXP, v+195, x}, \
-	{TW_CEXP, v+196, x}, {TW_CEXP, v+197, x}, {TW_CEXP, v+198, x}, {TW_CEXP, v+199, x}, \
-	{TW_CEXP, v+200, x}, {TW_CEXP, v+201, x}, {TW_CEXP, v+202, x}, {TW_CEXP, v+203, x}, \
-	{TW_CEXP, v+204, x}, {TW_CEXP, v+205, x}, {TW_CEXP, v+206, x}, {TW_CEXP, v+207, x}, \
-	{TW_CEXP, v+208, x}, {TW_CEXP, v+209, x}, {TW_CEXP, v+210, x}, {TW_CEXP, v+211, x}, \
-	{TW_CEXP, v+212, x}, {TW_CEXP, v+213, x}, {TW_CEXP, v+214, x}, {TW_CEXP, v+215, x}, \
-	{TW_CEXP, v+216, x}, {TW_CEXP, v+217, x}, {TW_CEXP, v+218, x}, {TW_CEXP, v+219, x}, \
-	{TW_CEXP, v+220, x}, {TW_CEXP, v+221, x}, {TW_CEXP, v+222, x}, {TW_CEXP, v+223, x}, \
-	{TW_CEXP, v+224, x}, {TW_CEXP, v+225, x}, {TW_CEXP, v+226, x}, {TW_CEXP, v+227, x}, \
-	{TW_CEXP, v+228, x}, {TW_CEXP, v+229, x}, {TW_CEXP, v+230, x}, {TW_CEXP, v+231, x}, \
-	{TW_CEXP, v+232, x}, {TW_CEXP, v+233, x}, {TW_CEXP, v+234, x}, {TW_CEXP, v+235, x}, \
-	{TW_CEXP, v+236, x}, {TW_CEXP, v+237, x}, {TW_CEXP, v+238, x}, {TW_CEXP, v+239, x}, \
-	{TW_CEXP, v+240, x}, {TW_CEXP, v+241, x}, {TW_CEXP, v+242, x}, {TW_CEXP, v+243, x}, \
-	{TW_CEXP, v+244, x}, {TW_CEXP, v+245, x}, {TW_CEXP, v+246, x}, {TW_CEXP, v+247, x}, \
-	{TW_CEXP, v+248, x}, {TW_CEXP, v+249, x}, {TW_CEXP, v+250, x}, {TW_CEXP, v+251, x}, \
-	{TW_CEXP, v+252, x}, {TW_CEXP, v+253, x}, {TW_CEXP, v+254, x}, {TW_CEXP, v+255, x} 
-#endif // VTW_SIZE == 256
-#endif // REQ_VTW1
-#if defined(REQ_VTW2)
-#if defined(VTW_SIZE) && VTW_SIZE == 1
-#warning "using VTW2 with 1"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_SIN, v+0, -x} 
-#endif // VTW_SIZE == 1
-#if defined(VTW_SIZE) && VTW_SIZE == 2
-#warning "using VTW2 with 2"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_SIN, v+0, -x}, {TW_SIN, v+0, x} 
-#endif // VTW_SIZE == 2
-#if defined(VTW_SIZE) && VTW_SIZE == 4
-#warning "using VTW2 with 4"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
-	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x} 
-#endif // VTW_SIZE == 4
-#if defined(VTW_SIZE) && VTW_SIZE == 8
-#warning "using VTW2 with 8"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
-	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
-	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
-	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x} 
-#endif // VTW_SIZE == 8
-#if defined(VTW_SIZE) && VTW_SIZE == 16
-#warning "using VTW2 with 16"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
-	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
-	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
-	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
-	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
-	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x} 
-#endif // VTW_SIZE == 16
-#if defined(VTW_SIZE) && VTW_SIZE == 32
-#warning "using VTW2 with 32"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
-	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
-	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \
-	{TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \
-	{TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \
-	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
-	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
-	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \
-	{TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \
-	{TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x} 
-#endif // VTW_SIZE == 32
-#if defined(VTW_SIZE) && VTW_SIZE == 64
-#warning "using VTW2 with 64"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
-	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
-	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \
-	{TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \
-	{TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \
-	{TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \
-	{TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \
-	{TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \
-	{TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \
-	{TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \
-	{TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \
-	{TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \
-	{TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \
-	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
-	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
-	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \
-	{TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \
-	{TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \
-	{TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \
-	{TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \
-	{TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \
-	{TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \
-	{TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \
-	{TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \
-	{TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \
-	{TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x} 
-#endif // VTW_SIZE == 64
-#if defined(VTW_SIZE) && VTW_SIZE == 128
-#warning "using VTW2 with 128"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
-	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
-	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \
-	{TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \
-	{TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \
-	{TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \
-	{TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \
-	{TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \
-	{TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \
-	{TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \
-	{TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \
-	{TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \
-	{TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \
-	{TW_COS, v+32, x}, {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+33, x}, \
-	{TW_COS, v+34, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, {TW_COS, v+35, x}, \
-	{TW_COS, v+36, x}, {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+37, x}, \
-	{TW_COS, v+38, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, {TW_COS, v+39, x}, \
-	{TW_COS, v+40, x}, {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+41, x}, \
-	{TW_COS, v+42, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, {TW_COS, v+43, x}, \
-	{TW_COS, v+44, x}, {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+45, x}, \
-	{TW_COS, v+46, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, {TW_COS, v+47, x}, \
-	{TW_COS, v+48, x}, {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+49, x}, \
-	{TW_COS, v+50, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, {TW_COS, v+51, x}, \
-	{TW_COS, v+52, x}, {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+53, x}, \
-	{TW_COS, v+54, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, {TW_COS, v+55, x}, \
-	{TW_COS, v+56, x}, {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+57, x}, \
-	{TW_COS, v+58, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, {TW_COS, v+59, x}, \
-	{TW_COS, v+60, x}, {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+61, x}, \
-	{TW_COS, v+62, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, {TW_COS, v+63, x}, \
-	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
-	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
-	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \
-	{TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \
-	{TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \
-	{TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \
-	{TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \
-	{TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \
-	{TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \
-	{TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \
-	{TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \
-	{TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \
-	{TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x}, \
-	{TW_SIN, v+32, -x}, {TW_SIN, v+32, x}, {TW_SIN, v+33, -x}, {TW_SIN, v+33, x}, \
-	{TW_SIN, v+34, -x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, -x}, {TW_SIN, v+35, x}, \
-	{TW_SIN, v+36, -x}, {TW_SIN, v+36, x}, {TW_SIN, v+37, -x}, {TW_SIN, v+37, x}, \
-	{TW_SIN, v+38, -x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, -x}, {TW_SIN, v+39, x}, \
-	{TW_SIN, v+40, -x}, {TW_SIN, v+40, x}, {TW_SIN, v+41, -x}, {TW_SIN, v+41, x}, \
-	{TW_SIN, v+42, -x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, -x}, {TW_SIN, v+43, x}, \
-	{TW_SIN, v+44, -x}, {TW_SIN, v+44, x}, {TW_SIN, v+45, -x}, {TW_SIN, v+45, x}, \
-	{TW_SIN, v+46, -x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, -x}, {TW_SIN, v+47, x}, \
-	{TW_SIN, v+48, -x}, {TW_SIN, v+48, x}, {TW_SIN, v+49, -x}, {TW_SIN, v+49, x}, \
-	{TW_SIN, v+50, -x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, -x}, {TW_SIN, v+51, x}, \
-	{TW_SIN, v+52, -x}, {TW_SIN, v+52, x}, {TW_SIN, v+53, -x}, {TW_SIN, v+53, x}, \
-	{TW_SIN, v+54, -x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, -x}, {TW_SIN, v+55, x}, \
-	{TW_SIN, v+56, -x}, {TW_SIN, v+56, x}, {TW_SIN, v+57, -x}, {TW_SIN, v+57, x}, \
-	{TW_SIN, v+58, -x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, -x}, {TW_SIN, v+59, x}, \
-	{TW_SIN, v+60, -x}, {TW_SIN, v+60, x}, {TW_SIN, v+61, -x}, {TW_SIN, v+61, x}, \
-	{TW_SIN, v+62, -x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, -x}, {TW_SIN, v+63, x} 
-#endif // VTW_SIZE == 128
-#if defined(VTW_SIZE) && VTW_SIZE == 256
-#warning "using VTW2 with 256"
-#define VTW2(v,x) {TW_COS, v+0, x}, {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
-	{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
-	{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+9, x}, \
-	{TW_COS, v+10, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+13, x}, \
-	{TW_COS, v+14, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, {TW_COS, v+15, x}, \
-	{TW_COS, v+16, x}, {TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+17, x}, \
-	{TW_COS, v+18, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, {TW_COS, v+19, x}, \
-	{TW_COS, v+20, x}, {TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+21, x}, \
-	{TW_COS, v+22, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, {TW_COS, v+23, x}, \
-	{TW_COS, v+24, x}, {TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+25, x}, \
-	{TW_COS, v+26, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, {TW_COS, v+27, x}, \
-	{TW_COS, v+28, x}, {TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+29, x}, \
-	{TW_COS, v+30, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, {TW_COS, v+31, x}, \
-	{TW_COS, v+32, x}, {TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+33, x}, \
-	{TW_COS, v+34, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, {TW_COS, v+35, x}, \
-	{TW_COS, v+36, x}, {TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+37, x}, \
-	{TW_COS, v+38, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, {TW_COS, v+39, x}, \
-	{TW_COS, v+40, x}, {TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+41, x}, \
-	{TW_COS, v+42, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, {TW_COS, v+43, x}, \
-	{TW_COS, v+44, x}, {TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+45, x}, \
-	{TW_COS, v+46, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, {TW_COS, v+47, x}, \
-	{TW_COS, v+48, x}, {TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+49, x}, \
-	{TW_COS, v+50, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, {TW_COS, v+51, x}, \
-	{TW_COS, v+52, x}, {TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+53, x}, \
-	{TW_COS, v+54, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, {TW_COS, v+55, x}, \
-	{TW_COS, v+56, x}, {TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+57, x}, \
-	{TW_COS, v+58, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, {TW_COS, v+59, x}, \
-	{TW_COS, v+60, x}, {TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+61, x}, \
-	{TW_COS, v+62, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, {TW_COS, v+63, x}, \
-	{TW_COS, v+64, x}, {TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+65, x}, \
-	{TW_COS, v+66, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, {TW_COS, v+67, x}, \
-	{TW_COS, v+68, x}, {TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+69, x}, \
-	{TW_COS, v+70, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, {TW_COS, v+71, x}, \
-	{TW_COS, v+72, x}, {TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+73, x}, \
-	{TW_COS, v+74, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, {TW_COS, v+75, x}, \
-	{TW_COS, v+76, x}, {TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+77, x}, \
-	{TW_COS, v+78, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, {TW_COS, v+79, x}, \
-	{TW_COS, v+80, x}, {TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+81, x}, \
-	{TW_COS, v+82, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, {TW_COS, v+83, x}, \
-	{TW_COS, v+84, x}, {TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+85, x}, \
-	{TW_COS, v+86, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, {TW_COS, v+87, x}, \
-	{TW_COS, v+88, x}, {TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+89, x}, \
-	{TW_COS, v+90, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, {TW_COS, v+91, x}, \
-	{TW_COS, v+92, x}, {TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+93, x}, \
-	{TW_COS, v+94, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, {TW_COS, v+95, x}, \
-	{TW_COS, v+96, x}, {TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+97, x}, \
-	{TW_COS, v+98, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, {TW_COS, v+99, x}, \
-	{TW_COS, v+100, x}, {TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+101, x}, \
-	{TW_COS, v+102, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, {TW_COS, v+103, x}, \
-	{TW_COS, v+104, x}, {TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+105, x}, \
-	{TW_COS, v+106, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, {TW_COS, v+107, x}, \
-	{TW_COS, v+108, x}, {TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+109, x}, \
-	{TW_COS, v+110, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, {TW_COS, v+111, x}, \
-	{TW_COS, v+112, x}, {TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+113, x}, \
-	{TW_COS, v+114, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, {TW_COS, v+115, x}, \
-	{TW_COS, v+116, x}, {TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+117, x}, \
-	{TW_COS, v+118, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, {TW_COS, v+119, x}, \
-	{TW_COS, v+120, x}, {TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+121, x}, \
-	{TW_COS, v+122, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, {TW_COS, v+123, x}, \
-	{TW_COS, v+124, x}, {TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+125, x}, \
-	{TW_COS, v+126, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, {TW_COS, v+127, x}, \
-	{TW_SIN, v+0, -x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
-	{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
-	{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, -x}, {TW_SIN, v+8, x}, {TW_SIN, v+9, -x}, {TW_SIN, v+9, x}, \
-	{TW_SIN, v+10, -x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, -x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, -x}, {TW_SIN, v+12, x}, {TW_SIN, v+13, -x}, {TW_SIN, v+13, x}, \
-	{TW_SIN, v+14, -x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, -x}, {TW_SIN, v+15, x}, \
-	{TW_SIN, v+16, -x}, {TW_SIN, v+16, x}, {TW_SIN, v+17, -x}, {TW_SIN, v+17, x}, \
-	{TW_SIN, v+18, -x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, -x}, {TW_SIN, v+19, x}, \
-	{TW_SIN, v+20, -x}, {TW_SIN, v+20, x}, {TW_SIN, v+21, -x}, {TW_SIN, v+21, x}, \
-	{TW_SIN, v+22, -x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, -x}, {TW_SIN, v+23, x}, \
-	{TW_SIN, v+24, -x}, {TW_SIN, v+24, x}, {TW_SIN, v+25, -x}, {TW_SIN, v+25, x}, \
-	{TW_SIN, v+26, -x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, -x}, {TW_SIN, v+27, x}, \
-	{TW_SIN, v+28, -x}, {TW_SIN, v+28, x}, {TW_SIN, v+29, -x}, {TW_SIN, v+29, x}, \
-	{TW_SIN, v+30, -x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, -x}, {TW_SIN, v+31, x}, \
-	{TW_SIN, v+32, -x}, {TW_SIN, v+32, x}, {TW_SIN, v+33, -x}, {TW_SIN, v+33, x}, \
-	{TW_SIN, v+34, -x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, -x}, {TW_SIN, v+35, x}, \
-	{TW_SIN, v+36, -x}, {TW_SIN, v+36, x}, {TW_SIN, v+37, -x}, {TW_SIN, v+37, x}, \
-	{TW_SIN, v+38, -x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, -x}, {TW_SIN, v+39, x}, \
-	{TW_SIN, v+40, -x}, {TW_SIN, v+40, x}, {TW_SIN, v+41, -x}, {TW_SIN, v+41, x}, \
-	{TW_SIN, v+42, -x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, -x}, {TW_SIN, v+43, x}, \
-	{TW_SIN, v+44, -x}, {TW_SIN, v+44, x}, {TW_SIN, v+45, -x}, {TW_SIN, v+45, x}, \
-	{TW_SIN, v+46, -x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, -x}, {TW_SIN, v+47, x}, \
-	{TW_SIN, v+48, -x}, {TW_SIN, v+48, x}, {TW_SIN, v+49, -x}, {TW_SIN, v+49, x}, \
-	{TW_SIN, v+50, -x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, -x}, {TW_SIN, v+51, x}, \
-	{TW_SIN, v+52, -x}, {TW_SIN, v+52, x}, {TW_SIN, v+53, -x}, {TW_SIN, v+53, x}, \
-	{TW_SIN, v+54, -x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, -x}, {TW_SIN, v+55, x}, \
-	{TW_SIN, v+56, -x}, {TW_SIN, v+56, x}, {TW_SIN, v+57, -x}, {TW_SIN, v+57, x}, \
-	{TW_SIN, v+58, -x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, -x}, {TW_SIN, v+59, x}, \
-	{TW_SIN, v+60, -x}, {TW_SIN, v+60, x}, {TW_SIN, v+61, -x}, {TW_SIN, v+61, x}, \
-	{TW_SIN, v+62, -x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, -x}, {TW_SIN, v+63, x}, \
-	{TW_SIN, v+64, -x}, {TW_SIN, v+64, x}, {TW_SIN, v+65, -x}, {TW_SIN, v+65, x}, \
-	{TW_SIN, v+66, -x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, -x}, {TW_SIN, v+67, x}, \
-	{TW_SIN, v+68, -x}, {TW_SIN, v+68, x}, {TW_SIN, v+69, -x}, {TW_SIN, v+69, x}, \
-	{TW_SIN, v+70, -x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, -x}, {TW_SIN, v+71, x}, \
-	{TW_SIN, v+72, -x}, {TW_SIN, v+72, x}, {TW_SIN, v+73, -x}, {TW_SIN, v+73, x}, \
-	{TW_SIN, v+74, -x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, -x}, {TW_SIN, v+75, x}, \
-	{TW_SIN, v+76, -x}, {TW_SIN, v+76, x}, {TW_SIN, v+77, -x}, {TW_SIN, v+77, x}, \
-	{TW_SIN, v+78, -x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, -x}, {TW_SIN, v+79, x}, \
-	{TW_SIN, v+80, -x}, {TW_SIN, v+80, x}, {TW_SIN, v+81, -x}, {TW_SIN, v+81, x}, \
-	{TW_SIN, v+82, -x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, -x}, {TW_SIN, v+83, x}, \
-	{TW_SIN, v+84, -x}, {TW_SIN, v+84, x}, {TW_SIN, v+85, -x}, {TW_SIN, v+85, x}, \
-	{TW_SIN, v+86, -x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, -x}, {TW_SIN, v+87, x}, \
-	{TW_SIN, v+88, -x}, {TW_SIN, v+88, x}, {TW_SIN, v+89, -x}, {TW_SIN, v+89, x}, \
-	{TW_SIN, v+90, -x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, -x}, {TW_SIN, v+91, x}, \
-	{TW_SIN, v+92, -x}, {TW_SIN, v+92, x}, {TW_SIN, v+93, -x}, {TW_SIN, v+93, x}, \
-	{TW_SIN, v+94, -x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, -x}, {TW_SIN, v+95, x}, \
-	{TW_SIN, v+96, -x}, {TW_SIN, v+96, x}, {TW_SIN, v+97, -x}, {TW_SIN, v+97, x}, \
-	{TW_SIN, v+98, -x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, -x}, {TW_SIN, v+99, x}, \
-	{TW_SIN, v+100, -x}, {TW_SIN, v+100, x}, {TW_SIN, v+101, -x}, {TW_SIN, v+101, x}, \
-	{TW_SIN, v+102, -x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, -x}, {TW_SIN, v+103, x}, \
-	{TW_SIN, v+104, -x}, {TW_SIN, v+104, x}, {TW_SIN, v+105, -x}, {TW_SIN, v+105, x}, \
-	{TW_SIN, v+106, -x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, -x}, {TW_SIN, v+107, x}, \
-	{TW_SIN, v+108, -x}, {TW_SIN, v+108, x}, {TW_SIN, v+109, -x}, {TW_SIN, v+109, x}, \
-	{TW_SIN, v+110, -x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, -x}, {TW_SIN, v+111, x}, \
-	{TW_SIN, v+112, -x}, {TW_SIN, v+112, x}, {TW_SIN, v+113, -x}, {TW_SIN, v+113, x}, \
-	{TW_SIN, v+114, -x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, -x}, {TW_SIN, v+115, x}, \
-	{TW_SIN, v+116, -x}, {TW_SIN, v+116, x}, {TW_SIN, v+117, -x}, {TW_SIN, v+117, x}, \
-	{TW_SIN, v+118, -x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, -x}, {TW_SIN, v+119, x}, \
-	{TW_SIN, v+120, -x}, {TW_SIN, v+120, x}, {TW_SIN, v+121, -x}, {TW_SIN, v+121, x}, \
-	{TW_SIN, v+122, -x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, -x}, {TW_SIN, v+123, x}, \
-	{TW_SIN, v+124, -x}, {TW_SIN, v+124, x}, {TW_SIN, v+125, -x}, {TW_SIN, v+125, x}, \
-	{TW_SIN, v+126, -x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, -x}, {TW_SIN, v+127, x} 
-#endif // VTW_SIZE == 256
-#endif // REQ_VTW2
-#if defined(REQ_VTWS)
-#if defined(VTW_SIZE) && VTW_SIZE == 1
-#warning "using VTWS with 1"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_SIN, v+0, x} 
-#endif // VTW_SIZE == 1
-#if defined(VTW_SIZE) && VTW_SIZE == 2
-#warning "using VTWS with 2"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_SIN, v+0, x}, {TW_SIN, v+1, x} 
-#endif // VTW_SIZE == 2
-#if defined(VTW_SIZE) && VTW_SIZE == 4
-#warning "using VTWS with 4"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
-	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x} 
-#endif // VTW_SIZE == 4
-#if defined(VTW_SIZE) && VTW_SIZE == 8
-#warning "using VTWS with 8"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
-	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x} 
-#endif // VTW_SIZE == 8
-#if defined(VTW_SIZE) && VTW_SIZE == 16
-#warning "using VTWS with 16"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
-	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x} 
-#endif // VTW_SIZE == 16
-#if defined(VTW_SIZE) && VTW_SIZE == 32
-#warning "using VTWS with 32"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
-	{TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \
-	{TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \
-	{TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \
-	{TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \
-	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \
-	{TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \
-	{TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \
-	{TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \
-	{TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x} 
-#endif // VTW_SIZE == 32
-#if defined(VTW_SIZE) && VTW_SIZE == 64
-#warning "using VTWS with 64"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
-	{TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \
-	{TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \
-	{TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \
-	{TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \
-	{TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \
-	{TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \
-	{TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \
-	{TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \
-	{TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \
-	{TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \
-	{TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \
-	{TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \
-	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \
-	{TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \
-	{TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \
-	{TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \
-	{TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \
-	{TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \
-	{TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \
-	{TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \
-	{TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \
-	{TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \
-	{TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \
-	{TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \
-	{TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x} 
-#endif // VTW_SIZE == 64
-#if defined(VTW_SIZE) && VTW_SIZE == 128
-#warning "using VTWS with 128"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
-	{TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \
-	{TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \
-	{TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \
-	{TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \
-	{TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \
-	{TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \
-	{TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \
-	{TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \
-	{TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \
-	{TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \
-	{TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \
-	{TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \
-	{TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, \
-	{TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, \
-	{TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, \
-	{TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, \
-	{TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, \
-	{TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, \
-	{TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, \
-	{TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, \
-	{TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, \
-	{TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, \
-	{TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, \
-	{TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, \
-	{TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, \
-	{TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, \
-	{TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, \
-	{TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, \
-	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \
-	{TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \
-	{TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \
-	{TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \
-	{TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \
-	{TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \
-	{TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \
-	{TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \
-	{TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \
-	{TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \
-	{TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \
-	{TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \
-	{TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x}, \
-	{TW_SIN, v+64, x}, {TW_SIN, v+65, x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, x}, \
-	{TW_SIN, v+68, x}, {TW_SIN, v+69, x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, x}, \
-	{TW_SIN, v+72, x}, {TW_SIN, v+73, x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, x}, \
-	{TW_SIN, v+76, x}, {TW_SIN, v+77, x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, x}, \
-	{TW_SIN, v+80, x}, {TW_SIN, v+81, x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, x}, \
-	{TW_SIN, v+84, x}, {TW_SIN, v+85, x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, x}, \
-	{TW_SIN, v+88, x}, {TW_SIN, v+89, x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, x}, \
-	{TW_SIN, v+92, x}, {TW_SIN, v+93, x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, x}, \
-	{TW_SIN, v+96, x}, {TW_SIN, v+97, x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, x}, \
-	{TW_SIN, v+100, x}, {TW_SIN, v+101, x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, x}, \
-	{TW_SIN, v+104, x}, {TW_SIN, v+105, x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, x}, \
-	{TW_SIN, v+108, x}, {TW_SIN, v+109, x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, x}, \
-	{TW_SIN, v+112, x}, {TW_SIN, v+113, x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, x}, \
-	{TW_SIN, v+116, x}, {TW_SIN, v+117, x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, x}, \
-	{TW_SIN, v+120, x}, {TW_SIN, v+121, x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, x}, \
-	{TW_SIN, v+124, x}, {TW_SIN, v+125, x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, x} 
-#endif // VTW_SIZE == 128
-#if defined(VTW_SIZE) && VTW_SIZE == 256
-#warning "using VTWS with 256"
-#define VTWS(v,x) {TW_COS, v+0, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
-	{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
-	{TW_COS, v+8, x}, {TW_COS, v+9, x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
-	{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
-	{TW_COS, v+16, x}, {TW_COS, v+17, x}, {TW_COS, v+18, x}, {TW_COS, v+19, x}, \
-	{TW_COS, v+20, x}, {TW_COS, v+21, x}, {TW_COS, v+22, x}, {TW_COS, v+23, x}, \
-	{TW_COS, v+24, x}, {TW_COS, v+25, x}, {TW_COS, v+26, x}, {TW_COS, v+27, x}, \
-	{TW_COS, v+28, x}, {TW_COS, v+29, x}, {TW_COS, v+30, x}, {TW_COS, v+31, x}, \
-	{TW_COS, v+32, x}, {TW_COS, v+33, x}, {TW_COS, v+34, x}, {TW_COS, v+35, x}, \
-	{TW_COS, v+36, x}, {TW_COS, v+37, x}, {TW_COS, v+38, x}, {TW_COS, v+39, x}, \
-	{TW_COS, v+40, x}, {TW_COS, v+41, x}, {TW_COS, v+42, x}, {TW_COS, v+43, x}, \
-	{TW_COS, v+44, x}, {TW_COS, v+45, x}, {TW_COS, v+46, x}, {TW_COS, v+47, x}, \
-	{TW_COS, v+48, x}, {TW_COS, v+49, x}, {TW_COS, v+50, x}, {TW_COS, v+51, x}, \
-	{TW_COS, v+52, x}, {TW_COS, v+53, x}, {TW_COS, v+54, x}, {TW_COS, v+55, x}, \
-	{TW_COS, v+56, x}, {TW_COS, v+57, x}, {TW_COS, v+58, x}, {TW_COS, v+59, x}, \
-	{TW_COS, v+60, x}, {TW_COS, v+61, x}, {TW_COS, v+62, x}, {TW_COS, v+63, x}, \
-	{TW_COS, v+64, x}, {TW_COS, v+65, x}, {TW_COS, v+66, x}, {TW_COS, v+67, x}, \
-	{TW_COS, v+68, x}, {TW_COS, v+69, x}, {TW_COS, v+70, x}, {TW_COS, v+71, x}, \
-	{TW_COS, v+72, x}, {TW_COS, v+73, x}, {TW_COS, v+74, x}, {TW_COS, v+75, x}, \
-	{TW_COS, v+76, x}, {TW_COS, v+77, x}, {TW_COS, v+78, x}, {TW_COS, v+79, x}, \
-	{TW_COS, v+80, x}, {TW_COS, v+81, x}, {TW_COS, v+82, x}, {TW_COS, v+83, x}, \
-	{TW_COS, v+84, x}, {TW_COS, v+85, x}, {TW_COS, v+86, x}, {TW_COS, v+87, x}, \
-	{TW_COS, v+88, x}, {TW_COS, v+89, x}, {TW_COS, v+90, x}, {TW_COS, v+91, x}, \
-	{TW_COS, v+92, x}, {TW_COS, v+93, x}, {TW_COS, v+94, x}, {TW_COS, v+95, x}, \
-	{TW_COS, v+96, x}, {TW_COS, v+97, x}, {TW_COS, v+98, x}, {TW_COS, v+99, x}, \
-	{TW_COS, v+100, x}, {TW_COS, v+101, x}, {TW_COS, v+102, x}, {TW_COS, v+103, x}, \
-	{TW_COS, v+104, x}, {TW_COS, v+105, x}, {TW_COS, v+106, x}, {TW_COS, v+107, x}, \
-	{TW_COS, v+108, x}, {TW_COS, v+109, x}, {TW_COS, v+110, x}, {TW_COS, v+111, x}, \
-	{TW_COS, v+112, x}, {TW_COS, v+113, x}, {TW_COS, v+114, x}, {TW_COS, v+115, x}, \
-	{TW_COS, v+116, x}, {TW_COS, v+117, x}, {TW_COS, v+118, x}, {TW_COS, v+119, x}, \
-	{TW_COS, v+120, x}, {TW_COS, v+121, x}, {TW_COS, v+122, x}, {TW_COS, v+123, x}, \
-	{TW_COS, v+124, x}, {TW_COS, v+125, x}, {TW_COS, v+126, x}, {TW_COS, v+127, x}, \
-	{TW_COS, v+128, x}, {TW_COS, v+129, x}, {TW_COS, v+130, x}, {TW_COS, v+131, x}, \
-	{TW_COS, v+132, x}, {TW_COS, v+133, x}, {TW_COS, v+134, x}, {TW_COS, v+135, x}, \
-	{TW_COS, v+136, x}, {TW_COS, v+137, x}, {TW_COS, v+138, x}, {TW_COS, v+139, x}, \
-	{TW_COS, v+140, x}, {TW_COS, v+141, x}, {TW_COS, v+142, x}, {TW_COS, v+143, x}, \
-	{TW_COS, v+144, x}, {TW_COS, v+145, x}, {TW_COS, v+146, x}, {TW_COS, v+147, x}, \
-	{TW_COS, v+148, x}, {TW_COS, v+149, x}, {TW_COS, v+150, x}, {TW_COS, v+151, x}, \
-	{TW_COS, v+152, x}, {TW_COS, v+153, x}, {TW_COS, v+154, x}, {TW_COS, v+155, x}, \
-	{TW_COS, v+156, x}, {TW_COS, v+157, x}, {TW_COS, v+158, x}, {TW_COS, v+159, x}, \
-	{TW_COS, v+160, x}, {TW_COS, v+161, x}, {TW_COS, v+162, x}, {TW_COS, v+163, x}, \
-	{TW_COS, v+164, x}, {TW_COS, v+165, x}, {TW_COS, v+166, x}, {TW_COS, v+167, x}, \
-	{TW_COS, v+168, x}, {TW_COS, v+169, x}, {TW_COS, v+170, x}, {TW_COS, v+171, x}, \
-	{TW_COS, v+172, x}, {TW_COS, v+173, x}, {TW_COS, v+174, x}, {TW_COS, v+175, x}, \
-	{TW_COS, v+176, x}, {TW_COS, v+177, x}, {TW_COS, v+178, x}, {TW_COS, v+179, x}, \
-	{TW_COS, v+180, x}, {TW_COS, v+181, x}, {TW_COS, v+182, x}, {TW_COS, v+183, x}, \
-	{TW_COS, v+184, x}, {TW_COS, v+185, x}, {TW_COS, v+186, x}, {TW_COS, v+187, x}, \
-	{TW_COS, v+188, x}, {TW_COS, v+189, x}, {TW_COS, v+190, x}, {TW_COS, v+191, x}, \
-	{TW_COS, v+192, x}, {TW_COS, v+193, x}, {TW_COS, v+194, x}, {TW_COS, v+195, x}, \
-	{TW_COS, v+196, x}, {TW_COS, v+197, x}, {TW_COS, v+198, x}, {TW_COS, v+199, x}, \
-	{TW_COS, v+200, x}, {TW_COS, v+201, x}, {TW_COS, v+202, x}, {TW_COS, v+203, x}, \
-	{TW_COS, v+204, x}, {TW_COS, v+205, x}, {TW_COS, v+206, x}, {TW_COS, v+207, x}, \
-	{TW_COS, v+208, x}, {TW_COS, v+209, x}, {TW_COS, v+210, x}, {TW_COS, v+211, x}, \
-	{TW_COS, v+212, x}, {TW_COS, v+213, x}, {TW_COS, v+214, x}, {TW_COS, v+215, x}, \
-	{TW_COS, v+216, x}, {TW_COS, v+217, x}, {TW_COS, v+218, x}, {TW_COS, v+219, x}, \
-	{TW_COS, v+220, x}, {TW_COS, v+221, x}, {TW_COS, v+222, x}, {TW_COS, v+223, x}, \
-	{TW_COS, v+224, x}, {TW_COS, v+225, x}, {TW_COS, v+226, x}, {TW_COS, v+227, x}, \
-	{TW_COS, v+228, x}, {TW_COS, v+229, x}, {TW_COS, v+230, x}, {TW_COS, v+231, x}, \
-	{TW_COS, v+232, x}, {TW_COS, v+233, x}, {TW_COS, v+234, x}, {TW_COS, v+235, x}, \
-	{TW_COS, v+236, x}, {TW_COS, v+237, x}, {TW_COS, v+238, x}, {TW_COS, v+239, x}, \
-	{TW_COS, v+240, x}, {TW_COS, v+241, x}, {TW_COS, v+242, x}, {TW_COS, v+243, x}, \
-	{TW_COS, v+244, x}, {TW_COS, v+245, x}, {TW_COS, v+246, x}, {TW_COS, v+247, x}, \
-	{TW_COS, v+248, x}, {TW_COS, v+249, x}, {TW_COS, v+250, x}, {TW_COS, v+251, x}, \
-	{TW_COS, v+252, x}, {TW_COS, v+253, x}, {TW_COS, v+254, x}, {TW_COS, v+255, x}, \
-	{TW_SIN, v+0, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
-	{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}, \
-	{TW_SIN, v+8, x}, {TW_SIN, v+9, x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
-	{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}, \
-	{TW_SIN, v+16, x}, {TW_SIN, v+17, x}, {TW_SIN, v+18, x}, {TW_SIN, v+19, x}, \
-	{TW_SIN, v+20, x}, {TW_SIN, v+21, x}, {TW_SIN, v+22, x}, {TW_SIN, v+23, x}, \
-	{TW_SIN, v+24, x}, {TW_SIN, v+25, x}, {TW_SIN, v+26, x}, {TW_SIN, v+27, x}, \
-	{TW_SIN, v+28, x}, {TW_SIN, v+29, x}, {TW_SIN, v+30, x}, {TW_SIN, v+31, x}, \
-	{TW_SIN, v+32, x}, {TW_SIN, v+33, x}, {TW_SIN, v+34, x}, {TW_SIN, v+35, x}, \
-	{TW_SIN, v+36, x}, {TW_SIN, v+37, x}, {TW_SIN, v+38, x}, {TW_SIN, v+39, x}, \
-	{TW_SIN, v+40, x}, {TW_SIN, v+41, x}, {TW_SIN, v+42, x}, {TW_SIN, v+43, x}, \
-	{TW_SIN, v+44, x}, {TW_SIN, v+45, x}, {TW_SIN, v+46, x}, {TW_SIN, v+47, x}, \
-	{TW_SIN, v+48, x}, {TW_SIN, v+49, x}, {TW_SIN, v+50, x}, {TW_SIN, v+51, x}, \
-	{TW_SIN, v+52, x}, {TW_SIN, v+53, x}, {TW_SIN, v+54, x}, {TW_SIN, v+55, x}, \
-	{TW_SIN, v+56, x}, {TW_SIN, v+57, x}, {TW_SIN, v+58, x}, {TW_SIN, v+59, x}, \
-	{TW_SIN, v+60, x}, {TW_SIN, v+61, x}, {TW_SIN, v+62, x}, {TW_SIN, v+63, x}, \
-	{TW_SIN, v+64, x}, {TW_SIN, v+65, x}, {TW_SIN, v+66, x}, {TW_SIN, v+67, x}, \
-	{TW_SIN, v+68, x}, {TW_SIN, v+69, x}, {TW_SIN, v+70, x}, {TW_SIN, v+71, x}, \
-	{TW_SIN, v+72, x}, {TW_SIN, v+73, x}, {TW_SIN, v+74, x}, {TW_SIN, v+75, x}, \
-	{TW_SIN, v+76, x}, {TW_SIN, v+77, x}, {TW_SIN, v+78, x}, {TW_SIN, v+79, x}, \
-	{TW_SIN, v+80, x}, {TW_SIN, v+81, x}, {TW_SIN, v+82, x}, {TW_SIN, v+83, x}, \
-	{TW_SIN, v+84, x}, {TW_SIN, v+85, x}, {TW_SIN, v+86, x}, {TW_SIN, v+87, x}, \
-	{TW_SIN, v+88, x}, {TW_SIN, v+89, x}, {TW_SIN, v+90, x}, {TW_SIN, v+91, x}, \
-	{TW_SIN, v+92, x}, {TW_SIN, v+93, x}, {TW_SIN, v+94, x}, {TW_SIN, v+95, x}, \
-	{TW_SIN, v+96, x}, {TW_SIN, v+97, x}, {TW_SIN, v+98, x}, {TW_SIN, v+99, x}, \
-	{TW_SIN, v+100, x}, {TW_SIN, v+101, x}, {TW_SIN, v+102, x}, {TW_SIN, v+103, x}, \
-	{TW_SIN, v+104, x}, {TW_SIN, v+105, x}, {TW_SIN, v+106, x}, {TW_SIN, v+107, x}, \
-	{TW_SIN, v+108, x}, {TW_SIN, v+109, x}, {TW_SIN, v+110, x}, {TW_SIN, v+111, x}, \
-	{TW_SIN, v+112, x}, {TW_SIN, v+113, x}, {TW_SIN, v+114, x}, {TW_SIN, v+115, x}, \
-	{TW_SIN, v+116, x}, {TW_SIN, v+117, x}, {TW_SIN, v+118, x}, {TW_SIN, v+119, x}, \
-	{TW_SIN, v+120, x}, {TW_SIN, v+121, x}, {TW_SIN, v+122, x}, {TW_SIN, v+123, x}, \
-	{TW_SIN, v+124, x}, {TW_SIN, v+125, x}, {TW_SIN, v+126, x}, {TW_SIN, v+127, x}, \
-	{TW_SIN, v+128, x}, {TW_SIN, v+129, x}, {TW_SIN, v+130, x}, {TW_SIN, v+131, x}, \
-	{TW_SIN, v+132, x}, {TW_SIN, v+133, x}, {TW_SIN, v+134, x}, {TW_SIN, v+135, x}, \
-	{TW_SIN, v+136, x}, {TW_SIN, v+137, x}, {TW_SIN, v+138, x}, {TW_SIN, v+139, x}, \
-	{TW_SIN, v+140, x}, {TW_SIN, v+141, x}, {TW_SIN, v+142, x}, {TW_SIN, v+143, x}, \
-	{TW_SIN, v+144, x}, {TW_SIN, v+145, x}, {TW_SIN, v+146, x}, {TW_SIN, v+147, x}, \
-	{TW_SIN, v+148, x}, {TW_SIN, v+149, x}, {TW_SIN, v+150, x}, {TW_SIN, v+151, x}, \
-	{TW_SIN, v+152, x}, {TW_SIN, v+153, x}, {TW_SIN, v+154, x}, {TW_SIN, v+155, x}, \
-	{TW_SIN, v+156, x}, {TW_SIN, v+157, x}, {TW_SIN, v+158, x}, {TW_SIN, v+159, x}, \
-	{TW_SIN, v+160, x}, {TW_SIN, v+161, x}, {TW_SIN, v+162, x}, {TW_SIN, v+163, x}, \
-	{TW_SIN, v+164, x}, {TW_SIN, v+165, x}, {TW_SIN, v+166, x}, {TW_SIN, v+167, x}, \
-	{TW_SIN, v+168, x}, {TW_SIN, v+169, x}, {TW_SIN, v+170, x}, {TW_SIN, v+171, x}, \
-	{TW_SIN, v+172, x}, {TW_SIN, v+173, x}, {TW_SIN, v+174, x}, {TW_SIN, v+175, x}, \
-	{TW_SIN, v+176, x}, {TW_SIN, v+177, x}, {TW_SIN, v+178, x}, {TW_SIN, v+179, x}, \
-	{TW_SIN, v+180, x}, {TW_SIN, v+181, x}, {TW_SIN, v+182, x}, {TW_SIN, v+183, x}, \
-	{TW_SIN, v+184, x}, {TW_SIN, v+185, x}, {TW_SIN, v+186, x}, {TW_SIN, v+187, x}, \
-	{TW_SIN, v+188, x}, {TW_SIN, v+189, x}, {TW_SIN, v+190, x}, {TW_SIN, v+191, x}, \
-	{TW_SIN, v+192, x}, {TW_SIN, v+193, x}, {TW_SIN, v+194, x}, {TW_SIN, v+195, x}, \
-	{TW_SIN, v+196, x}, {TW_SIN, v+197, x}, {TW_SIN, v+198, x}, {TW_SIN, v+199, x}, \
-	{TW_SIN, v+200, x}, {TW_SIN, v+201, x}, {TW_SIN, v+202, x}, {TW_SIN, v+203, x}, \
-	{TW_SIN, v+204, x}, {TW_SIN, v+205, x}, {TW_SIN, v+206, x}, {TW_SIN, v+207, x}, \
-	{TW_SIN, v+208, x}, {TW_SIN, v+209, x}, {TW_SIN, v+210, x}, {TW_SIN, v+211, x}, \
-	{TW_SIN, v+212, x}, {TW_SIN, v+213, x}, {TW_SIN, v+214, x}, {TW_SIN, v+215, x}, \
-	{TW_SIN, v+216, x}, {TW_SIN, v+217, x}, {TW_SIN, v+218, x}, {TW_SIN, v+219, x}, \
-	{TW_SIN, v+220, x}, {TW_SIN, v+221, x}, {TW_SIN, v+222, x}, {TW_SIN, v+223, x}, \
-	{TW_SIN, v+224, x}, {TW_SIN, v+225, x}, {TW_SIN, v+226, x}, {TW_SIN, v+227, x}, \
-	{TW_SIN, v+228, x}, {TW_SIN, v+229, x}, {TW_SIN, v+230, x}, {TW_SIN, v+231, x}, \
-	{TW_SIN, v+232, x}, {TW_SIN, v+233, x}, {TW_SIN, v+234, x}, {TW_SIN, v+235, x}, \
-	{TW_SIN, v+236, x}, {TW_SIN, v+237, x}, {TW_SIN, v+238, x}, {TW_SIN, v+239, x}, \
-	{TW_SIN, v+240, x}, {TW_SIN, v+241, x}, {TW_SIN, v+242, x}, {TW_SIN, v+243, x}, \
-	{TW_SIN, v+244, x}, {TW_SIN, v+245, x}, {TW_SIN, v+246, x}, {TW_SIN, v+247, x}, \
-	{TW_SIN, v+248, x}, {TW_SIN, v+249, x}, {TW_SIN, v+250, x}, {TW_SIN, v+251, x}, \
-	{TW_SIN, v+252, x}, {TW_SIN, v+253, x}, {TW_SIN, v+254, x}, {TW_SIN, v+255, x} 
-#endif // VTW_SIZE == 256
-#endif // REQ_VTWS

From 14084d09a8d9023bcfa366c91c238ee201572697 Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet <gilles@rist.or.jp>
Date: Sun, 26 Jul 2020 00:11:10 +0900
Subject: [PATCH 04/13] update .gitignore

ignore all files automatically generated for SVE support
---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 355ca76ef..5d1dc4cab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ rdft/simd/common/*.c
 rdft/simd/kcvi/*.c
 rdft/simd/neon/*.c
 rdft/simd/sse2/*.c
+rdft/simd/sve*/*.c
 rdft/simd/vsx/*.c
 rdft/scalar/r2cb/*.c
 rdft/scalar/r2cf/*.c
@@ -45,6 +46,7 @@ dft/simd/common/*.c
 dft/simd/kcvi/*.c
 dft/simd/neon/*.c
 dft/simd/sse2/*.c
+dft/simd/sve*/*.c
 dft/simd/vsx/*.c
 
 # other generated files
@@ -54,6 +56,8 @@ api/fftw3*.f*
 *.cmake
 mpi/f03-wrap.c
 mpi/fftw3*-mpi.f*
+simd-support/vtw.h
+simd-support/generate_vtw
 
 # other build products
 tests/bench

From 1348189b56a031f455c8db96a963170fb0375755 Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet <gilles@rist.or.jp>
Date: Sat, 18 Jul 2020 15:47:08 +0900
Subject: [PATCH 05/13] sve: correctly support negative offsets

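Gather/scatter index vectors were built as unsigned (svuint32_t /
svuint64_t), so a negative input/output stride wrapped around to a huge
positive offset and the accesses walked far past the buffer instead of
backwards. Switch to the signed index/offset intrinsic variants.

A minimal sketch of the failure mode (illustrative, not from the tree):

    #include <arm_sve.h>
    static inline void stride_sketch(int32_t ivs) /* e.g. ivs = -2 */
    {
      svuint32_t bad  = svindex_u32(0, (uint32_t)ivs); /* 0, 4294967294u, ... */
      svint32_t  good = svindex_s32(0, ivs);           /* 0, -2, -4, ...      */
      (void)bad; (void)good;
    }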
---
 simd-support/simd-maskedsve.h | 52 +++++++++++++++++------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index 459d2bb8b..606854739 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -161,12 +161,12 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
 static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
-  svuint32_t  gvvl = svindex_u32(0, 1);
-  gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ivs);
-  gvvl = svzip1_u32(gvvl, gvvl);
-  gvvl = svadd_u32_x(svptrue_b32(), gvvl, svdupq_n_u32(0,sizeof(R),0,sizeof(R)));
+  svint32_t  gvvl = svindex_s32(0, 1);
+  gvvl = svmul_n_s32_x(MASKA, gvvl, sizeof(R)*ivs);
+  gvvl = svzip1_s32(gvvl, gvvl);
+  gvvl = svadd_s32_x(MASKA, gvvl, svdupq_n_s32(0,sizeof(R),0,sizeof(R)));
   
-  return svld1_gather_u32offset_f32(MASKA, x, gvvl);
+  return svld1_gather_s32offset_f32(MASKA, x, gvvl);
 }
 
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
@@ -175,12 +175,12 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
   if (ovs==0) { // FIXME: hack for extra_iter hack support
     v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0));
   }
-  svuint32_t  gvvl = svindex_u32(0, 1);
-  gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ovs);
-  gvvl = svzip1_u32(gvvl, gvvl);
-  gvvl = svadd_u32_x(svptrue_b32(), gvvl, svdupq_n_u32(0,sizeof(R),0,sizeof(R)));
+  svint32_t  gvvl = svindex_s32(0, 1);
+  gvvl = svmul_n_s32_x(MASKA, gvvl, sizeof(R)*ovs);
+  gvvl = svzip1_s32(gvvl, gvvl);
+  gvvl = svadd_s32_x(MASKA, gvvl, svdupq_n_s32(0,sizeof(R),0,sizeof(R)));
 
-  svst1_scatter_u32offset_f32(MASKA, x, gvvl, v);
+  svst1_scatter_s32offset_f32(MASKA, x, gvvl, v);
 }
 
 #else /* !FFTW_SINGLE */
@@ -189,12 +189,12 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
   (void)aligned_like; /* UNUSED */
-  svuint64_t  gvvl = svindex_u64(0, 1);
-  gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ivs);
-  gvvl = svzip1_u64(gvvl, gvvl);
-  gvvl = svadd_u64_x(svptrue_b64(), gvvl, svdupq_n_u64(0,sizeof(R)));
+  svint64_t  gvvl = svindex_s64(0, 1);
+  gvvl = svmul_n_s64_x(MASKA, gvvl, sizeof(R)*ivs);
+  gvvl = svzip1_s64(gvvl, gvvl);
+  gvvl = svadd_s64_x(MASKA, gvvl, svdupq_n_s64(0,sizeof(R)));
 
-  return svld1_gather_u64offset_f64(MASKA, x, gvvl);
+  return svld1_gather_s64offset_f64(MASKA, x, gvvl);
 }
 
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
@@ -203,12 +203,12 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
   if (ovs==0) { // FIXME: hack for extra_iter hack support
     v = svdupq_lane_f64(v,0);
   }
-  svuint64_t  gvvl = svindex_u64(0, 1);
-  gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ovs);
-  gvvl = svzip1_u64(gvvl, gvvl);
-  gvvl = svadd_u64_x(svptrue_b64(), gvvl, svdupq_n_u64(0,sizeof(R)));
+  svint64_t  gvvl = svindex_s64(0, 1);
+  gvvl = svmul_n_s64_x(MASKA, gvvl, sizeof(R)*ovs);
+  gvvl = svzip1_s64(gvvl, gvvl);
+  gvvl = svadd_s64_x(MASKA, gvvl, svdupq_n_s64(0,sizeof(R)));
 
-  svst1_scatter_u64offset_f64(MASKA, x, gvvl, v);
+  svst1_scatter_s64offset_f64(MASKA, x, gvvl, v);
 }
 
 #endif /* FFTW_SINGLE */
@@ -224,10 +224,10 @@ static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
   (void)aligned_like; /* UNUSED */
-  svuint32_t  gvvl = svindex_u32(0, 1);
-  gvvl = svmul_n_u32_x(svptrue_b32(), gvvl, sizeof(R)*ovs);
+  svint32_t  gvvl = svindex_s32(0, 1);
+  gvvl = svmul_n_s32_x(svptrue_b32(), gvvl, sizeof(R)*ovs);
 
-  svst1_scatter_u32offset_f32(MASKA, x, gvvl, v);
+  svst1_scatter_s32offset_f32(MASKA, x, gvvl, v);
 }
 #define STN4(x, v0, v1, v2, v3, ovs)  /* no-op */
 #else /* !FFTW_SINGLE */
@@ -238,10 +238,10 @@ static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
   (void)aligned_like; /* UNUSED */
-  svuint64_t  gvvl = svindex_u64(0, 1);
-  gvvl = svmul_n_u64_x(svptrue_b64(), gvvl, sizeof(R)*ovs);
+  svint64_t  gvvl = svindex_s64(0, 1);
+  gvvl = svmul_n_s64_x(svptrue_b64(), gvvl, sizeof(R)*ovs);
 
-  svst1_scatter_u64offset_f64(MASKA, x, gvvl, v);
+  svst1_scatter_s64offset_f64(MASKA, x, gvvl, v);
 }
 #define STN4(x, v0, v1, v2, v3, ovs)  /* no-op */
 #endif /* FFTW_SINGLE */

From 38ca2c6ebf5ee26d94a373c970d71e458b34a7ed Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet <gilles@rist.or.jp>
Date: Sat, 18 Jul 2020 16:02:37 +0900
Subject: [PATCH 06/13] sve: go brrr (simplify gather/scatter indexing)

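The mul/zip/add byte-offset computation in LDu/STu/STM4 is replaced by
a single svindex over element indices, using the scaled *index forms of
the gather/scatter intrinsics instead of the byte-offset forms.

A sketch of the new double-precision load (pg stands in for the
header's MASKA predicate):

    #include <arm_sve.h>
    static inline svfloat64_t ldu_sketch(const double *x, int64_t ivs,
                                         svbool_t pg)
    {
      svint64_t idx = svindex_s64(0, ivs);              /* 0, ivs, 2*ivs, ... */
      idx = svzip1_s64(idx, svadd_n_s64_x(pg, idx, 1)); /* 0,1, ivs,ivs+1,... */
      return svld1_gather_s64index_f64(pg, x, idx);     /* scaled by 8 bytes  */
    }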
---
 simd-support/simd-maskedsve.h | 42 +++++++++++++----------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index 606854739..e4b67590f 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -161,12 +161,9 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
 static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
-  svint32_t  gvvl = svindex_s32(0, 1);
-  gvvl = svmul_n_s32_x(MASKA, gvvl, sizeof(R)*ivs);
-  gvvl = svzip1_s32(gvvl, gvvl);
-  gvvl = svadd_s32_x(MASKA, gvvl, svdupq_n_s32(0,sizeof(R),0,sizeof(R)));
-  
-  return svld1_gather_s32offset_f32(MASKA, x, gvvl);
+  svint64_t gvvl = svindex_s64(0, ivs/2);
+
+  return svreinterpret_f32_f64(svld1_gather_s64index_f64(MASKA, (const double *)x, gvvl));
 }
 
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
@@ -175,12 +172,9 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
   if (ovs==0) { // FIXME: hack for extra_iter hack support
     v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0));
   }
-  svint32_t  gvvl = svindex_s32(0, 1);
-  gvvl = svmul_n_s32_x(MASKA, gvvl, sizeof(R)*ovs);
-  gvvl = svzip1_s32(gvvl, gvvl);
-  gvvl = svadd_s32_x(MASKA, gvvl, svdupq_n_s32(0,sizeof(R),0,sizeof(R)));
+  svint64_t gvvl = svindex_s64(0, ovs/2);
 
-  svst1_scatter_s32offset_f32(MASKA, x, gvvl, v);
+  svst1_scatter_s64index_f64(MASKA, (double *)x, gvvl, svreinterpret_f64_f32(v));
 }
 
 #else /* !FFTW_SINGLE */
@@ -189,12 +183,10 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
   (void)aligned_like; /* UNUSED */
-  svint64_t  gvvl = svindex_s64(0, 1);
-  gvvl = svmul_n_s64_x(MASKA, gvvl, sizeof(R)*ivs);
-  gvvl = svzip1_s64(gvvl, gvvl);
-  gvvl = svadd_s64_x(MASKA, gvvl, svdupq_n_s64(0,sizeof(R)));
+  svint64_t  gvvl = svindex_s64(0, ivs);
+  gvvl = svzip1_s64(gvvl, svadd_n_s64_x(MASKA, gvvl, 1));
 
-  return svld1_gather_s64offset_f64(MASKA, x, gvvl);
+  return svld1_gather_s64index_f64(MASKA, x, gvvl);
 }
 
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
@@ -203,12 +195,10 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
   if (ovs==0) { // FIXME: hack for extra_iter hack support
     v = svdupq_lane_f64(v,0);
   }
-  svint64_t  gvvl = svindex_s64(0, 1);
-  gvvl = svmul_n_s64_x(MASKA, gvvl, sizeof(R)*ovs);
-  gvvl = svzip1_s64(gvvl, gvvl);
-  gvvl = svadd_s64_x(MASKA, gvvl, svdupq_n_s64(0,sizeof(R)));
+  svint64_t  gvvl = svindex_s64(0, ovs);
+  gvvl = svzip1_s64(gvvl, svadd_n_s64_x(MASKA, gvvl, 1));
 
-  svst1_scatter_s64offset_f64(MASKA, x, gvvl, v);
+  svst1_scatter_s64index_f64(MASKA, x, gvvl, v);
 }
 
 #endif /* FFTW_SINGLE */
@@ -224,10 +214,9 @@ static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
   (void)aligned_like; /* UNUSED */
-  svint32_t  gvvl = svindex_s32(0, 1);
-  gvvl = svmul_n_s32_x(svptrue_b32(), gvvl, sizeof(R)*ovs);
+  svint32_t  gvvl = svindex_s32(0, ovs);
 
-  svst1_scatter_s32offset_f32(MASKA, x, gvvl, v);
+  svst1_scatter_s32index_f32(MASKA, x, gvvl, v);
 }
 #define STN4(x, v0, v1, v2, v3, ovs)  /* no-op */
 #else /* !FFTW_SINGLE */
@@ -238,10 +227,9 @@ static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
   (void)aligned_like; /* UNUSED */
-  svint64_t  gvvl = svindex_s64(0, 1);
-  gvvl = svmul_n_s64_x(svptrue_b64(), gvvl, sizeof(R)*ovs);
+  svint64_t  gvvl = svindex_s64(0, ovs);
 
-  svst1_scatter_s64offset_f64(MASKA, x, gvvl, v);
+  svst1_scatter_s64index_f64(MASKA, x, gvvl, v);
 }
 #define STN4(x, v0, v1, v2, v3, ovs)  /* no-op */
 #endif /* FFTW_SINGLE */

From f85c7f7e4d3a043d0a4c41f962292df8d1f78639 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Wed, 2 Sep 2020 09:19:01 -0400
Subject: [PATCH 07/13] Remove the test-and-branch on (ovs==0) in STu and
 replace it with masking. Also, improve VBYI.

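STu now selects its store predicate instead of branching on ovs, and
VBYI becomes a single 90-degree svcadd (0 + i*x == i*x) instead of two
chained svcmla.

The predicate trick as a standalone sketch (maska stands in for the
header's MASKA):

    #include <arm_sve.h>
    static inline svbool_t store_mask(long ovs, svbool_t maska)
    {
      /* an all-true pair when ovs != 0 keeps the full mask; otherwise
       * select a single-element predicate for the extra_iter store */
      svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0);
      return svsel_b(which, maska, svptrue_pat_b64(SV_VL1));
    }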
---
 simd-support/simd-maskedsve.h | 40 +++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index e4b67590f..0a1c9454b 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -94,7 +94,11 @@ typedef DS(svfloat64_t, svfloat32_t) V;
 /* FXIME: there is a better way, surely */
 /* #define VCONJ(x)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VRONE,0),x,VRONE,270) */
 #define VCONJ(x) TYPESUF(svmul,_x)(MASKA,x,VCONEMI)
+#if 0
 #define VBYI(x)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VCI,0),x,VCI,90)
+#else
+#define VBYI(x)  TYPESUF(svcadd,_x)(MASKA,VZERO,x,90)
+#endif
 
 #define VNEG(a)   TYPESUF(svneg,_x)(MASKA,a)
 #define VADD(a,b) TYPESUF(svadd,_x)(MASKA,a,b)
@@ -169,12 +173,19 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
-  if (ovs==0) { // FIXME: hack for extra_iter hack support
-    v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0));
-  }
-  svint64_t gvvl = svindex_s64(0, ovs/2);
-
-  svst1_scatter_s64index_f64(MASKA, (double *)x, gvvl, svreinterpret_f64_f32(v));
+/*   if (ovs==0) { // FIXME: hack for extra_iter hack support */
+/*     v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0)); */
+/*   } */
+  const svint64_t gvvl = svindex_s64(0, ovs/2);
+
+  /* no-branch implementation of extra_iter hack support
+   * if ovs is non-zero, keep the original MASKA;
+   * if not, only store one 64-bit element (two consecutive 32-bit values)
+   */
+  const svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0);
+  const svbool_t mask = svsel_b(which, MASKA, svptrue_pat_b64(SV_VL1));
+
+  svst1_scatter_s64index_f64(mask, (double *)x, gvvl, svreinterpret_f64_f32(v));
 }
 
 #else /* !FFTW_SINGLE */
@@ -192,13 +203,20 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
-  if (ovs==0) { // FIXME: hack for extra_iter hack support
-    v = svdupq_lane_f64(v,0);
-  }
-  svint64_t  gvvl = svindex_s64(0, ovs);
+/*   if (ovs==0) { // FIXME: hack for extra_iter hack support */
+/*     v = svdupq_lane_f64(v,0); */
+/*   } */
+  svint64_t gvvl = svindex_s64(0, ovs);
   gvvl = svzip1_s64(gvvl, svadd_n_s64_x(MASKA, gvvl, 1));
 
-  svst1_scatter_s64index_f64(MASKA, x, gvvl, v);
+  /* no-branch implementation of extra_iter hack support
+   * if ovs is non-zero, keep the original MASKA;
+   * if not, only store two 64-bit elements
+   */
+  const svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0);
+  const svbool_t mask = svsel_b(which, MASKA, svptrue_pat_b64(SV_VL2));
+
+  svst1_scatter_s64index_f64(mask, x, gvvl, v);
 }
 
 #endif /* FFTW_SINGLE */

From 332007f250c4ef583d50c38f7535c027c372136f Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Wed, 9 Sep 2020 08:36:58 -0400
Subject: [PATCH 08/13] Experimental change for performance - non-masked
 ADD/SUB/MUL

ADD/SUB/MUL are three-address instructions in SVE, but the masked form
is only two-address, and FFTW3 reuses operands a lot (as does complex
arithmetic in general), so the extra register copies hurt. But ACLE/SVE
(i.e. the intrinsics) doesn't expose the non-masked form :-(
So inline asm is used to force the non-masked version.
Masked-out lanes should be mostly zero, and are never stored anyway, so
computing on them should be fine.

This one will be reverted if it's not a performance win.
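For reference, the copy the masked intrinsic costs (assumed compiler
output, shown schematically):

    /* masked intrinsic: destructive encoding, needs a copy
     *     movprfx z2, z0
     *     fadd    z2.d, p0/m, z2.d, z1.d
     * unmasked form forced by the inline asm: one instruction
     *     fadd    z2.d, z0.d, z1.d
     */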
---
 simd-support/simd-maskedsve.h | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index 0a1c9454b..415bf0c70 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -40,7 +40,6 @@
 #  define ALLA  svptrue_b64()
 #endif /* FFTW_SINGLE */
 
-//#define SIMD_SUFFIX  _sve  /* for renaming */
 #if SVE_SIZE == 2048
 #define VL DS(16, 32)        /* SIMD complex vector length */
 #define MASKA DS(svptrue_pat_b64(SV_VL32),svptrue_pat_b32(SV_VL64))
@@ -70,8 +69,19 @@
 
 typedef DS(svfloat64_t, svfloat32_t) V;
 
+/* The goal is to limit to the required width by using masking.
+ * However, some SVE instructions are limited to two-addresses
+ * rather than three adresses when masked.
+ * (i.e. they do X op= Y, not X = Z op X)
+ * Loads will put zero in masked-out value.
+ * For performance reason, we want to use non-masked for the instructions
+ * with a two-addresses masked form: add & sub.
+ * But ACLE doesn't have the non-masked form...
+ */
+
+/* do we need to mask VLIT somehow? */
 #define VLIT(re, im) DS(svdupq_n_f64(re,im),svdupq_n_f32(re,im,re,im))
-#define VLIT1(val) DS(svdup_n_f64(val), svdup_n_f32(val))
+#define VLIT1(val) TYPESUF(svdup_n,_z)(MASKA,val)
 #define LDK(x) x
 #define DVK(var, val) V var = VLIT1(val)
 #define VZERO VLIT1(DS(0.,0.f))
@@ -91,7 +101,7 @@ typedef DS(svfloat64_t, svfloat32_t) V;
 #define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x)
 #endif
 
-/* FXIME: there is a better way, surely */
+/* FIXME: there is a better way, surely */
 /* #define VCONJ(x)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VRONE,0),x,VRONE,270) */
 #define VCONJ(x) TYPESUF(svmul,_x)(MASKA,x,VCONEMI)
 #if 0
@@ -101,9 +111,27 @@ typedef DS(svfloat64_t, svfloat32_t) V;
 #endif
 
 #define VNEG(a)   TYPESUF(svneg,_x)(MASKA,a)
+#if 0
 #define VADD(a,b) TYPESUF(svadd,_x)(MASKA,a,b)
 #define VSUB(a,b) TYPESUF(svsub,_x)(MASKA,a,b)
 #define VMUL(a,b) TYPESUF(svmul,_x)(MASKA,a,b)
+#else
+static inline V VADD(const V a, const V b) {
+	V r;
+	asm("fadd %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+	return r;
+}
+static inline V VSUB(const V a, const V b) {
+	V r;
+	asm("fsub %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+	return r;
+}
+static inline V VMUL(const V a, const V b) {
+	V r;
+	asm("fmul %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+	return r;
+}
+#endif
 #define VFMA(a, b, c)  TYPESUF(svmad,_x)(MASKA,b,a,c)
 #define VFMS(a, b, c)  TYPESUF(svnmsb,_x)(MASKA,b,a,c)
 #define VFNMS(a, b, c) TYPESUF(svmsb,_x)(MASKA,b,a,c)

From b0bf4c6c20e417486355061e2390577e3055834b Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Wed, 9 Sep 2020 09:14:04 -0400
Subject: [PATCH 09/13] Make some variants selectable via #define, as their
 behavior seems to be compiler- and hardware-dependent, and more testing is
 needed before settling on defaults.

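The two knobs, both local to simd-support/simd-maskedsve.h:

    #define USE_UNMASKED_ASSEMBLY  /* inline-asm fadd/fsub/fmul (patch 08) */
    /* #define BRANCHLESS_STU */   /* predicate-select STu (patch 07)      */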
---
 simd-support/simd-maskedsve.h | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index 415bf0c70..51bcaa397 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -77,7 +77,13 @@ typedef DS(svfloat64_t, svfloat32_t) V;
  * For performance reasons, we want to use non-masked for the instructions
  * with a two-address masked form: add & sub.
  * But ACLE doesn't have the non-masked form...
+ * clang 11 & armclang 20.2 used masked form in assembly and lots of copies
+ * gcc 10 uses the non-masked form (!) and no copies
  */
+#define USE_UNMASKED_ASSEMBLY
+/* Define below to use masking instead of branching in STu
+ */
+//#define BRANCHLESS_STU
 
 /* do we need to mask VLIT somehow? */
 #define VLIT(re, im) DS(svdupq_n_f64(re,im),svdupq_n_f32(re,im,re,im))
@@ -111,7 +117,7 @@ typedef DS(svfloat64_t, svfloat32_t) V;
 #endif
 
 #define VNEG(a)   TYPESUF(svneg,_x)(MASKA,a)
-#if 0
+#if !defined(USE_UNMASKED_ASSEMBLY)
 #define VADD(a,b) TYPESUF(svadd,_x)(MASKA,a,b)
 #define VSUB(a,b) TYPESUF(svsub,_x)(MASKA,a,b)
 #define VMUL(a,b) TYPESUF(svmul,_x)(MASKA,a,b)
@@ -201,19 +207,21 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
-/*   if (ovs==0) { // FIXME: hack for extra_iter hack support */
-/*     v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0)); */
-/*   } */
   const svint64_t gvvl = svindex_s64(0, ovs/2);
-
+#if !defined(BRANCHLESS_STU)
+  if (ovs==0) { // FIXME: support for the extra_iter hack
+    v = svreinterpret_f32_f64(svdup_lane_f64(svreinterpret_f64_f32(v),0));
+  }
+  svst1_scatter_s64index_f64(MASKA, (double *)x, gvvl, svreinterpret_f64_f32(v));
+#else
   /* no-branch implementation of extra_iter hack support
    * if ovs is non-zero, keep the original MASKA;
    * if not, only store one 64-bit element (two consecutive 32-bit values)
    */
   const svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0);
   const svbool_t mask = svsel_b(which, MASKA, svptrue_pat_b64(SV_VL1));
-
   svst1_scatter_s64index_f64(mask, (double *)x, gvvl, svreinterpret_f64_f32(v));
+#endif
 }
 
 #else /* !FFTW_SINGLE */
@@ -231,12 +239,14 @@ static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
-/*   if (ovs==0) { // FIXME: hack for extra_iter hack support */
-/*     v = svdupq_lane_f64(v,0); */
-/*   } */
   svint64_t gvvl = svindex_s64(0, ovs);
   gvvl = svzip1_s64(gvvl, svadd_n_s64_x(MASKA, gvvl, 1));
-
+#if !defined(BRANCHLESS_STU)
+  if (ovs==0) { // FIXME: support for the extra_iter hack
+    v = svdupq_lane_f64(v,0);
+  }
+  svst1_scatter_s64index_f64(MASKA, x, gvvl, v);
+#else
   /* no-branch implementation of extra_iter hack support
    * if ovs is non-zero, keep the original MASKA;
   * if not, only store two 64-bit elements
@@ -245,6 +255,7 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
   const svbool_t mask = svsel_b(which, MASKA, svptrue_pat_b64(SV_VL2));
 
   svst1_scatter_s64index_f64(mask, x, gvvl, v);
+#endif
 }
 
 #endif /* FFTW_SINGLE */

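For reference, a minimal sketch (not part of the patch) of how the
BRANCHLESS_STU path builds its store predicate in the single-precision
case, where one complex float occupies one 64-bit lane; MASKA is assumed
to be the header's all-true predicate, reconstructed here with
svptrue_b64():

#include <arm_sve.h>

static inline svbool_t stu_mask(long ovs)
{
    const svbool_t maska = svptrue_b64();  /* stand-in for MASKA */
    /* all-true when ovs != 0, all-false when ovs == 0 */
    const svbool_t which = svdupq_n_b64(ovs != 0, ovs != 0);
    /* full mask for a normal store, or a single 64-bit lane (one
       complex float) for the extra_iter case; the double-precision
       path uses SV_VL2 analogously */
    return svsel_b(which, maska, svptrue_pat_b64(SV_VL1));
}
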
From e3150025ade7e4bc5d6af821ab8e849e48ff6a3e Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Thu, 4 Mar 2021 13:26:09 +0100
Subject: [PATCH 10/13] oops, add missing ASM for SP (single precision)

---
 simd-support/simd-maskedsve.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index 51bcaa397..9c5be42ce 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -124,17 +124,29 @@ typedef DS(svfloat64_t, svfloat32_t) V;
 #else
 static inline V VADD(const V a, const V b) {
 	V r;
+#ifdef FFTW_SINGLE
+	asm("fadd %[r].s, %[a].s, %[b].s\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+#else
 	asm("fadd %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+#endif
 	return r;
 }
 static inline V VSUB(const V a, const V b) {
 	V r;
+#ifdef FFTW_SINGLE
+	asm("fsub %[r].s, %[a].s, %[b].s\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+#else
 	asm("fsub %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+#endif
 	return r;
 }
 static inline V VMUL(const V a, const V b) {
 	V r;
+#ifdef FFTW_SINGLE
+	asm("fmul %[r].s, %[a].s, %[b].s\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+#else
 	asm("fmul %[r].d, %[a].d, %[b].d\n" : [r]"=w"(r) : [a]"w"(a), [b]"w"(b));
+#endif
 	return r;
 }
 #endif

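The duplication above could also be avoided with string pasting; a hedged
sketch of that alternative (the names V and vadd are illustrative, and
FFTW_SINGLE selects the precision as elsewhere in this patch):

#include <arm_sve.h>

#ifdef FFTW_SINGLE
typedef svfloat32_t V;
#define LANE "s"   /* 32-bit lanes: fadd z.s */
#else
typedef svfloat64_t V;
#define LANE "d"   /* 64-bit lanes: fadd z.d */
#endif

static inline V vadd(const V a, const V b)
{
    V r;
    /* the lane suffix must match the element type: using .d on
       svfloat32_t data would add pairs of floats as doubles */
    asm("fadd %[r]." LANE ", %[a]." LANE ", %[b]." LANE
        : [r] "=w"(r) : [a] "w"(a), [b] "w"(b));
    return r;
}
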
From eefa1b44b279100e7e994d047c4c102131e31cc1 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Fri, 5 Mar 2021 09:10:15 +0100
Subject: [PATCH 11/13] disable USE_UNMASKED_ASSEMBLY by default so it can be
 re-enabled from the command line

---
 simd-support/simd-maskedsve.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index 9c5be42ce..ca7cccc00 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -80,7 +80,7 @@ typedef DS(svfloat64_t, svfloat32_t) V;
  * clang 11 & armclang 20.2 use the masked form in assembly, with lots of copies;
  * gcc 10 uses the non-masked form (!) with no copies
  */
-#define USE_UNMASKED_ASSEMBLY
+//#define USE_UNMASKED_ASSEMBLY
 /* Define the macro below to use masking instead of branching in STu.
  */
 //#define BRANCHLESS_STU

From 8cb2fbd8f4623d6c9d66a03f7684781ee9158622 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Mon, 4 Mar 2024 10:10:14 +0000
Subject: [PATCH 12/13] improve VZMULI[J], clean up old code

---
 simd-support/simd-maskedsve.h | 31 ++++++-------------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/simd-support/simd-maskedsve.h b/simd-support/simd-maskedsve.h
index ca7cccc00..d21ff6cdb 100644
--- a/simd-support/simd-maskedsve.h
+++ b/simd-support/simd-maskedsve.h
@@ -101,20 +101,14 @@ typedef DS(svfloat64_t, svfloat32_t) V;
 #define VDUPH(x) TYPE(svtrn2)(x,x)
 
 #ifdef FFTW_SINGLE
-//#define FLIP_RI(x) svreinterpret_f32_u64(svrevw_u64_x(MASKA,svreinterpret_u64_f32(x)))
 #define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x)
 #else
 #define FLIP_RI(x) TYPE(svtrn1)(VDUPH(x),x)
 #endif
 
-/* FIXME: there is a better way, surely */
-/* #define VCONJ(x)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VRONE,0),x,VRONE,270) */
+/* there might be a better way */
 #define VCONJ(x) TYPESUF(svmul,_x)(MASKA,x,VCONEMI)
-#if 0
-#define VBYI(x)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,x,VCI,0),x,VCI,90)
-#else
 #define VBYI(x)  TYPESUF(svcadd,_x)(MASKA,VZERO,x,90)
-#endif
 
 #define VNEG(a)   TYPESUF(svneg,_x)(MASKA,a)
 #if !defined(USE_UNMASKED_ASSEMBLY)
@@ -155,26 +149,14 @@ static inline V VMUL(const V a, const V b) {
 #define VFNMS(a, b, c) TYPESUF(svmsb,_x)(MASKA,b,a,c)
 #define VFMAI(b, c)    TYPESUF(svcadd,_x)(MASKA,c,b,90)
 #define VFNMSI(b, c)   TYPESUF(svcadd,_x)(MASKA,c,b,270)
-/* FIXME: next 3 overkill ? */
-#if 0
-#define VFMACONJ(b,c)  TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,c,b,VRONE,0),b,VRONE,270)
-#else
-/* Use inline functions instead of macros to avoid replicating inputs */
+
 static inline V VFMACONJ(V b, V c) {
 	V m = TYPESUF(svcmla,_x)(MASKA,c,b,VRONE,0);
 	return TYPESUF(svcmla,_x)(MASKA,m,b,VRONE,270);
 }
-#endif
 #define VFMSCONJ(b,c)  VFMACONJ(b,VNEG(c))
 #define VFNMSCONJ(b,c) VNEG(VFMSCONJ(b,c))
 
-#if 0
-#define VZMUL(a,b)    TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0),a,b,90)
-#define VZMULJ(a,b)   TYPESUF(svcmla,_x)(MASKA,TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0),a,b,270)
-#define VZMULI(a,b)   VZMUL(VCI,VZMUL(a,b))
-#define VZMULIJ(a,b)   VZMUL(VCI,VZMULJ(a,b))
-#else
-/* Use inline functions instead of macros to avoid replicating inputs */
 static inline V VZMUL(V a, V b) {
 	V m = TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0);
 	return TYPESUF(svcmla,_x)(MASKA,m,a,b,90);
@@ -183,17 +165,16 @@ static inline V VZMULJ(V a, V b) {
         V m = TYPESUF(svcmla,_x)(MASKA,VZERO,a,b,0);
         return TYPESUF(svcmla,_x)(MASKA,m,a,b,270);
 }
-/* FIXME: there's probably a better way */
+/* there might be a better way */
 static inline V VZMULI(V a, V b) {
 	V m = VZMUL(a,b);
-	return VZMUL(VCI,m);
+	return VFMAI(m, VZERO);
 }
-/* FIXME: there's probably a better way */
+/* there might be a better way */
 static inline V VZMULIJ(V a, V b) {
 	V m = VZMULJ(a,b);
-	return VZMUL(VCI,m);
+	return VFMAI(m, VZERO);
 }
-#endif
 
 static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
   (void)aligned_like; /* UNUSED */

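The VZMULI[J] improvement works because svcadd with a 90-degree rotation
computes c + i*b on each complex pair, so feeding it a zero accumulator
multiplies by i in a single instruction, replacing the former
VZMUL(VCI, m) full complex multiply. A minimal double-precision sketch
(not part of the patch; mul_by_i is an illustrative name):

#include <arm_sve.h>

static inline svfloat64_t mul_by_i(svbool_t pg, svfloat64_t m)
{
    const svfloat64_t zero = svdup_n_f64(0.0);
    return svcadd_f64_x(pg, zero, m, 90);  /* 0 + i*m == i*m */
}
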
From ff3dfb01f038a83ec274ae3a88902ef0fa4de1c5 Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet <gilles@rist.or.jp>
Date: Tue, 16 Apr 2024 15:09:01 +0900
Subject: [PATCH 13/13] try building a sample SVE program

When configured with --enable-sve, try to build a sample SVE program
and abort on failure; otherwise configure succeeds but make will fail.
---
 configure.ac  | 10 ++++++++--
 m4/acx_sve.m4 | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 m4/acx_sve.m4

diff --git a/configure.ac b/configure.ac
index e6fd591bf..24f9735a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -237,11 +237,10 @@ AM_CONDITIONAL(HAVE_GENERIC_SIMD256, test "$have_generic_simd256" = "yes")
 
 AC_ARG_ENABLE(sve, [AC_HELP_STRING([--enable-sve],[enable ARM SVE optimizations])], have_sve=$enableval, have_sve=no)
 if test "$have_sve" = "yes"; then
-        AC_DEFINE(HAVE_SVE,1,[Define to enable ARM SVE optimizations.])
+   AC_DEFINE(HAVE_SVE,1,[Define to enable ARM SVE optimizations])
 fi
 AM_CONDITIONAL(HAVE_SVE, test "$have_sve" = "yes")
 
-
 dnl FIXME:
 dnl AC_ARG_ENABLE(mips-ps, [AS_HELP_STRING([--enable-mips-ps],[enable MIPS pair-single optimizations])], have_mips_ps=$enableval, have_mips_ps=no)
 dnl if test "$have_mips_ps" = "yes"; then
@@ -683,6 +682,13 @@ if test "$enable_openmp" = "yes"; then
    AX_OPENMP([], [AC_MSG_ERROR([don't know how to enable OpenMP])])
 fi
 
+if test "$have_sve" = "yes"; then
+   ACX_SVE([sve_ok=yes], [sve_ok=no])
+   if test "$sve_ok" != "yes"; then
+      AC_MSG_ERROR([Cannot build a SVE program, aborting])
+   fi
+fi
+
 AC_ARG_ENABLE(threads, [AS_HELP_STRING([--enable-threads],[compile FFTW SMP threads library])], enable_threads=$enableval, enable_threads=no)
 
 if test "$enable_threads" = "yes"; then
diff --git a/m4/acx_sve.m4 b/m4/acx_sve.m4
new file mode 100644
index 000000000..6a981fe8e
--- /dev/null
+++ b/m4/acx_sve.m4
@@ -0,0 +1,26 @@
+dnl @synopsis ACX_SVE([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+dnl @summary figure out whether a simple SVE program can be compiled
+dnl @category InstalledPackages
+dnl
+dnl This macro tries to compile a simple SVE program that uses
+dnl the ACLE SVE extensions.
+dnl
+dnl ACTION-IF-FOUND is a list of shell commands to run if a SVE
+dnl program can be compiled, and ACTION-IF-NOT-FOUND is a list of commands
+dnl to run if it cannot.
+dnl
+dnl @version 2024-04-15
+dnl @license GPLWithACException
+dnl @author Gilles Gouaillardet <gilles@rist.or.jp>
+
+AC_DEFUN([ACX_SVE], [
+
+   AC_MSG_CHECKING([whether a SVE program can be compiled])
+   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>]],
+      [[#if defined(__GNUC__) && !defined(__ARM_FEATURE_SVE)
+#error compiling without SVE support
+#endif]])],[AC_MSG_RESULT([yes])
+            $1],
+       [AC_MSG_RESULT([no])
+        $2])
+])dnl ACX_SVE
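
For clarity, the program this macro asks the compiler to build is roughly
the following (a reconstruction of what the AC_LANG_PROGRAM call expands
to, not captured verbatim):

#include <arm_sve.h>

int main(void)
{
#if defined(__GNUC__) && !defined(__ARM_FEATURE_SVE)
#error compiling without SVE support
#endif
    return 0;
}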