From 51410cfac69bfecbd92146f9cf3f234f53323137 Mon Sep 17 00:00:00 2001
From: Laurent Desnogues
Date: Wed, 14 Aug 2024 14:22:23 +0200
Subject: [PATCH] Remove warnings from radix36_ditN_cy_dif1.c.

---
 src/radix36_ditN_cy_dif1.c    | 173 +++++++++++++++++++++++++++-------
 src/radix36_main_carry_loop.h |  29 +++---
 2 files changed, 154 insertions(+), 48 deletions(-)

diff --git a/src/radix36_ditN_cy_dif1.c b/src/radix36_ditN_cy_dif1.c
index 3f6b75e4..477d06af 100755
--- a/src/radix36_ditN_cy_dif1.c
+++ b/src/radix36_ditN_cy_dif1.c
@@ -141,19 +141,37 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 !   storage scheme, and radix8_ditN_cy_dif1 for details on the reduced-length weights array scheme.
 */
 	const char func[] = "radix36_ditN_cy_dif1";
+  #if !defined(MULTITHREAD) && defined(USE_SSE2)
 	const int pfetch_dist = PFETCH_DIST;
+  #endif
 	const int stride = (int)RE_IM_STRIDE << 1;	// main-array loop stride = 2*RE_IM_STRIDE
 #if !defined(USE_SSE2) || (COMPACT_OBJ != 0)
 	static uint32 p0123[4];
 #endif
 	static double wts_mult[2], inv_mult[2];	// Const wts-multiplier and 2*(its multiplicative inverse)
+  #if !defined(MULTITHREAD) && !defined(USE_SSE2)
 	double wt_re,wt_im, wi_re,wi_im;	// Fermat-mod/LOACC weights stuff, used in both scalar and SIMD mode
+  #endif
 #ifdef USE_AVX512
 	const int jhi_wrap = 15;
 #else
 	const int jhi_wrap = 7;
 #endif
-	int NDIVR,i,incr,j,j1,j2,jt,jp,jstart,jhi,full_pass,k,khi,l,ntmp,outer,nbytes;
+	int NDIVR,i,j,j1,jt,jhi,full_pass,khi,l,outer;
+  #if !defined(MULTITHREAD) && !defined(USE_SSE2)
+	int j2,jp;
+  #endif
+  #if !defined(MULTITHREAD) && (!defined(USE_SSE2) || defined(USE_AVX512))
+	int ntmp;
+  #endif
+  #ifndef MULTITHREAD
+	int jstart;
+  #endif
+  #ifdef USE_SSE2
+	int nbytes;
+  #endif
+  #if !defined(MULTITHREAD) && defined(USE_SSE2)
+	int incr;
 // incr = Carry-chain wts-multipliers recurrence length, which must divide
 // RADIX/[n-wayness of carry macro], e.g. RADIX/[16|8|4] = --|--|9 for avx512,avx,sse, respectively:
 	const int incr_long = 9,incr_med = 3, incr_short = 3;
@@ -172,6 +190,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		incr = incr_short;
 	else
 		incr = incr_hiacc;
+  #endif

 	// Jun 2018: Add support for residue shift. (Only LL-test needs intervention at carry-loop level).
 	int target_idx = -1, target_set = 0,tidx_mod_stride;
@@ -186,43 +205,56 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		s   = 0.64278760968653932631,	/* sin(2*pi/9) */
 		c2  = 0.17364817766693034887,	/* cos(2*u) */
 		s2  = 0.98480775301220805936,	/* sin(2*u) */
-		c3  = -0.50000000000000000000,	/* cos(3*u) */
 		c3m1= -1.50000000000000000000,	/* cos(3*u)-1 */
 		s3  = 0.86602540378443864677,	/* sin(3*u) */
 		c4  = -0.93969262078590838404,	/* cos(4*u) */
 		s4  = 0.34202014332566873307;	/* sin(4*u) */
 #endif
+  #ifndef MULTITHREAD
 	double *addr;
+  #endif
 	static double radix_inv, n2inv;
 	double scale, dtmp, maxerr = 0.0;
 	// Local storage: We must use an array here because scalars have no guarantees about relative address offsets
 	// [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
 	// to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
-	struct complex t[RADIX], *tptr;
-	int *itmp,*itm2;	// Pointer into the bjmodn array
+	struct complex t[RADIX];
+  #if !defined(MULTITHREAD) && !defined(USE_SSE2)
+	struct complex *tptr;
+  #endif
+  #ifndef MULTITHREAD
+	int *itmp;	// Pointer into the bjmodn array
+  #endif
+  #if !defined(MULTITHREAD) && defined(USE_AVX) && !defined(USE_AVX512)
+	int *itm2;	// Pointer into the bjmodn array
+  #endif
 	int err;
 	static int first_entry=TRUE;

 /*...stuff for the reduced-length DWT weights array is here: */
 	int n_div_nwt;
+  #ifndef MULTITHREAD
 	int col,co2,co3;
-  #ifdef USE_AVX512
+   #ifdef USE_AVX512
 	double t0,t1,t2,t3;
 	static struct uint32x8 *n_minus_sil,*n_minus_silp1,*sinwt,*sinwtm1;
-  #elif defined(USE_AVX)
+   #elif defined(USE_AVX)
 	static struct uint32x4 *n_minus_sil,*n_minus_silp1,*sinwt,*sinwtm1;
-  #else
+   #else
 	int n_minus_sil,n_minus_silp1,sinwt,sinwtm1;
 	double wtl,wtlp1,wtn,wtnm1;	/* Mersenne-mod weights stuff */
-  #endif
-  #ifdef USE_AVX2
+   #endif
+   #ifdef USE_AVX2
 	// Due to GCC macro argc limit of 30, to enable 16-register data-doubled version of the radix-9 macros need 2 length-9 ptr arrays:
 	vec_dbl *rad9_iptr[9], *rad9_optr[9];
+   #endif
   #endif

 #ifdef USE_SSE2

+  #if !defined(MULTITHREAD) && defined(USE_AVX512)
 	int i0,i1,i2,i3;
+  #endif
 	static int cslots_in_local_store;
 	static vec_dbl *sc_arr = 0x0, *sc_ptr;
 	static uint64 *sm_ptr, *sign_mask, *sse_bw, *sse_sw, *sse_n;
@@ -231,20 +263,37 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
   #ifdef MULTITHREAD
 	static vec_dbl *__r0;	// Base address for discrete per-thread local stores
   #else
-	double *add0, *add1, *add2, *add3, *add4, *add5, *add6, *add7, *add8;	/* Addresses into array sections */
+	double *add0, *add1, *add2, *add3;	/* Addresses into array sections */
   #endif
+  #ifndef MULTITHREAD
 	static int *bjmodn;	// Alloc mem for this along with other SIMD stuff
+  #endif
+  #ifndef USE_AVX512
 	const double crnd = 3.0*0x4000000*0x2000000;
+  #endif
+  #ifndef USE_AVX
 	struct complex *ctmp;	// Hybrid AVX-DFT/SSE2-carry scheme used for Mersenne-mod needs a 2-word-double pointer
-	vec_dbl *tmp,*tm1,*tm2;	// Non-static utility ptrs
+  #endif
+	vec_dbl *tmp,*tm2;	// Non-static utility ptrs
+  #ifndef MULTITHREAD
+	vec_dbl *tm1;	// Non-static utility ptrs
+  #endif
 	static vec_dbl *two,*one, *cc1, *ss1, *cc2, *ss2, *cc3m1, *ss3, *cc4, *ss4, *max_err, *sse2_rnd, *half_arr
-		,*r00,*r02,*r04,*r06,*r08,*r0a,*r0c,*r0e,*r0g
+		,*r00
+  #if !defined(MULTITHREAD)
+   #if !COMPACT_OBJ
+		,*r02,*r04,*r06,*r08,*r0a,*r0c,*r0e,*r0g
 		,*r10,*r12,*r14,*r16,*r18,*r1a,*r1c,*r1e,*r1g
 		,*r20,*r22,*r24,*r26,*r28,*r2a,*r2c,*r2e,*r2g
 		,*r30,*r32,*r34,*r36,*r38,*r3a,*r3c,*r3e,*r3g
-		,*s1p00,*s1p01,*s1p02,*s1p03,*s1p04,*s1p05,*s1p06,*s1p07,*s1p08,*s1p09,*s1p10,*s1p11,*s1p12,*s1p13,*s1p14,*s1p15,*s1p16,*s1p17
+   #endif
+		,*s1p00
+   #if !COMPACT_OBJ
+		,*s1p01,*s1p02,*s1p03,*s1p04,*s1p05,*s1p06,*s1p07,*s1p08,*s1p09,*s1p10,*s1p11,*s1p12,*s1p13,*s1p14,*s1p15,*s1p16,*s1p17
 		,*s1p18,*s1p19,*s1p20,*s1p21,*s1p22,*s1p23,*s1p24,*s1p25,*s1p26,*s1p27,*s1p28,*s1p29,*s1p30,*s1p31,*s1p32,*s1p33,*s1p34,*s1p35
+   #endif
+  #endif
 		,*cy;	// Need RADIX/2 slots for sse2 carries, RADIX/4 for avx

 #endif

@@ -253,7 +302,10 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	static struct cy_thread_data_t *tdat = 0x0;
 	// Threadpool-based dispatch stuff:
-	static int main_work_units = 0, pool_work_units = 0;
+  #if 0//def OS_TYPE_MACOSX
+	static int main_work_units = 0;
+  #endif
+	static int pool_work_units = 0;
 	static struct threadpool *tpool = 0x0;
 	static int task_is_blocking = TRUE;
 	static thread_control_t thread_control = {0,0,0};
@@ -297,8 +349,10 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}

+  #ifndef MULTITHREAD
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
 	col=co2=co3=-1;
+  #endif
 	// Jan 2018: To support PRP-testing, read the LR-modpow-scalar-multiply-needed bit for the current iteration from the global array:
 	double prp_mult = 1.0;
 	if((TEST_TYPE & 0xfffffffe) == TEST_TYPE_PRP) {	// Mask off low bit to lump together PRP and PRP-C tests
@@ -476,7 +530,9 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		__r0 = sc_ptr;
 	#endif
 	tmp = sc_ptr;	tm2 = tmp + 0x48;
-	r00 = tmp + 0x00;	s1p00 = tm2 + 0x00;
+	r00 = tmp + 0x00;
+  #ifndef MULTITHREAD
+	s1p00 = tm2 + 0x00;
  #if !COMPACT_OBJ
 	r02 = tmp + 0x02;	s1p01 = tm2 + 0x02;
 	r04 = tmp + 0x04;	s1p02 = tm2 + 0x04;
@@ -514,6 +570,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	r3e = tmp + 0x44;	s1p34 = tm2 + 0x44;
 	r3g = tmp + 0x46;	s1p35 = tm2 + 0x46;
  #endif
+  #endif
 	tmp += 0x92;	// Extra 2 slots here for two,one below - added those late, too lazy to rejigger all the existing offsets following
 	two = tmp - 2;	// AVX+ versions of Radix-32 DFT macros assume consts 2.0,1.0,sqrt2,isrt2 laid out thusly
 	one = tmp - 1;
@@ -786,16 +843,20 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	nbytes = 4 << L2_SZ_VD;

 #ifdef USE_AVX512
+  #ifndef MULTITHREAD
 	n_minus_sil   = (struct uint32x8 *)sse_n + 1;
 	n_minus_silp1 = (struct uint32x8 *)sse_n + 2;
 	sinwt         = (struct uint32x8 *)sse_n + 3;
 	sinwtm1       = (struct uint32x8 *)sse_n + 4;
+  #endif
 	nbytes += 128;
 #elif defined(USE_AVX)
+  #ifndef MULTITHREAD
 	n_minus_sil   = (struct uint32x4 *)sse_n + 1;
 	n_minus_silp1 = (struct uint32x4 *)sse_n + 2;
 	sinwt         = (struct uint32x4 *)sse_n + 3;
 	sinwtm1       = (struct uint32x4 *)sse_n + 4;
+  #endif
 	nbytes += 64;
 #endif

@@ -808,10 +869,12 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	}

 	// For large radices, array-access to bjmodn means only init base-ptr here:
-	#ifdef USE_AVX
+  #ifndef MULTITHREAD
+	#ifdef USE_AVX
 		bjmodn = (int*)(sinwtm1 + RE_IM_STRIDE);
-	#else
+	#else
 		bjmodn = (int*)(sse_n + RE_IM_STRIDE);
+	#endif
 	#endif

 #endif	// USE_SSE2

@@ -1500,7 +1563,6 @@ void radix36_dif_pass1(double a[], int n)
 		s   = 0.64278760968653932631,	/* sin(2*pi/9) */
 		c2  = 0.17364817766693034887,	/* cos(2*u) */
 		s2  = 0.98480775301220805936,	/* sin(2*u) */
-		c3  = -0.50000000000000000000,	/* cos(3*u) */
 		c3m1= -1.50000000000000000000,	/* cos(3*u)-1 */
 		s3  = 0.86602540378443864677,	/* sin(3*u) */
 		c4  = -0.93969262078590838404,	/* cos(4*u) */
@@ -1630,11 +1692,10 @@ void radix36_dit_pass1(double a[], int n)
 		c2  = 0.17364817766693034887,	/* cos(2*u) */
 		s2  = 0.98480775301220805936,	/* sin(2*u) */
 		c3  = -0.50000000000000000000,	/* cos(3*u) */
-		c3m1= -1.50000000000000000000,	/* cos(3*u)-1 */
 		s3  = 0.86602540378443864677,	/* sin(3*u) */
 		c4  = -0.93969262078590838404,	/* cos(4*u) */
 		s4  = 0.34202014332566873307;	/* sin(4*u) */
-	double rt,it,re;
+	double rt,it;
 	struct complex t[RADIX], *tptr;

 	if(!first_entry && (n/RADIX) != NDIVR)	/* New runlength? */
@@ -1745,11 +1806,21 @@ void radix36_dit_pass1(double a[], int n)
 	{
 		struct cy_thread_data_t* thread_arg = targ;	// Move to top because scalar-mode carry pointers taken directly from it
 		double *addr;
+	#ifdef USE_SSE2
 		const int pfetch_dist = PFETCH_DIST;
+	#endif
 		const int stride = (int)RE_IM_STRIDE << 1;	// main-array loop stride = 2*RE_IM_STRIDE
 		uint32 p01,p02,p03,p04,p08,p12,p16,p20,p24,p28,p32;
 		int poff[RADIX>>2];	// Store mults of p04 offset for loop control
-		int incr,j,j1,j2,jt,jp,k,l,ntmp;
+		int j,j1,l;
+	#ifndef USE_SSE2
+		int j2,jt,jp;
+	#endif
+	#if !defined(USE_SSE2) || defined(USE_AVX512)
+		int ntmp;
+	#endif
+	#ifdef USE_SSE2
+		int incr;
 	// incr = Carry-chain wts-multipliers recurrence length, which must divide
 	// RADIX/[n-wayness of carry macro], e.g. RADIX/[16|8|4] = --|--|9 for avx512,avx,sse, respectively:
 		const int incr_long = 9,incr_med = 3, incr_short = 3;
@@ -1768,8 +1839,11 @@ void radix36_dit_pass1(double a[], int n)
 			incr = incr_short;
 		else
 			incr = incr_hiacc;
+	#endif

+	#ifndef USE_AVX
 		double wtl,wtlp1,wtn,wtnm1;	/* Mersenne-mod weights stuff */
+	#endif
 	#ifdef USE_AVX512
 		double t0,t1,t2,t3;
 		struct uint32x8 *n_minus_sil,*n_minus_silp1,*sinwt,*sinwtm1;
@@ -1782,27 +1856,52 @@ void radix36_dit_pass1(double a[], int n)
 		// Due to GCC macro argc limit of 30, to enable 16-register data-doubled version of the radix-9 macros need 2 length-9 ptr arrays:
 		vec_dbl *rad9_iptr[9], *rad9_optr[9];
 	#endif
+	#ifndef USE_SSE2
 		double wt_re,wt_im, wi_re,wi_im;	// Fermat-mod/LOACC weights stuff, used in both scalar and SIMD mode
+	#endif
 	#if !defined(USE_SSE2) || (COMPACT_OBJ != 0)
 		uint32 p0123[4];
 	#endif

 	#ifdef USE_SSE2
+	  #ifdef USE_AVX512
 		int i0,i1,i2,i3;
+	  #else
 		const double crnd = 3.0*0x4000000*0x2000000;
-		int *itmp,*itm2;	// Pointer into the bjmodn array
+	  #endif
+		int *itmp;	// Pointer into the bjmodn array
+	  #if defined(USE_AVX) && !defined(USE_AVX512)
+		int *itm2;	// Pointer into the bjmodn array
+	  #endif
+	  #ifndef USE_AVX
 		struct complex *ctmp;	// Hybrid AVX-DFT/SSE2-carry scheme used for Mersenne-mod needs a 2-word-double pointer
+	  #endif
 		double *add0, *add1, *add2, *add3;	/* Addresses into array sections */
 		int *bjmodn;	// Alloc mem for this along with other SIMD stuff
-		vec_dbl *two,*one, *cc1, *ss1, *cc2, *ss2, *cc3m1, *ss3, *cc4, *ss4, *max_err, *sse2_rnd, *half_arr
-			,*r00,*r02,*r04,*r06,*r08,*r0a,*r0c,*r0e,*r0g
+		vec_dbl
+	  #ifdef USE_AVX2
+			*two,
+	  #endif
+			/* *one, */ *cc1, /* *ss1, *cc2, *ss2, *cc3m1, *ss3, *cc4, *ss4, */ *max_err, *half_arr
+	  #ifndef USE_AVX512
+			,*sse2_rnd
+	  #endif
+			,*r00
+	  #if !COMPACT_OBJ
+			,*r02,*r04,*r06,*r08,*r0a,*r0c,*r0e,*r0g
 			,*r10,*r12,*r14,*r16,*r18,*r1a,*r1c,*r1e,*r1g
 			,*r20,*r22,*r24,*r26,*r28,*r2a,*r2c,*r2e,*r2g
 			,*r30,*r32,*r34,*r36,*r38,*r3a,*r3c,*r3e,*r3g
-			,*s1p00,*s1p01,*s1p02,*s1p03,*s1p04,*s1p05,*s1p06,*s1p07,*s1p08,*s1p09,*s1p10,*s1p11,*s1p12,*s1p13,*s1p14,*s1p15,*s1p16,*s1p17
+	  #endif
+			,*s1p00
+	  #if !COMPACT_OBJ
+			,*s1p01,*s1p02,*s1p03,*s1p04,*s1p05,*s1p06,*s1p07,*s1p08,*s1p09,*s1p10,*s1p11,*s1p12,*s1p13,*s1p14,*s1p15,*s1p16,*s1p17
 			,*s1p18,*s1p19,*s1p20,*s1p21,*s1p22,*s1p23,*s1p24,*s1p25,*s1p26,*s1p27,*s1p28,*s1p29,*s1p30,*s1p31,*s1p32,*s1p33,*s1p34,*s1p35
+	  #endif
 			,*cy;	// Need RADIX/2 slots for sse2 carries, RADIX/4 for avx
 		vec_dbl *tmp,*tm1,*tm2;	// Non-static utility ptrs
+	  #ifndef USE_AVX512
 		double dtmp;
+	  #endif
 		uint64 *sign_mask, *sse_bw, *sse_sw, *sse_n;
 	#else
@@ -1811,7 +1910,6 @@ void radix36_dit_pass1(double a[], int n)
 		s   = 0.64278760968653932631,	/* sin(2*pi/9) */
 		c2  = 0.17364817766693034887,	/* cos(2*u) */
 		s2  = 0.98480775301220805936,	/* sin(2*u) */
-		c3  = -0.50000000000000000000,	/* cos(3*u) */
 		c3m1= -1.50000000000000000000,	/* cos(3*u)-1 */
 		s3  = 0.86602540378443864677,	/* sin(3*u) */
 		c4  = -0.93969262078590838404,	/* cos(4*u) */
@@ -1828,7 +1926,6 @@ void radix36_dit_pass1(double a[], int n)
 	#endif	// SIMD or scalar?

 	// int data:
-		int iter = thread_arg->iter;
 		int NDIVR = thread_arg->ndivr;
 		int n = NDIVR*RADIX;
 		int target_idx = thread_arg->target_idx;
@@ -1846,7 +1943,7 @@ void radix36_dit_pass1(double a[], int n)

 	// double data:
 		double maxerr = thread_arg->maxerr;
-		double scale = thread_arg->scale;	int full_pass = scale < 0.5;
+		double scale = thread_arg->scale;
 		double prp_mult = thread_arg->prp_mult;

 	// pointer data:
@@ -1931,16 +2028,18 @@ void radix36_dit_pass1(double a[], int n)
 		r3g = tmp + 0x46;	s1p35 = tm2 + 0x46;
 	#endif
 		tmp += 0x92;	// Extra 2 slots here for two,one below - added those late, too lazy to rejigger all the existing offsets following
+	#ifdef USE_AVX2
 		two   = tmp - 2;	// AVX+ versions of Radix-32 DFT macros assume consts 2.0,1.0,sqrt2,isrt2 laid out thusly
-		one   = tmp - 1;
+	#endif
+		//one   = tmp - 1;
 		cc1   = tmp + 0;
-		ss1   = tmp + 1;
-		cc2   = tmp + 2;
-		ss2   = tmp + 3;
-		cc3m1 = tmp + 4;
-		ss3   = tmp + 5;
-		cc4   = tmp + 6;
-		ss4   = tmp + 7;
+		//ss1   = tmp + 1;
+		//cc2   = tmp + 2;
+		//ss2   = tmp + 3;
+		//cc3m1 = tmp + 4;
+		//ss3   = tmp + 5;
+		//cc4   = tmp + 6;
+		//ss4   = tmp + 7;
 		tmp += 0x8;
 	#ifdef USE_AVX512
 		cy = tmp;	tmp += 5;	// RADIX/8 and round up
@@ -1950,7 +2049,9 @@ void radix36_dit_pass1(double a[], int n)
 		cy = tmp;	tmp += 18;
 	#endif
 		max_err = tmp + 0x00;
+	#ifndef USE_AVX512
 		sse2_rnd= tmp + 0x01;
+	#endif
 		half_arr= tmp + 0x02;	/* This table needs 20x16 bytes */

 		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
diff --git a/src/radix36_main_carry_loop.h b/src/radix36_main_carry_loop.h
index faf450f2..ba58d923 100755
--- a/src/radix36_main_carry_loop.h
+++ b/src/radix36_main_carry_loop.h
@@ -23,7 +23,7 @@
 // This main loop is same for un-and-multithreaded, so stick into a header file
 // (can't use a macro because of the #if-enclosed stuff).

-for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions... */
+for(int k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions... */
 {
 /* In SIMD mode, data are arranged in [re_0,...,re_n-1,im_0,...,im_n-1] groups, not the usual [re_0,im_0],...,[re_n-1,im_n-1] pairs.
 	Thus we can still increment the j-index as if stepping through the residue array-of-doubles in strides of 2,
@@ -33,7 +33,9 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions... */
 	for(j = jstart; j < jhi; j += stride)
 	{
 		j1 = j + ( (j >> DAT_BITS) << PAD_BITS );	/* padded-array fetch index is here */
+	#ifndef USE_SSE2
 		j2 = j1 + RE_IM_STRIDE;
+	#endif

 	/*...The radix-36 DIT pass is here:	*/

@@ -75,8 +77,8 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions... */
 			*vb0,*vb1,*vb2,*vb3,*vb4,*vb5,*vb6,*vb7,*vb8;	// O-ptrs
 	  #ifdef USE_AVX2
 		// Due to GCC macro argc limit of 30, to enable 16-register data-doubled version of the radix-9 macros need 2 length-9 ptr arrays:
-		tm1 = rad9_iptr;	// Stash head-of-array-ptrs in tmps to workaround GCC's "not directly addressable" macro arglist stupidity
-		tm2 = rad9_optr;
+		tm1 = (vec_dbl *)rad9_iptr;	// Stash head-of-array-ptrs in tmps to workaround GCC's "not directly addressable" macro arglist stupidity
+		tm2 = (vec_dbl *)rad9_optr;
 		for(l = 0, tmp = r00, ntmp = 0; l < 2; l++, ntmp += 18) {
 	  #else
 		for(l = 0, tmp = r00, ntmp = 0; l < 4; l++, ntmp += 9) {
@@ -134,8 +136,8 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions... */
 	/* Radix-9 DFT uses adjacent temps, i.e. stride = 2*16 bytes: */
 	  #ifdef USE_AVX2
 		// Due to GCC macro argc limit of 30, to enable 16-register data-doubled version of the radix-9 macros need 2 length-9 ptr arrays:
-		tm1 = rad9_iptr;	// Stash head-of-array-ptrs in tmps to workaround GCC's "not directly addressable" macro arglist stupidity
-		tm2 = rad9_optr;
+		//tm1 = rad9_iptr;	// Stash head-of-array-ptrs in tmps to workaround GCC's "not directly addressable" macro arglist stupidity
+		//tm2 = rad9_optr;
 		// Pointer patterns here same as for DIF, just need to swap I/O by reversing order of tm1,tm2 --> tm2,tm1 in macro arglists:
 		rad9_iptr[0] = s1p27; rad9_iptr[1] = s1p23; rad9_iptr[2] = s1p19; rad9_iptr[3] = s1p15; rad9_iptr[4] = s1p11; rad9_iptr[5] = s1p07; rad9_iptr[6] = s1p03; rad9_iptr[7] = s1p35; rad9_iptr[8] = s1p31;
 		rad9_optr[0] = r10; rad9_optr[1] = r12; rad9_optr[2] = r14; rad9_optr[3] = r16; rad9_optr[4] = r18; rad9_optr[5] = r1a; rad9_optr[6] = r1c; rad9_optr[7] = r1e; rad9_optr[8] = r1g;
@@ -272,7 +274,10 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions... */
 		i = (!j);
 		addr = &prp_mult;
-		tmp = s1p00; tm1 = cy; tm2 = cy+1; itmp = bjmodn; itm2 = bjmodn+4;
+		tmp = s1p00; tm1 = cy; tm2 = cy+1; itmp = bjmodn;
+	  #ifndef USE_AVX512
+		itm2 = bjmodn+4;
+	  #endif
 		for(l = 0; l < RADIX>>3; l++) {	// Each AVX carry macro call also processes 8 prefetches of main-array data
 			add0 = a + j1 + pfetch_dist + poff[l+l];
@@ -325,7 +330,7 @@ After rcol-carry-estimate step:
 Output:
 */
 			add0 += p04;	// prefetch of a + [prefetch offset] + p4,5,6,7
-			tmp = (double *)tmp + 4;	// Call 2 will handle the .d4-7 doubles of our 4 input zmm register-sized vector data
+			tmp = (vec_dbl *)((double *)tmp + 4);	// Call 2 will handle the .d4-7 doubles of our 4 input zmm register-sized vector data
 			AVX_cmplx_carry_fast_errcheck_X4(tmp, tm1, itmp, half_arr,0x800, sign_mask,sse_bw,sse_n,sse_sw, add0,p01,p02,p03, addr);	// Call 2 wts-data pointers += 0x400
 	#endif
 /*
@@ -609,8 +614,8 @@ vinsertf64x4 1,ymm1,zmm0,zmm0	3-6/1 for y,z,z, 7/1 for m256,z,z	3/1 for y,z,z
 	// Radix-9 DFT inputs can use same optr_off[] perm-index array as DIT:
 	  #ifdef USE_AVX2
 		// Due to GCC macro argc limit of 30, to enable 16-register data-doubled version of the radix-9 macros need 2 length-9 ptr arrays:
-		tm1 = rad9_iptr;	// Stash head-of-array-ptrs in tmps to workaround GCC's "not directly addressable" macro arglist stupidity
-		tm2 = rad9_optr;
+		tm1 = (vec_dbl *)rad9_iptr;	// Stash head-of-array-ptrs in tmps to workaround GCC's "not directly addressable" macro arglist stupidity
+		tm2 = (vec_dbl *)rad9_optr;
 		for(l = 0, tmp = r00, ntmp = 0; l < 2; l++, ntmp += 18) {
 	  #else
 		for(l = 0, tmp = r00, ntmp = 0; l < 4; l++, ntmp += 9) {
@@ -668,8 +673,8 @@ vinsertf64x4 1,ymm1,zmm0,zmm0	3-6/1 for y,z,z, 7/1 for m256,z,z	3/1 for y,z,z
 	/* Radix-9 DFT uses adjacent temps, i.e. stride = 2*16 bytes: */
 	  #ifdef USE_AVX2
 		// Due to GCC macro argc limit of 30, to enable 16-register data-doubled version of the radix-9 macros need 2 length-9 ptr arrays:
-		tm1 = rad9_iptr;	// Stash head-of-array-ptrs in tmps to workaround GCC's "not directly addressable" macro arglist stupidity
-		tm2 = rad9_optr;
+		tm1 = (vec_dbl *)rad9_iptr;	// Stash head-of-array-ptrs in tmps to workaround GCC's "not directly addressable" macro arglist stupidity
+		tm2 = (vec_dbl *)rad9_optr;
 		rad9_iptr[0] = s1p27; rad9_iptr[1] = s1p23; rad9_iptr[2] = s1p19; rad9_iptr[3] = s1p15; rad9_iptr[4] = s1p11; rad9_iptr[5] = s1p07; rad9_iptr[6] = s1p03; rad9_iptr[7] = s1p35; rad9_iptr[8] = s1p31;
 		rad9_optr[0] = r10; rad9_optr[1] = r12; rad9_optr[2] = r14; rad9_optr[3] = r16; rad9_optr[4] = r18; rad9_optr[5] = r1a; rad9_optr[6] = r1c; rad9_optr[7] = r1e; rad9_optr[8] = r1g;
 		SSE2_RADIX_09_DIF_X2(s1p00,s1p32,s1p28,s1p24,s1p20,s1p16,s1p12,s1p08,s1p04, cc1,two, r00,r02,r04,r06,r08,r0a,r0c,r0e,r0g,
@@ -747,7 +752,7 @@ vinsertf64x4 1,ymm1,zmm0,zmm0	3-6/1 for y,z,z, 7/1 for m256,z,z	3/1 for y,z,z
 		col += RADIX;
 		co3 -= RADIX;

-}	/* end for(k=1; k <= khi; k++) */
+}	/* end for(int k=1; k <= khi; k++) */

 #ifndef USE_ARM_V8_SIMD
   #undef OFF
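
Note (illustrative, not part of the patch): nearly every hunk above applies the same warning-removal pattern -- a local that is only referenced under a particular build macro gets its declaration wrapped in that same #if condition, or is deleted outright once nothing references it -- so that -Wunused-variable / -Wunused-but-set-variable stay quiet in every build configuration. Below is a minimal standalone sketch of the idea; the names (demo_sum, pair_stride) and the use of USE_SSE2 as the gating macro are placeholders, not Mlucas code.

/* demo_unused_guard.c -- illustrative sketch only; placeholder names throughout. */
#include <stdio.h>

static double demo_sum(const double *a, int n)
{
#ifdef USE_SSE2
	int pair_stride;	/* declared under the same guard as its only uses below */
#endif
	double sum = 0.0;
	int i = 0;
#ifdef USE_SSE2
	pair_stride = 2;	/* crude stand-in for a 2-wide vectorized pass */
	for( ; i + 1 < n; i += pair_stride)
		sum += a[i] + a[i+1];
#endif
	for( ; i < n; i++)	/* scalar cleanup, or the whole loop in non-SSE2 builds */
		sum += a[i];
	return sum;
}

int main(void)
{
	const double x[5] = {1.0, 2.0, 3.0, 4.0, 5.0};
	printf("%.1f\n", demo_sum(x, 5));	/* prints 15.0 with or without -DUSE_SSE2 */
	return 0;
}

Building the sketch with and without -DUSE_SSE2 under -Wall -Wextra should produce no unused-variable warnings in either mode, which is the effect the guards in the patch aim for across the MULTITHREAD / USE_SSE2 / USE_AVX / USE_AVX512 combinations.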