From 9cd261ef4e5d196cc55d4fce45f1306794883d44 Mon Sep 17 00:00:00 2001
From: yurkin <yurkin@gmail.com>
Date: Thu, 7 Feb 2013 12:43:09 +0000
Subject: [PATCH] A lot of improvements to sparse mode: - Macro (flag)
 ADDA_SPARSE was shortened to SPARSE. - Makefiles modified to produce info
 about SPARSE and USE_SSE3 and set up interaction (errors) with other options.
 - Gather operations of sparse mode are now performed through AllGather(). For
 that it was modified to enable in-place operation, and a new MPI_Datatype
 mpi_int3 was introduced. Previously used functions and variables has been
 removed. - In sparse mode local_nvoid_d0 and local_nvoid_d1 are now
 initialized in ParSetup(), while SetupLocalD() is not called at all. - Memory
 optimizations: material_full and remnants of DipoleCoord_full are eliminated,
 'position' is no more allocated but just points to a part of position_full. -
 A lot of indexes in sparse_ops.h are now size_t instead of int. Among other
 changes, Fixes issue 159. - Fixed initialization of mat_count in
 MakeParticle() in sparse mode. It was broken and led - Timings for init
 Dmatrix and FFT setup are no more shown in sparse mode. - All command line
 options, not supported in sparse mode, are now explicitly removed during
 compilation (with appropriate modification of help texts). These includes:
 '-shape ...' (except read), '-init_field wkb', -granul, -save_geom,
 -sg_format, -store_grans. - Removed error messages when using '-int ...'
 (except poi) in sparse mode. Now these options work fine, but relatively
 slow. - Information about compile options SPARSE and USE_SSE3 (when used) has
 been added to output of option -V. Added information about sparse mode to log
 file (instead of information about FFT method). - Added meaningful error
 message for default run of adda (without command line parameters) in sparse
 mode. - A lot of formatting changes to make the sparse mode similar to the
 rest of the code.

Other changes:
- interaction.c/h now holds all the code related to calculation of Green's tensor. In particular, reading/freeing of table integrals was moved from calculator.c, and duplicating code to compute Green's tensor in fft.c was removed.
An important change is that now functions, which compute Green's tensor do not test for zero argument. Hence, these test is now made in InitDmatrix().
- Intermediate vector DipoleCoord_tmp is no more used in make_particle.c. This somewhat decreases memory usage during prognosis. Overall, the code in MakeParticle() was significantly changed (a lot of moving parts around).
- oclcore.h is now always included, but is void if OPENCL is not specified. That is similar to other conditional headers like fft.h.
- Tests (tests/2exec) were modified to incorporate sparse mode (to compare two sparse mode versions or sparse mode vs. FFT mode). Corresponding tests are listed in separate file - suite_sparse, which is automatically used if appropriate flag is uncommented in comp2exec.  Added 2 new shape files and modified ellipsoid.geom. Improved behavior of diff_numeric.awk - before it was skipping differences, if the line was missing in the first file.
- A number of files (new or heavily modified) were formatted for a window width of 120 symbols.
- A number of minor syntax changes to remove warnings by Eclipse code analyzer. In particular, changed several empty statements from ";" to "{}", removed two unused functions, and hid several extern declarations in sparse mode.
- Fixed crashes when using command line options '-shape' and '-beam' without arguments. Now meaningful error messages are produced.
- version incremented to 1.2b2.

The updated code was tested to compile in different compilation modes (almost all possible combinations) and tested by tests/2exec/ against the version 1.1.
---
 src/CalculateE.c             |  47 +--
 src/Makefile                 |  65 ++--
 src/calculator.c             | 154 ++--------
 src/cmplx.h                  |   7 +-
 src/comm.c                   | 170 ++++-------
 src/comm.h                   |  34 +--
 src/const.h                  |   2 +-
 src/crosssec.c               |  43 +--
 src/fft.c                    | 578 +----------------------------------
 src/fft.h                    |  10 +-
 src/interaction.c            | 402 ++++++++++++++----------
 src/interaction.h            |  27 +-
 src/iterative.c              |  14 +-
 src/make_particle.c          | 180 +++++------
 src/matvec.c                 |  86 +++---
 src/ocl/Makefile             |  42 +--
 src/oclcore.c                |   5 +-
 src/oclcore.h                |   7 +-
 src/param.c                  | 104 +++++--
 src/sparse_ops.h             | 248 ++++++++-------
 src/timing.c                 |  10 +-
 src/vars.c                   |  15 +-
 src/vars.h                   |  13 +-
 tests/2exec/README           |  22 +-
 tests/2exec/comp2exec        | 123 +++++---
 tests/2exec/diff_numeric.awk |  34 ++-
 tests/2exec/ellipsoid.geom   | 426 +++++++++++---------------
 tests/2exec/sphere.geom      | 283 +++++++++++++++++
 tests/2exec/sphere4.geom     |  35 +++
 tests/2exec/suite            |  25 +-
 tests/2exec/suite_sparse     | 275 +++++++++++++++++
 31 files changed, 1661 insertions(+), 1825 deletions(-)
 create mode 100644 tests/2exec/sphere.geom
 create mode 100644 tests/2exec/sphere4.geom
 create mode 100644 tests/2exec/suite_sparse

diff --git a/src/CalculateE.c b/src/CalculateE.c
index f8c8009e..4c80fb59 100644
--- a/src/CalculateE.c
+++ b/src/CalculateE.c
@@ -5,7 +5,7 @@
  *        Routines for most scattering quantities are in crosssec.c. Also saves internal fields to
  *        file (optional).
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -60,13 +60,6 @@ extern size_t TotalEFieldPlane;
 // used in iterative.c
 TIME_TYPE tstart_CE;
 
-// EXTERNAL FUNCTIONS
-
-// GenerateB.c
-void GenerateB(enum incpol which,doublecomplex *x);
-// iterative.c
-int IterativeSolver(enum iter method);
-
 // LOCAL VARIABLES
 
 #define MUEL_HEADER "s11 s12 s13 s14 s21 s22 s23 s24 s31 s32 s33 s34 s41 s42 s43 s44"
@@ -80,6 +73,13 @@ int IterativeSolver(enum iter method);
 #define ANGLE_FORMAT "%.2f"
 #define RMSE_FORMAT "%.3E"
 
+// EXTERNAL FUNCTIONS
+
+// GenerateB.c
+void GenerateB(enum incpol which,doublecomplex *x);
+// iterative.c
+int IterativeSolver(enum iter method);
+
 //============================================================
 
 static void ComputeMuellerMatrix(double matrix[4][4], const doublecomplex s1,const doublecomplex s2,
@@ -111,37 +111,6 @@ static void ComputeMuellerMatrix(double matrix[4][4], const doublecomplex s1,con
 
 //============================================================
 
-static void ATT_UNUSED ComputeMuellerMatrixNorm(double matrix[4][4],const doublecomplex s1,
-	const doublecomplex s2,const doublecomplex s3,const doublecomplex s4)
-/* computer mueller matrix from scattering matrix elements s1, s2, s3, s4, according to formula
- * 3.16 from Bohren and Huffman; normalize all elements to S11 (except itself)
- */
-{
-	matrix[0][0] = 0.5*(cMultConRe(s1,s1)+cMultConRe(s2,s2)+cMultConRe(s3,s3)+cMultConRe(s4,s4));
-	matrix[0][1] = 0.5*(cMultConRe(s2,s2)-cMultConRe(s1,s1)+cMultConRe(s4,s4)-cMultConRe(s3,s3))
-	             / matrix[0][0];
-	matrix[0][2] = (cMultConRe(s2,s3)+cMultConRe(s1,s4))/matrix[0][0];
-	matrix[0][3] = (cMultConIm(s2,s3)-cMultConIm(s1,s4))/matrix[0][0];
-
-	matrix[1][0] = 0.5*(cMultConRe(s2,s2)-cMultConRe(s1,s1)+cMultConRe(s3,s3)-cMultConRe(s4,s4))
-	             / matrix[0][0];
-	matrix[1][1] = 0.5*(cMultConRe(s2,s2)+cMultConRe(s1,s1)-cMultConRe(s3,s3)-cMultConRe(s4,s4))
-	             / matrix[0][0];
-	matrix[1][2] = (cMultConRe(s2,s3)-cMultConRe(s1,s4))/matrix[0][0];
-	matrix[1][3] = (cMultConIm(s2,s3)+cMultConIm(s1,s4))/matrix[0][0];
-
-	matrix[2][0] = (cMultConRe(s2,s4)+cMultConRe(s1,s3))/matrix[0][0];
-	matrix[2][1] = (cMultConRe(s2,s4)-cMultConRe(s1,s3))/matrix[0][0];
-	matrix[2][2] = (cMultConRe(s1,s2)+cMultConRe(s3,s4))/matrix[0][0];
-	matrix[2][3] = (cMultConIm(s2,s1)+cMultConIm(s4,s3))/matrix[0][0];
-
-	matrix[3][0] = (cMultConIm(s4,s2)+cMultConIm(s1,s3))/matrix[0][0];
-	matrix[3][1] = (cMultConIm(s4,s2)-cMultConIm(s1,s3))/matrix[0][0];
-	matrix[3][2] = (cMultConIm(s1,s2)-cMultConIm(s3,s4))/matrix[0][0];
-	matrix[3][3] = (cMultConRe(s1,s2)-cMultConRe(s3,s4))/matrix[0][0];
-}
-
-//==============================================================
 INLINE void InitMuellerIntegrFile(const int type,const char * restrict fname,FILE * restrict * file,
 	char * restrict buf,const size_t buf_size,double * restrict *mult)
 /* If 'phi_int_type' matches 'type', appropriate file (name given by 'fname') is created (with
diff --git a/src/Makefile b/src/Makefile
index eba25a5e..a11c5b13 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -79,7 +79,7 @@ ifneq ($(if $(MAKECMDGOALS),$(if $(filter $(NONTRIVIAL),$(MAKECMDGOALS)),1,),1),
 # below are appended to the list specified elsewhere.
 # Full list of possible options is the following:
 VALID_OPTS := DEBUG DEBUGFULL FFT_TEMPERTON PRECISE_TIMING NOT_USE_LOCK ONLY_LOCKFILE NO_FORTRAN \
-              NO_CPP OVERRIDE_STDC_TEST OCL_READ_SOURCE_RUNTIME CLFFT_APPLE SPARSE
+              NO_CPP OVERRIDE_STDC_TEST OCL_READ_SOURCE_RUNTIME CLFFT_APPLE SPARSE USE_SSE3
 
 # Debug mode. By default, release configuration is used (no debug, no warnings, maximum
 # optimization). DEBUG turns on producing debugging symbols (-g) and warnings and brings
@@ -224,22 +224,42 @@ else
   $(info Release mode)
   DBGLVL := 0
 endif
-ifneq ($(filter FFT_TEMPERTON,$(OPTIONS)),)
-  $(info Temperton FFT)
-  CDEFS += -DFFT_TEMPERTON
-  ifeq ($(filter NO_FORTRAN,$(OPTIONS)),)
-    FSOURCE += cfft99D.f
-  else
-    $(error Temperton FFT (FFT_TEMPERTON) is implemented in Fortran, hence incompatible with NO_FORTRAN)
+ifneq ($(filter SPARSE,$(OPTIONS)),)
+  $(info Sparse (non-FFT) mode)
+  ifneq ($(filter FFT_TEMPERTON,$(OPTIONS)),)
+    $(error SPARSE turns off all FFT-related code, so it is incompatible with FFT_TEMPERTON)
+  endif
+  ifneq ($(filter CLFFT_APPLE,$(OPTIONS)),)
+    $(error SPARSE turns off all FFT-related code, so it is incompatible with CLFFT_APPLE)
+  endif
+  ifneq ($(filter PRECISE_TIMING,$(OPTIONS)),)
+    $(error SPARSE is currently incompatible with PRECISE_TIMING)
   endif
+  
+  CDEFS += -DSPARSE
 else
-  $(info FFTW3)
-  LDLIBS += -lfftw3
-  ifdef FFTW3_INC_PATH
-    CFLAGS += -I$(FFTW3_INC_PATH)
+  CSOURCE += fft.c
+  ifneq ($(filter FFT_TEMPERTON,$(OPTIONS)),)
+    $(info Temperton FFT)
+    CDEFS += -DFFT_TEMPERTON
+    ifeq ($(filter NO_FORTRAN,$(OPTIONS)),)
+      FSOURCE += cfft99D.f
+    else
+      $(error Temperton FFT (FFT_TEMPERTON) is implemented in Fortran, hence incompatible with NO_FORTRAN)
+    endif
+  else
+    $(info FFTW3)
+    LDLIBS += -lfftw3
+    ifdef FFTW3_INC_PATH
+      CFLAGS += -I$(FFTW3_INC_PATH)
+    endif
+    ifdef FFTW3_LIB_PATH
+      LDFLAGS += -L$(FFTW3_LIB_PATH)
+    endif
   endif
-  ifdef FFTW3_LIB_PATH
-    LDFLAGS += -L$(FFTW3_LIB_PATH)
+  ifneq ($(filter CLFFT_APPLE,$(OPTIONS)),)
+    # Here only the info is printed, the main logic is in ocl/Makefile
+    $(info Apple clFFT routines)
   endif
 endif
 ifneq ($(filter PRECISE_TIMING,$(OPTIONS)),)
@@ -270,24 +290,15 @@ ifneq ($(filter OVERRIDE_STDC_TEST,$(OPTIONS)),)
   $(info Overriding test for C99 conformance)
   CDEFS += -DOVERRIDE_STDC_TEST
 endif
-
-ifneq ($(filter SPARSE,$(OPTIONS)),)
-  CDEFS += -DADDA_SPARSE
-else
-  CSOURCE += fft.c
-endif
 ifneq ($(filter USE_SSE3,$(OPTIONS)),)
   CDEFS += -DUSE_SSE3
   CFLAGS += -msse3
+  $(info Using SSE3 optimizations)
 endif
 ifneq ($(filter OCL_READ_SOURCE_RUNTIME,$(OPTIONS)),)
   # Here only the info is printed, the main logic is in ocl/Makefile
   $(info Read CL sources at runtime)
 endif
-ifneq ($(filter CLFFT_APPLE,$(OPTIONS)),)
-  # Here only the info is printed, the main logic is in ocl/Makefile
-  $(info Apple clFFT routines)
-endif
 # Process EXTRA_FLAGS
 ifneq ($(strip $(EXTRA_FLAGS)),)
   $(info Extra compiler options: '$(EXTRA_FLAGS)')
@@ -446,12 +457,12 @@ clean: cleanseq cleanmpi cleanocl
 # compilation and thus contain quite heavy processing.
 cleanseq:
 	@echo "Removing sequential compiled files"
-	cd $(SEQ) && rm -f *.o *.d *.d.* $(OPTSFILES) $(PROGSEQ) $(PROGSEQ).exe
+	cd $(SEQ) && rm -f *.o *.d $(OPTSFILES) $(PROGSEQ) $(PROGSEQ).exe
 
 cleanmpi:
 	@echo "Removing MPI compiled files"
-	cd $(MPI) && rm -f *.o *.d *.d.* $(OPTSFILES) $(PROGMPI) $(PROGMPI).exe
+	cd $(MPI) && rm -f *.o *.d $(OPTSFILES) $(PROGMPI) $(PROGMPI).exe
 
 cleanocl:
 	@echo "Removing OpenCL compiled files"
-	cd $(OCL) && rm -f *.o *.d *.d.* $(OPTSFILES) $(PROGOCL) $(PROGOCL).exe *.clstr
+	cd $(OCL) && rm -f *.o *.d $(OPTSFILES) $(PROGOCL) $(PROGOCL).exe *.clstr
diff --git a/src/calculator.c b/src/calculator.c
index fa29529c..c3a08ea3 100644
--- a/src/calculator.c
+++ b/src/calculator.c
@@ -3,7 +3,7 @@
  * Descr: all the initialization is done here before actually calculating internal fields; includes
  *        calculation of couple constants
  *
- * Copyright (C) 2006-2010,2012 ADDA contributors
+ * Copyright (C) 2006-2010,2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -27,6 +27,7 @@
 #include "interaction.h"
 #include "io.h"
 #include "memory.h"
+#include "oclcore.h"
 #include "Romberg.h"
 #include "timing.h"
 #include "vars.h"
@@ -35,10 +36,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef OPENCL
-#	include "oclcore.h"
-#endif
-
 // SEMI-GLOBAL VARIABLES
 
 // defined and initialized in crosssec.c
@@ -64,26 +61,27 @@ double dtheta_deg,dtheta_rad; // delta theta in degrees and radians
 	// amplitude matrix for different values of alpha
 doublecomplex * restrict ampl_alphaX,* restrict ampl_alphaY;
 double * restrict muel_alpha; // mueller matrix for different values of alpha
-// used in fft.c
-  // tables of integrals
-double * restrict tab1,* restrict tab2,* restrict tab3,* restrict tab4,* restrict tab5,
-	* restrict tab6,* restrict tab7,* restrict tab8,* restrict tab9,* restrict tab10;
-/* it is preferable to declare the following as "* restrict * restrict", but it is hard to make it
- * generally compatible with Free_iMatrix function syntax. However, it is defined so in fft.c.
- */
-int ** restrict tab_index; // matrix for indexing of table arrays
+
 // used in crosssec.c
 double * restrict E2_alldir; // square of E, calculated for alldir
 double * restrict E2_alldir_buffer; // buffer to accumulate E2_alldir
 doublecomplex cc[MAX_NMAT][3]; // couple constants
 	// arrays of exponents along 3 axes (for calc_field)
+#ifndef SPARSE
 doublecomplex * restrict expsX,* restrict expsY,* restrict expsZ;
+#endif
+
 // used in iterative.c
 doublecomplex *rvec;                 // current residual
 doublecomplex * restrict Avecbuffer; // used to hold the result of matrix-vector products
 // auxiliary vectors, used in some iterative solvers (with more meaningful names)
 doublecomplex * restrict vec1,* restrict vec2,* restrict vec3;
 
+#ifdef SPARSE
+// used in matvec.c
+doublecomplex * restrict arg_full; // vector to hold argvec for all dipoles
+#endif
+
 // LOCAL VARIABLES
 
 static size_t block_theta; // size of one block of mueller matrix - 16*nTheta
@@ -95,9 +93,9 @@ static double * restrict out;
 // EXTERNAL FUNCTIONS
 
 // CalculateE.c
-extern int CalculateE(enum incpol which,enum Eftype type);
-extern void MuellerMatrix(void);
-extern void SaveMuellerAndCS(double * restrict in);
+int CalculateE(enum incpol which,enum Eftype type);
+void MuellerMatrix(void);
+void SaveMuellerAndCS(double * restrict in);
 
 //============================================================
 
@@ -247,91 +245,6 @@ static void InitCC(const enum incpol which)
 
 //============================================================
 
-static double * ATT_MALLOC ReadTableFile(const char * restrict sh_fname,const int size_multiplier)
-{
-	FILE * restrict ftab;
-	double * restrict tab_n;
-	int size;
-	char fname[MAX_FNAME];
-	int i;
-
-	size=TAB_SIZE*size_multiplier;
-	memory+=size*sizeof(double);
-	if (!prognosis) {
-		// allocate memory for tab_n
-		MALLOC_VECTOR(tab_n,double,size,ALL);
-		// open file
-		SnprintfErr(ALL_POS,fname,MAX_FNAME,TAB_PATH"%s",sh_fname);
-		ftab=FOpenErr(fname,"r",ALL_POS);
-		// scan file
-		for (i=0; i<size; i++) if (fscanf(ftab,"%lf\t",&(tab_n[i]))!=1)
-			LogError(ALL_POS,"Scan error in file '%s'. Probably file is too small",fname);
-		if (!feof(ftab))
-			LogWarning(EC_WARN,ONE_POS,"File '%s' is longer than specified size (%d)",fname,size);
-		// close file
-		FCloseErr(ftab,fname,ALL_POS);
-		return tab_n;
-	}
-	else return NULL;
-}
-
-//============================================================
-
-static void ReadTables(void)
-{
-	int i, j, ymax, Rm2, Rm2x;
-
-	TIME_TYPE tstart=GET_TIME();
-	tab1=ReadTableFile(TAB_FNAME(1),1);
-	tab2=ReadTableFile(TAB_FNAME(2),6);
-	tab3=ReadTableFile(TAB_FNAME(3),3);
-	tab4=ReadTableFile(TAB_FNAME(4),18);
-	tab5=ReadTableFile(TAB_FNAME(5),6);
-	tab6=ReadTableFile(TAB_FNAME(6),36);
-	tab7=ReadTableFile(TAB_FNAME(7),1);
-	tab8=ReadTableFile(TAB_FNAME(8),6);
-	tab9=ReadTableFile(TAB_FNAME(9),1);
-	tab10=ReadTableFile(TAB_FNAME(10),6);
-	Timing_FileIO += GET_TIME() - tstart;
-
-	if (!prognosis) {
-		// allocate memory for tab_index
-		MALLOC_IMATRIX(tab_index,1,TAB_RMAX,0,TAB_RMAX,ALL);
-		// fill tab_index
-		Rm2=TAB_RMAX*TAB_RMAX;
-		tab_index[1][0] = 0;
-		for (i=1; i<=TAB_RMAX; i++) {
-			Rm2x=Rm2-i*i;
-			ymax = MIN(i,(int)floor(sqrt(Rm2x)));
-			for (j=0; j<ymax; j++) {
-				tab_index[i][j+1] = tab_index[i][j] + MIN(j,(int)floor(sqrt(Rm2x-j*j)))+1;
-			}
-			if (i<TAB_RMAX) tab_index[i+1][0] = tab_index[i][ymax]
-			                                  + MIN(ymax,(int)floor(sqrt(Rm2x-ymax*ymax)))+1;
-		}
-	}
-	// PRINTZ("P[5,3]=%d (should be 41)\n",tab_index[5][3]);
-}
-
-//============================================================
-
-static void FreeTables(void)
-{
-	Free_iMatrix(tab_index,1,TAB_RMAX,0);
-	Free_general(tab1);
-	Free_general(tab2);
-	Free_general(tab3);
-	Free_general(tab4);
-	Free_general(tab5);
-	Free_general(tab6);
-	Free_general(tab7);
-	Free_general(tab8);
-	Free_general(tab9);
-	Free_general(tab10);
-}
-
-//============================================================
-
 static void calculate_one_orientation(double * restrict res)
 // performs calculation for one orientation; may do orientation averaging and put the result in res
 {
@@ -426,12 +339,12 @@ static void AllocateEverything(void)
 		MALLOC_VECTOR(Avecbuffer,complex,local_nRows,ALL);
 	}
 	memory+=5*tmp;
-#ifdef ADDA_SPARSE
+#ifdef SPARSE
 	if (!prognosis) {
 		MALLOC_VECTOR(arg_full,complex,3*nvoid_Ndip,ALL);
 	}
 	memory+=3*nvoid_Ndip*sizeof(*arg_full);
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 	/* additional vectors for iterative methods. Potentially, this procedure can be fully automated
 	 * for any new iterative solver, based on the information contained in structure array 'params'
 	 * in file iterative.c. However, this requires different order of function calls to extract this
@@ -458,11 +371,11 @@ static void AllocateEverything(void)
 	 * non-zero, then change the above condition to allocate memory for these vectors. Variable
 	 * memory should be incremented to reflect the total allocated memory.
 	 */
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 	MALLOC_VECTOR(expsX,complex,boxX,ALL);
 	MALLOC_VECTOR(expsY,complex,boxY,ALL);
 	MALLOC_VECTOR(expsZ,complex,local_Nz_unif,ALL);
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 	if (yzplane) {
 		tmp=2*(double)nTheta;
 		if (!prognosis) {
@@ -584,25 +497,16 @@ static void FreeEverything(void)
  * actually allocated.
  */
 {
-	if (IntRelation == G_SO || IntRelation == G_IGT_SO) FreeTables();
-#ifndef ADDA_SPARSE	
+	FreeInteraction();
+#ifndef SPARSE	
 	Free_FFT_Dmat();
 	Free_cVector(expsX);
 	Free_cVector(expsY);
 	Free_cVector(expsZ);
+	Free_general(position); // allocated in MakeParticle();
 #else	
-	Free_general(position_full);
-	Free_general(material_full);
-#ifdef PARALLEL
-	Free_general(arg_full);
-	Free_general(proc_mem_position);
-	Free_general(proc_mem_material);
-	Free_general(proc_mem_argvec);
-	Free_general(proc_disp_position);
-	Free_general(proc_disp_material);
-	Free_general(proc_disp_argvec);
-#endif //PARALLEL
-#endif //ADDA_SPARSE
+	Free_general(position_full); // allocated in MakeParticle();
+#endif // SPARSE
 
 	Free_cVector(xvec);
 	Free_cVector(rvec);
@@ -657,9 +561,8 @@ static void FreeEverything(void)
 		Free_general(Egrid_buffer);
 #endif
 	}
-	// these 3 were allocated in MakeParticle
+	// these 2 were allocated in MakeParticle
 	Free_general(DipoleCoord);
-	Free_general(position);
 	Free_general(material);
 
 	if (orient_avg) {
@@ -702,19 +605,14 @@ void Calculator (void)
 	}
 	else dtheta_deg=dtheta_rad=block_theta=0;
 	finish_avg=false;
-	// read tables if needed
-	if (IntRelation == G_SO || IntRelation == G_IGT_SO) ReadTables();
-
 	// Do preliminary setup for MatVec
 	InitInteraction();
-
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 	// initialize D matrix (for matrix-vector multiplication)
 	D("InitDmatrix started");
 	InitDmatrix();
 	D("InitDmatrix finished");
-#endif //ADDA_SPARSE
-
+#endif // !SPARSE
 	// allocate most (that is not already allocated; perform memory analysis
 	AllocateEverything();
 	// finish initialization
diff --git a/src/cmplx.h b/src/cmplx.h
index 90623ba8..78c491e2 100644
--- a/src/cmplx.h
+++ b/src/cmplx.h
@@ -882,6 +882,9 @@ INLINE double Rad2Deg(const double rad)
 }
 
 #ifdef USE_SSE3
+
+//============================================================
+
 static inline __m128d cmul(__m128d a,__m128d b)
 // complex multiplication
 {
@@ -894,12 +897,14 @@ static inline __m128d cmul(__m128d a,__m128d b)
 	return _mm_addsub_pd(t,b);
 }
 
+//============================================================
+
 static inline __m128d cadd(__m128d a,__m128d b)
 // complex addition
 {
 	return _mm_add_pd(a,b);
 }
 
-#endif //USE_SSE3
+#endif // USE_SSE3
 
 #endif // __cmplx_h
diff --git a/src/comm.c b/src/comm.c
index 19f16517..8b87b41a 100644
--- a/src/comm.c
+++ b/src/comm.c
@@ -3,7 +3,7 @@
  * Descr: incorporates all parallelization related code, so most of it is directly involved in or
  *        closely related to interprocess communication
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -35,7 +35,7 @@
 #include <string.h>
 
 #ifdef ADDA_MPI
-MPI_Datatype mpi_dcomplex,mpi_double3,mpi_dcomplex3;  // combined datatypes
+MPI_Datatype mpi_dcomplex,mpi_int3,mpi_double3,mpi_dcomplex3;  // combined datatypes
 int *recvcounts,*displs; // arrays of size ringid required for AllGather operations
 bool displs_init=false;  // whether arrays above are initialized
 #endif
@@ -46,31 +46,25 @@ bool displs_init=false;  // whether arrays above are initialized
  */
 #define SYNCHRONIZE_TIMING
 
+#ifdef PARALLEL
+#ifndef SPARSE
+
 // SEMI-GLOBAL VARIABLES
 
 // defined and allocated in fft.c
 extern double * restrict BT_buffer, * restrict BT_rbuffer;
+
 // defined and initialized in timing.c
 extern TIME_TYPE Timing_InitDmComm;
 
 // LOCAL VARIABLES
 
-#ifdef PARALLEL
-
 static int Ntrans;         // number of transmissions; used in CalcPartner
-#ifndef ADDA_SPARSE
 static int * restrict gr_comm_size;  // sizes of transmissions for granule generator communications
 static int * restrict gr_comm_overl; // shows whether two sequential transmissions overlap
 static unsigned char * restrict gr_comm_ob; // buffer for overlaps
 static void * restrict gr_comm_buf;         // buffer for MPI transfers
-#else //ADDA_SPARSE
-int * proc_mem_position;
-int * proc_mem_material;
-int * proc_mem_argvec; 
-int * proc_disp_position;
-int * proc_disp_material;
-int * proc_disp_argvec; 
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 
 
 // First several functions are defined only in parallel mode
@@ -91,7 +85,8 @@ static void RecoverCommandLine(int *argc_p,char ***argv_p)
 }
 //============================================================
 
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
+
 INLINE size_t IndexBlock(const size_t x,const size_t y,const size_t z,const size_t lengthY)
 // index block; used in BlockTranspose
 {
@@ -119,7 +114,7 @@ INLINE int CalcPartner(const int tran)
 	}
 	return part;
 }
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 
 //============================================================
 
@@ -181,6 +176,13 @@ static MPI_Datatype MPIVarType(var_type type,bool reduce,int *mult)
 		}
 		else res=mpi_dcomplex;
 	}
+	else if (type==int3_type) {
+		if (reduce) {
+			res=MPI_INT;
+			*mult=3;
+		}
+		else res=mpi_int3;
+	}
 	else if (type==double3_type) {
 		if (reduce) {
 			res=MPI_DOUBLE;
@@ -224,7 +226,9 @@ void InitDispls(void)
 //============================================================
 
 void AllGather(void * restrict x_from,void * restrict x_to,const var_type type,TIME_TYPE *timing)
-// Gather distributed arrays; works for all types;increments 'timing' (if not NULL) by the time used
+/* Gather distributed arrays; works for all types; x_from can be NULL then the data from x_to is used (in_place)
+ * increments 'timing' (if not NULL) by the time used
+ */
 {
 #ifdef ADDA_MPI
 	MPI_Datatype mes_type;
@@ -240,7 +244,8 @@ void AllGather(void * restrict x_from,void * restrict x_to,const var_type type,T
 	}
 	InitDispls(); // actually initialization is done only once
 	mes_type=MPIVarType(type,false,NULL);
-	MPI_Allgatherv(x_from,local_nvoid_Ndip,mes_type,x_to,recvcounts,displs,mes_type,MPI_COMM_WORLD);
+	if (x_from==NULL) MPI_Allgatherv(MPI_IN_PLACE,0,mes_type,x_to,recvcounts,displs,mes_type,MPI_COMM_WORLD);
+	else MPI_Allgatherv(x_from,local_nvoid_Ndip,mes_type,x_to,recvcounts,displs,mes_type,MPI_COMM_WORLD);
 	if (timing!=NULL) (*timing)+=GET_TIME()-tstart;
 #endif
 }
@@ -269,15 +274,19 @@ void InitComm(int *argc_p UOIP,char ***argv_p UOIP)
 	// initialize ringid and nprocs
 	MPI_Comm_rank(MPI_COMM_WORLD,&ringid);
 	MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
+#ifndef SPARSE
 	// initialize Ntrans
 	if (IS_EVEN(nprocs)) Ntrans=nprocs-1;
 	else Ntrans=nprocs;
+#endif
 	// define a few derived datatypes
 	/* this is intimately tied to the type definition of doublecomplex; when switching to C99
 	 * complex datatypes, this should be replaced just by MPI_DOUBLE_COMPLEX.
 	 */
 	MPI_Type_contiguous(2,MPI_DOUBLE,&mpi_dcomplex);
 	MPI_Type_commit(&mpi_dcomplex);
+	MPI_Type_contiguous(3,MPI_INT,&mpi_int3);
+	MPI_Type_commit(&mpi_int3);
 	MPI_Type_contiguous(3,MPI_DOUBLE,&mpi_double3);
 	MPI_Type_commit(&mpi_double3);
 	MPI_Type_contiguous(3,mpi_dcomplex,&mpi_dcomplex3);
@@ -294,12 +303,12 @@ void InitComm(int *argc_p UOIP,char ***argv_p UOIP)
 	ringid=ADDA_ROOT;
 #endif
 
-#ifndef ADDA_SPARSE //CheckNprocs does not exist in sparse mode
+#ifndef SPARSE // CheckNprocs does not exist in sparse mode
 	/* check if weird number of processors is specified; called even in sequential mode to
 	 * initialize weird_nprocs
 	 */
 	CheckNprocs();
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 }
 
 //============================================================
@@ -467,12 +476,10 @@ void MyInnerProduct(void * restrict data UOIP,const var_type type UOIP,size_t n_
 void ParSetup(void)
 // initialize common parameters; need to do in the beginning to enable call to MakeParticle
 {
-
-#ifndef ADDA_SPARSE //FFT mode initialization
-
-#ifdef PARALLEL
+#ifndef SPARSE //FFT mode initialization
+#	ifdef PARALLEL
 	int unitZ,unitX;
-#endif
+#	endif
 	// calculate size of 3D grid
 	gridX=fftFit(2*boxX,nprocs);
 	gridY=fftFit(2*boxY,1);
@@ -484,7 +491,7 @@ void ParSetup(void)
 	 * except for XY values, used in granule generator
 	 */
 	gridYZ=MultOverflow(gridY,gridZ,ALL_POS,"gridYZ");
-#ifdef PARALLEL
+#	ifdef PARALLEL
 	unitZ=smallZ/nprocs; // this should always be an exact division
 	local_z0=ringid*unitZ;
 	local_z1=(ringid+1)*unitZ;
@@ -493,13 +500,13 @@ void ParSetup(void)
 	unitX=gridX/nprocs;
 	local_x0=ringid*unitX;
 	local_x1=(ringid+1)*unitX;
-#else
+#	else
 	local_z0=0;
 	local_z1=smallZ;
 	local_z1_coer=boxZ;
 	local_x0=0;
 	local_x1=gridX;
-#endif
+#	endif
 	if (local_z1_coer<=local_z0) {
 		LogWarning(EC_INFO,ALL_POS,"No real dipoles are assigned");
 		local_z1_coer=local_z0;
@@ -509,29 +516,30 @@ void ParSetup(void)
 	boxXY=boxX*(size_t)boxY; // overflow check is covered by gridYZ above
 	local_Ndip=MultOverflow(boxXY,local_z1_coer-local_z0,ALL_POS,"local_Ndip");
 	D("%i :  %i %i %i %zu %zu \n",ringid,local_z0,local_z1_coer,local_z1,local_Ndip,local_Nx);
-	
-#else //sparse mode initialization (completely different from FFT mode)
-
-#ifdef PARALLEL
+#else // SPARSE
+	/* For sparse mode, nvoid_Ndip is defined in InitDipFile(), and here we define local_nvoid_d0 and local_nvoid_d1,
+	 * since they are required already in ReadDipFile()
+	 */
+	D("%zu",nvoid_Ndip);
+#	ifdef PARALLEL
 	double unit_f=1.0/nprocs;
 	//making sure that the first local_f0 and last local_f1 are 0.0 and 1.0
-	local_f0 = (ringid == 0) ? 0.0 : ringid*unit_f;
-	local_f1 = (ringid == nprocs-1) ? 1.0 : (ringid+1)*unit_f; 
-#else
-	local_f0 = 0.0;
-	local_f1 = 1.0;
-#endif //PARALLEL
-
-#endif //ADDA_SPARSE
+	double local_f0 = (ringid == 0) ? 0.0 : ringid*unit_f;
+	double local_f1 = (ringid == nprocs-1) ? 1.0 : (ringid+1)*unit_f;
+	local_nvoid_d0 = local_f0*nvoid_Ndip;
+	local_nvoid_d1 = local_f1*nvoid_Ndip;
+#	else
+	local_nvoid_d0=0;
+	local_nvoid_d1=nvoid_Ndip;
+#	endif // PARALLEL
+#endif
 }
 
 //============================================================
 
 void SetupLocalD(void)
-// initialize local_nvoid_d0 and local_nvoid_d1
+// initialize local_nvoid_d0 and local_nvoid_d1 in FFT mode. In sparse mode those are set earlier in ParSetup()
 {
-#ifndef ADDA_SPARSE //FFT mode
- 
 #ifdef ADDA_MPI
 	/* use of exclusive scan (MPI_Exscan) is logically more suitable, but it has special behavior
 	 * for the ringid=0. The latter would require special additional arrangements.
@@ -542,19 +550,11 @@ void SetupLocalD(void)
 	local_nvoid_d0=0;
 	local_nvoid_d1=local_nvoid_Ndip;
 #endif
-
-#else // sparse mode
-
-	D("%d", (int)nvoid_Ndip);
-	local_nvoid_d0 = local_f0*nvoid_Ndip;
-	local_nvoid_d1 = local_f1*nvoid_Ndip;
-
-#endif //ADDA_SPARSE
 }
 
 //============================================================
 
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 
 void BlockTranspose(doublecomplex * restrict X UOIP,TIME_TYPE *timing UOIP)
 /* do the data-transposition, i.e. exchange, between fftX and fftY&fftZ; specializes at Xmatrix;
@@ -866,70 +866,4 @@ bool ExchangePhaseShifts(doublecomplex * restrict bottom, doublecomplex * restri
 
 #endif // PARALLEL
 
-#endif //ADDA_SPARSE
-
-//============================================================
-
-#ifdef ADDA_SPARSE //functions exclusive to the sparse mode
-
-void InitArraySync() {
-#ifdef ADDA_MPI
-	MALLOC_VECTOR(proc_mem_position,int,nprocs,ALL);
-	MALLOC_VECTOR(proc_mem_material,int,nprocs,ALL);
-	MALLOC_VECTOR(proc_mem_argvec,int,nprocs,ALL);
-	MALLOC_VECTOR(proc_disp_position,int,nprocs,ALL);
-	MALLOC_VECTOR(proc_disp_material,int,nprocs,ALL);
-	MALLOC_VECTOR(proc_disp_argvec,int,nprocs,ALL);
-
-	proc_mem_position[ringid] = 3*local_nvoid_Ndip;
-	proc_mem_material[ringid] = local_nvoid_Ndip;
-	proc_mem_argvec[ringid] = 3*local_nvoid_Ndip*2; //*2 due to 2 doubles in complex
-	proc_disp_position[ringid] = 3*local_nvoid_d0;
-	proc_disp_material[ringid] = local_nvoid_d0;
-	proc_disp_argvec[ringid] = 3*local_nvoid_d0*2;
-	
-	MPI_Allgather(MPI_IN_PLACE,0,MPI_INT,proc_mem_position,1,MPI_INT,MPI_COMM_WORLD);
-	MPI_Allgather(MPI_IN_PLACE,0,MPI_INT,proc_mem_material,1,MPI_INT,MPI_COMM_WORLD);
-	MPI_Allgather(MPI_IN_PLACE,0,MPI_INT,proc_mem_argvec,1,MPI_INT,MPI_COMM_WORLD);
-	MPI_Allgather(MPI_IN_PLACE,0,MPI_INT,proc_disp_position,1,MPI_INT,MPI_COMM_WORLD);
-	MPI_Allgather(MPI_IN_PLACE,0,MPI_INT,proc_disp_material,1,MPI_INT,MPI_COMM_WORLD);
-	MPI_Allgather(MPI_IN_PLACE,0,MPI_INT,proc_disp_argvec,1,MPI_INT,MPI_COMM_WORLD);
-#endif
-}
-
-//============================================================
-
-void SyncPosition(int * restrict pos)
-{
-#ifdef ADDA_MPI
-	static const MPI_Datatype mes_type = MPI_INT;
-	
-	MPI_Allgatherv(MPI_IN_PLACE, 0, mes_type,pos, proc_mem_position,
-	               proc_disp_position, mes_type, MPI_COMM_WORLD);
-		
-#endif
-}
-
-//============================================================
-
-void SyncMaterial(unsigned char * restrict mat)
-{
-#ifdef ADDA_MPI	
-	static const MPI_Datatype mes_type = MPI_CHAR;
-	MPI_Allgatherv(MPI_IN_PLACE, 0, mes_type, mat, proc_mem_material,
-	               proc_disp_material, mes_type, MPI_COMM_WORLD);
-#endif
-}
-
-//============================================================
-
-void SyncArgvec()
-{
-#ifdef ADDA_MPI
-	static const MPI_Datatype mes_type = MPI_DOUBLE;
-	MPI_Allgatherv(MPI_IN_PLACE, 0, mes_type,(double *)arg_full, proc_mem_argvec,
-	               proc_disp_argvec, mes_type, MPI_COMM_WORLD);
-#endif
-}
-
-#endif //ADDA_SPARSE
+#endif // !SPARSE
diff --git a/src/comm.h b/src/comm.h
index c0ecb4dd..d0e0e8cd 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -2,7 +2,7 @@
  * $Date::                            $
  * Descr: definitions of communication global variables and routines
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -31,27 +31,11 @@
 #define UOIP ATT_UNUSED
 #endif
 
-typedef enum {uchar_type,int_type,sizet_type,double_type,double3_type,cmplx_type,cmplx3_type} var_type;
-
-#ifdef PARALLEL
-#ifdef ADDA_SPARSE
-extern int * proc_mem_position;
-extern int * proc_mem_material;
-extern int * proc_mem_argvec; 
-extern int * proc_disp_position;
-extern int * proc_disp_material;
-extern int * proc_disp_argvec; 
-#endif //ADDA_SPARSE
-#endif //PARALLEL
+typedef enum {uchar_type,int_type,int3_type,sizet_type,double_type,double3_type,cmplx_type,cmplx3_type} var_type;
 
 void Stop(int) ATT_NORETURN;
 void Synchronize(void);
-#ifndef ADDA_SPARSE
-void BlockTranspose(doublecomplex * restrict X,TIME_TYPE *timing);
-void BlockTranspose_Dm(doublecomplex * restrict X,size_t lengthY,size_t lengthZ);
-#endif //ADDA_SPARSE
 double AccumulateMax(double data,double *max);
-
 void Accumulate(double * restrict data,size_t size,double * restrict buffer,TIME_TYPE *timing);
 void MyInnerProduct(void * restrict data,var_type type,size_t n_elem,TIME_TYPE *timing);
 void InitComm(int *argc_p,char ***argv_p);
@@ -59,21 +43,17 @@ void ParSetup(void);
 void SetupLocalD(void);
 void MyBcast(void * restrict data,var_type type,const size_t n_elem,TIME_TYPE *timing);
 void BcastOrient(int *i,int *j,int *k);
+
+#ifndef SPARSE
+void BlockTranspose(doublecomplex * restrict X,TIME_TYPE *timing);
+void BlockTranspose_Dm(doublecomplex * restrict X,size_t lengthY,size_t lengthZ);
 // used by granule generator
-#ifndef ADDA_SPARSE
 void SetGranulComm(double z0,double z1,double gdZ,int gZ,size_t gXY,size_t buf_size,int *lz0,
                    int *lz1,int sm_gr);
 void CollectDomainGranul(unsigned char * restrict dom,size_t gXY,int lz0,int locgZ,TIME_TYPE *timing);
 void FreeGranulComm(int sm_gr);
 void ExchangeFits(char * restrict data,const size_t n,TIME_TYPE *timing);
-#endif //ADDA_SPARSE
-
-#ifdef ADDA_SPARSE
-void InitArraySync(void);
-void SyncPosition(int * restrict pos);
-void SyncMaterial(unsigned char * restrict mat);
-void SyncArgvec(void);
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 
 #ifdef PARALLEL
 // these functions are defined only in parallel mode
diff --git a/src/const.h b/src/const.h
index 06aad603..26e0b1e3 100644
--- a/src/const.h
+++ b/src/const.h
@@ -21,7 +21,7 @@
 #define __const_h
 
 // version number (string)
-#define ADDA_VERSION "1.2b1"
+#define ADDA_VERSION "1.2b2"
 
 /* ADDA uses certain C99 extensions, which are widely supported by GNU and Intel compilers. However,
  * they may be not completely supported by e.g. Microsoft Visual Studio compiler. Therefore, we
diff --git a/src/crosssec.c b/src/crosssec.c
index 3ebc6da7..df0ec61a 100644
--- a/src/crosssec.c
+++ b/src/crosssec.c
@@ -3,7 +3,7 @@
  * Descr: all the functions to calculate scattering quantities (except Mueller matrix); to read
  *        different parameters from files; and initialize orientation of the particle
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -41,7 +41,9 @@
 // defined and initialized in calculator.c
 extern double * restrict E2_alldir,* restrict E2_alldir_buffer;
 extern const doublecomplex cc[][3];
+#ifndef SPARSE
 extern doublecomplex * restrict expsX,* restrict expsY,* restrict expsZ;
+#endif
 // defined and initialized in GenerateB.c
 extern const double beam_center_0[3];
 // defined and initialized in param.c
@@ -117,26 +119,6 @@ void InitRotation (void)
 
 //=====================================================================
 
-static int ATT_UNUSED ReadLine(FILE * restrict file, // opened file
-	const char * restrict fname,                     // ... its filename
-	char * restrict buf,const int buf_size)          // buffer for line and its size
-// reads the first uncommented line; returns 1 if EOF reached
-{
-	while (!feof(file)) {
-		fgets(buf,buf_size,file);
-		if (*buf!='#') { // if uncommented
-			if (strstr(buf,"\n")==NULL && !feof(file)) LogError(ONE_POS,
-				"Buffer overflow while reading '%s' (size of uncommented line > %d)",
-				fname,buf_size-1);
-			else return 0; // complete line is read
-		} // finish reading the commented line
-		else while (strstr(buf,"\n")==NULL  && !feof(file)) fgets(buf,buf_size,file);
-	}
-	return 1;
-}
-
-//=====================================================================
-
 static void ReadLineStart(FILE *  restrict file,                  // opened file
 	                      const char * restrict fname,            // ... its filename
                           char * restrict buf,const int buf_size, // buffer for line and its size
@@ -526,7 +508,7 @@ void CalcField (doublecomplex * restrict ebuff, // where to write calculated sca
 	double temp, na;
 	doublecomplex mult_mat[MAX_NMAT];
 	const bool scat_avg=true; // temporary fixed option for SO formulation
-#ifdef ADDA_SPARSE
+#ifdef SPARSE
 	doublecomplex expX, expY, expZ;
 #endif
 
@@ -546,11 +528,11 @@ void CalcField (doublecomplex * restrict ebuff, // where to write calculated sca
 	}
 	for(i=0;i<3;i++) sum[i][RE]=sum[i][IM]=0.0;
 	// prepare values of exponents, along each of the coordinates
-#ifndef ADDA_SPARSE	
+#ifndef SPARSE	
 	imExp_arr(-kd*n[0],boxX,expsX);
 	imExp_arr(-kd*n[1],boxY,expsY);
 	imExp_arr(-kd*n[2],local_Nz_unif,expsZ);
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 	/* not to double the code in the source we use two temporary defines,since the following 'if'
 	 * cases differ only by one line of code; (taking 'if' inside the cycle will affect performance)
 	 */
@@ -562,7 +544,7 @@ void CalcField (doublecomplex * restrict ebuff, // where to write calculated sca
 	 * the grid.
 	 */
 	 
-#ifndef ADDA_SPARSE //FFT mode
+#ifndef SPARSE //FFT mode
 #define PART1\
 	iy1=iz1=UNDEF;\
 	for (j=0;j<local_nvoid_Ndip;++j) {\
@@ -578,7 +560,7 @@ void CalcField (doublecomplex * restrict ebuff, // where to write calculated sca
 			cMult(expsY[iy2],expsZ[iz2],tmp);\
 		}\
 		cMult(tmp,expsX[ix],a);
-#else //sparse mode
+#else // sparse mode
 #define PART1\
 	iy1=iz1=UNDEF;\
 	for (j=0;j<local_nvoid_Ndip;++j) {\
@@ -597,7 +579,7 @@ void CalcField (doublecomplex * restrict ebuff, // where to write calculated sca
 		}\
 		imExp(-kd*n[0]*ix,expX);\
 		cMult(tmp,expX,a);
-#endif //ADDA_SPARSE
+#endif // SPARSE
 #define PART2\
 	/* sum(P*exp(-ik*r.n)) */\
 		for(i=0;i<3;i++) {\
@@ -1068,13 +1050,18 @@ void Frp_mat(double Finc_tot[static restrict 3],double Fsca_tot[static restrict
 	}
 	// check if it can work at all; check is redundant for sequential mode
 	size_t nRows=MultOverflow(3,nvoid_Ndip,ONE_POS_FUNC);
-	#ifdef PARALLEL
+#ifdef PARALLEL
 	/* Because of the parallelization by row-block decomposition the distributed arrays involved
 	 * need to be gathered on each node a) DipoleCoord -> rdipT; b) pvec -> pT.
 	 * Actually this routine is usually called for two polarizations and rdipT does not change
 	 * between the calls. So one AllGather of rdipT can be removed. Number of memory allocations can
 	 * also be reduced. But this should be replaced by Fourier anyway.
 	 */
+	/* The following is somewhat redundant in sparse mode, since "full" (containing information about all dipoles)
+	 * vectors are already present in that mode. However, we do not optimize it now, since in standard mode radiation
+	 * forces should be computed by FFT anyway. Moreover, there are certain ideas to optimize sparse mode, so it will
+	 * not use full vectors - if done, this improvement can be also adjusted to the code below.
+	 */
 	// allocates a lot of additional memory
 	MALLOC_VECTOR(rdipT,double,nRows,ALL);
 	MALLOC_VECTOR(pT,complex,nRows,ALL);
diff --git a/src/fft.c b/src/fft.c
index 501e3764..46f84760 100644
--- a/src/fft.c
+++ b/src/fft.c
@@ -1,7 +1,6 @@
 /* File: fft.c
  * $Date::                            $
- * Descr: initialization of all FFT for matrix-vector products; and FFT procedures themselves
-
+ * Descr: initialization of all FFT for matrix-vector products; and FFT procedures themselves; not used in sparse mode
  *        TODO: A lot of indirect indexing used - way to optimize.
  *
  * Copyright (C) 2006-2013 ADDA contributors
@@ -26,7 +25,9 @@
 #include "debug.h"
 #include "function.h"
 #include "io.h"
+#include "interaction.h"
 #include "memory.h"
+#include "oclcore.h"
 #include "prec_time.h"
 #include "vars.h"
 // system headers
@@ -34,16 +35,13 @@
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef OPENCL
-#	ifdef CLFFT_AMD
-#		include <clAmdFft.h> //external library from AMD
-#	elif defined(CLFFT_APPLE)
-#		include "cpp/clFFT.h" //nearly unmodified APPLE FFT header file
-#		ifdef NO_CPP
-#			error "Apple clFFT relies on C++ sources, hence is incompatible with NO_CPP option"
-#		endif
+#ifdef CLFFT_AMD
+#	include <clAmdFft.h> //external library from AMD
+#elif defined(CLFFT_APPLE)
+#	include "cpp/clFFT.h" //nearly unmodified APPLE FFT header file
+#	ifdef NO_CPP
+#		error "Apple clFFT relies on C++ sources, hence is incompatible with NO_CPP option"
 #	endif
-#	include "oclcore.h"
 #endif
 /* standard FFT routines (FFTW3 of FFT_TEMPERTON) are required even when OpenCL is used, since
  * they are used for Fourier transform of the D-matrix
@@ -70,17 +68,6 @@
 
 // SEMI-GLOBAL VARIABLES
 
-// defined ant initialized in calculator.c
-extern const double * restrict tab1,* restrict tab2,* restrict tab3,* restrict tab4,* restrict tab5,
-	* restrict tab6,* restrict tab7,* restrict tab8,* restrict tab9,* restrict tab10;
-extern const int * restrict * restrict tab_index;
-
-// defined and initialized in make_particle.c
-extern double gridspace;
-
-// defined and initialized in param.c
-extern double igt_lim, igt_eps;
-
 // defined and initialized in timing.c
 extern TIME_TYPE Timing_FFT_Init,Timing_Dm_Init;
 
@@ -132,17 +119,6 @@ void cfft99_(double * restrict data,double * restrict _work,const double * restr
 	const int *isign);
 #endif
 
-// EXTERNAL FUNCTIONS
-
-#ifndef NO_FORTRAN
-// fort/propaesplibreintadda.f
-void propaespacelibreintadda_(const double *Rij,const double *ka,const double *arretecube,
-	const double *relreq, double *result);
-#endif
-
-// sinint.c
-void cisi(double x,double *ci,double *si);
-
 //============================================================
 
 INLINE size_t IndexDmatrix(const size_t x,size_t y,size_t z)
@@ -373,7 +349,7 @@ void fftY(const int isign)
 	if (isign==FFT_FORWARD) fftw_execute(planYf);
 	else fftw_execute(planYb);
 #elif defined(FFT_TEMPERTON)
-	int nn=gridY,inc=1,jump=nn,lot=3*gridZ,j;
+	int nn=gridY,inc=1,jump=nn,lot=3*gridZ;
 
 	cfft99_((double *)(slices_tr),work,trigsY,ifaxY,&inc,&jump,&nn,&lot,&isign);
 #endif
@@ -740,532 +716,6 @@ static void fftInitAfterD(void)
 
 //============================================================
 
-INLINE bool TestTableSize(const double rn)
-// tests if rn fits into the table; if not, returns false and produces info message
-{
-	static bool warned=false;
-
-	if (rn>TAB_RMAX) {
-		if (!warned) {
-			warned=true;
-			LogWarning(EC_INFO,ONE_POS,"Not enough table size (available only up to R/d=%d), "
-				"so O(kd^2) accuracy of Green's function is not guaranteed",TAB_RMAX);
-		}
-		return false;
-	}
-	else return true;
-}
-
-//============================================================
-
-static void CalcInterTerm(const int i,const int j,const int k,doublecomplex * restrict result)
-/* calculates interaction term between two dipoles; given integer distance vector {i,j,k}
- * (in units of d). All six components of the symmetric matrix are computed at once.
- */
-{
-	double rr,qvec[3],q2[3],invr3,qavec[3],av[3];
-	double kr,kr2,kr3,kd2,q4,rn,rn2;
-	double temp,qmunu[6],qa,qamunu[6],invrn,invrn2,invrn3,invrn4;
-	double dmunu[6]; // KroneckerDelta[mu,nu] - can serve both as multiplier, and as bool
-	double kfr,ci,si,ci1,si1,ci2,si2,brd,cov,siv,g0,g2;
-	doublecomplex expval,br,br1,m,m2,Gf1,Gm0,Gm1,Gc1,Gc2;
-	int ind0,ind1,ind2,ind2m,ind3,ind4,indmunu,comp,mu,nu,mu1,nu1;
-	int sigV[3],ic,sig,ivec[3],ord[3],invord[3];
-	double t3q,t3a,t4q,t4a,t5tr,t5aa,t6tr,t6aa;
-	const bool inter_avg=true; // temporary fixed option for SO formulation
-
-// this is used for debugging, should be empty define, when not required
-#define PRINT_GVAL /*printf("%d,%d,%d: %g%+gi, %g%+gi, %g%+gi,\n%g%+gi, %g%+gi, %g%+gi\n",\
-	i,j,k,result[0][RE],result[0][IM],result[1][RE],result[1][IM],result[2][RE],result[2][IM],\
-	result[3][RE],result[3][IM],result[4][RE],result[4][IM],result[5][RE],result[5][IM]);*/
-
-	// self interaction; self term is computed in different subroutine
-	if (i==0 && j==0 && k==0) for (comp=0;comp<NDCOMP;comp++) {
-		result[comp][RE]=result[comp][IM]=0;
-		return;
-	}
-	//====== calculate some basic constants ======
-	qvec[0]=i; // qvec is normalized below (after IGT)
-	qvec[1]=j;
-	qvec[2]=k;
-	rn2=DotProd(qvec,qvec);
-	rn=sqrt(rn2); // normalized r
-#ifndef NO_FORTRAN
-	if (IntRelation==G_IGT && (igt_lim==UNDEF || rn<=igt_lim)) { // a special case
-		double rtemp[3];
-		vMultScal(gridspace,qvec,rtemp);
-		propaespacelibreintadda_(rtemp,&WaveNum,&gridspace,&igt_eps,(double *)result);
-		PRINT_GVAL;
-		return;
-	}
-#endif
-	// a common part of the code (up to FCD...), which effectively implements G_POINT_DIP
-	invrn=1/rn;
-	vMultScalSelf(invrn,qvec); // finalize qvec
-	rr=rn*gridspace;
-	invr3=1/(rr*rr*rr);
-	kr=WaveNum*rr;
-	kr2=kr*kr;
-	kfr=PI*rn; // k_F*r, for FCD
-	// cov=cos(kr); siv=sin(kr); expval=Exp(ikr)/r^3
-	imExp(kr,expval);
-	cov=expval[RE];
-	siv=expval[IM];
-	cMultReal(invr3,expval,expval);
-	//====== calculate Gp ========
-	for (mu=0,comp=0;mu<3;mu++) for (nu=mu;nu<3;nu++,comp++) {
-		dmunu[comp]= mu==nu ? 1 : 0;
-		qmunu[comp]=qvec[mu]*qvec[nu];
-		// br=delta[mu,nu]*(-1+ikr+kr^2)-qmunu*(-3+3ikr+kr^2)
-		br[RE]=(3-kr2)*qmunu[comp];
-		br[IM]=-3*kr*qmunu[comp];
-		if(dmunu[comp]) {
-			br[RE]+=kr2-1;
-			br[IM]+=kr;
-		}
-		// result=Gp=expval*br
-		cMult(br,expval,result[comp]);
-	}
-	//====== FCD (static and full) ========
-	/* speed of FCD can be improved by using faster version of sici routine, using predefined
-	 * tables, etc (e.g. as is done in GSL library). But currently extra time for this computation
-	 * is already smaller than one main iteration.
-	 */
-	if (IntRelation==G_FCD_ST) {
-		/* FCD is based on Gay-Balmaz P., Martin O.J.F. "A library for computing the filtered and
-		 * non-filtered 3D Green's tensor associated with infinite homogeneous space and surfaces",
-		 * Comp. Phys. Comm. 144:111-120 (2002), and Piller N.B. "Increasing the performance of the
-		 * coupled-dipole approximation: A spectral approach", IEEE Trans.Ant.Propag. 46(8):
-		 * 1126-1137. Here it differs by a factor of 4*pi*k^2.
-		 */
-		// result = Gp*[3*Si(k_F*r)+k_F*r*cos(k_F*r)-4*sin(k_F*r)]*2/(3*pi)
-		cisi(kfr,&ci,&si);
-		brd=TWO_OVER_PI*ONE_THIRD*(3*si+kfr*cos(kfr)-4*sin(kfr));
-		for (comp=0;comp<NDCOMP;comp++) cMultReal(brd,result[comp],result[comp]);
-	}
-	else if (IntRelation==G_FCD) {
-		// ci,si_1,2 = ci,si_+,- = Ci,Si((k_F +,- k)r)
-		cisi(kfr+kr,&ci1,&si1);
-		cisi(kfr-kr,&ci2,&si2);
-		// ci=ci1-ci2; si=pi-si1-si2
-		ci=ci1-ci2;
-		si=PI-si1-si2;
-		g0=INV_PI*(siv*ci+cov*si);
-		g2=INV_PI*(kr*(cov*ci-siv*si)+2*ONE_THIRD*(kfr*cos(kfr)-4*sin(kfr)))-g0;
-		temp=g0*kr2;
-		for (comp=0;comp<NDCOMP;comp++) {
-			// brd=(delta[mu,nu]*(-g0*kr^2-g2)+qmunu*(g0*kr^2+3g2))/r^3
-			brd=qmunu[comp]*(temp+3*g2);
-			if (dmunu[comp]) brd-=temp+g2;
-			brd*=invr3;
-			// result=Gp+brd
-			result[comp][RE]+=brd;
-		}
-	}
-	//======= second order corrections ========
-	else if (IntRelation==G_IGT_SO) {
-		/* There is still some space for speed optimization here (e.g. move mu,nu-independent
-		 * operations out of the cycles over components).
-		 */
-		kd2=kd*kd;
-		if (kr*rn < G_BOUND_CLOSE && TestTableSize(rn)) {
-			//====== G close for IGT =============
-			ivec[0]=i;
-			ivec[1]=j;
-			ivec[2]=k;
-			// transformation of negative coordinates
-			for (ic=0;ic<3;ic++) {
-				if (ivec[ic]<0) {
-					sigV[ic]=-1;
-					qvec[ic]*=-1;
-					ivec[ic]*=-1;
-				}
-				else sigV[ic]=1;
-			}
-			// transformation to case i>=j>=k>=0
-			// building of ord; ord[x] is x-th largest coordinate (0-th - the largest)
-			if (ivec[0]>=ivec[1]) { // i>=j
-				if (ivec[0]>=ivec[2]) { // i>=k
-					ord[0]=0;
-					if (ivec[1]>=ivec[2]) { // j>=k
-						ord[1]=1;
-						ord[2]=2;
-					}
-					else {
-						ord[1]=2;
-						ord[2]=1;
-					}
-				}
-				else {
-					ord[0]=2;
-					ord[1]=0;
-					ord[2]=1;
-				}
-			}
-			else {
-				if (ivec[0]>=ivec[2]) { // i>=k
-					ord[0]=1;
-					ord[1]=0;
-					ord[2]=2;
-				}
-				else {
-					ord[2]=0;
-					if (ivec[1]>=ivec[2]) { // j>=k
-						ord[0]=1;
-						ord[1]=2;
-					}
-					else {
-						ord[0]=2;
-						ord[1]=1;
-					}
-				}
-			}
-			// change parameters according to coordinate transforms
-			Permutate(qvec,ord);
-			Permutate_i(ivec,ord);
-			// compute inverse permutation
-			memcpy(invord,ord,3*sizeof(int));
-			Permutate_i(invord,ord);
-			if (invord[0]==0 && invord[1]==1 && invord[2]==2) memcpy(invord,ord,3*sizeof(int));
-			// temp = kr/24; and set some indices
-			temp=kr/24;
-			ind0=tab_index[ivec[0]][ivec[1]]+ivec[2];
-			ind1=3*ind0;
-			ind2m=6*ind0;
-			// cycle over tensor components
-			for (mu=0,comp=0;mu<3;mu++) for (nu=mu;nu<3;nu++,comp++) {
-				sig=sigV[mu]*sigV[nu]; // sign of some terms below
-				/* indexes for tables of different dimensions based on transformed indices mu and nu
-				 * '...munu' variables are invariant to permutations because both constituent
-				 * vectors and indices are permutated. So this variables can be used, as precomputed
-				 * above.
-				 */
-				mu1=invord[mu];
-				nu1=invord[nu];
-				/* indmunu is a number of component[mu,nu] in a symmetric matrix, but counted
-				 * differently than comp. This is {{0,1,3},{1,2,4},{3,4,5}}
-				 */
-				indmunu=mu1+nu1;
-				if (mu1==2 || nu1==2) indmunu++;
-				ind2=ind2m+indmunu;
-				ind3=3*ind2;
-				ind4=6*ind2;
-				// computing several quantities with table integrals
-				t3q=DotProd(qvec,tab3+ind1);
-				t4q=DotProd(qvec,tab4+ind3);
-				t5tr=TrSym(tab5+ind2m);
-				t6tr=TrSym(tab6+ind4);
-				//====== computing Gc0 =====
-				/* br = delta[mu,nu]*(-I7-I9/2-kr*(i+kr)/24+2*t3q+t5tr)
-				 *    - (-3I8[mu,nu]-3I10[mu,nu]/2-qmunu*kr*(i+kr)/24+2*t4q+t6tr)
-				 */
-				br[RE]=sig*(3*(tab10[ind2]/2+tab8[ind2])-2*t4q-t6tr)+temp*qmunu[comp]*kr;
-				br[IM]=3*temp*qmunu[comp];
-				if (dmunu[comp]) {
-					br[RE]+=2*t3q+t5tr-temp*kr-tab9[ind0]/2-tab7[ind0];
-					br[IM]-=temp;
-				}
-				// br*=kd^2
-				cMultReal(kd2,br,br);
-				// br+=I1*delta[mu,nu]*(-1+ikr+kr^2)-sig*I2[mu,nu]*(-3+3ikr+kr^2)
-				br[RE]+=sig*tab2[ind2]*(3-kr2);
-				br[IM]-=sig*tab2[ind2]*3*kr;
-				if (dmunu[comp]) {
-					br[RE]+=tab1[ind0]*(kr2-1);
-					br[IM]+=tab1[ind0]*kr;
-				}
-				// Gc0=expval*br
-				cMult(expval,br,result[comp]);
-			}
-		}
-		else {
-			//====== Gfar (and part of Gmedian) for IGT =======
-			// Gf0 = Gp*(1-kd^2/24)
-			for (comp=0;comp<NDCOMP;comp++) cMultReal(1-kd2/24,result[comp],result[comp]);
-			if (kr < G_BOUND_MEDIAN) {
-				//===== G median for IGT ========
-				vSquare(qvec,q2);
-				q4=DotProd(q2,q2);
-				invrn2=invrn*invrn;
-				invrn3=invrn2*invrn;
-				invrn4=invrn2*invrn2;
-				for (mu=0,comp=0;mu<3;mu++) for (nu=mu;nu<3;nu++,comp++) {
-					// Gm0=expval*br*temp; temp is set anew
-					temp=qmunu[comp]*(33*q4-7-12*(q2[mu]+q2[nu]));
-					if (mu == nu) temp+=(1-3*q4+4*q2[mu]);
-					temp*=7*invrn4/64;
-					br[RE]=-1;
-					br[IM]=kr;
-					cMultReal(temp,br,Gm0);
-					cMultSelf(Gm0,expval);
-					// result = Gf + Gm0 + [ Gm1 ]
-					cAdd(Gm0,result[comp],result[comp]);
-				}
-			}
-		}
-	}
-	else if (IntRelation==G_SO) {
-		/* There is still some space for speed optimization here (e.g. move mu,nu-independent
-		 * operations out of the cycles over components). But now extra time is equivalent to 2-3
-		 * main iterations. So first priority is to make something useful out of SO.
-		 */
-		// next line should never happen
-		if (anisotropy) LogError(ONE_POS,"Incompatibility error in CalcInterTerm");
-		kd2=kd*kd;
-		kr3=kr2*kr;
-		// only one refractive index can be used for FFT-compatible algorithm
-		cEqual(ref_index[0],m);
-		cSquare(m,m2);
-		if (!inter_avg) {
-			qa=DotProd(qvec,prop);
-			for (mu=0,comp=0;mu<3;mu++) for (nu=mu;nu<3;nu++,comp++) {
-				// qamunu=qvec[mu]*prop[nu] + qvec[nu]*prop[mu]
-				qamunu[comp]=qvec[mu]*prop[nu];
-				if (dmunu[comp]) qamunu[comp]*=2;
-				else qamunu[comp]+=qvec[nu]*prop[mu];
-			}
-		}
-		if (kr*rn < G_BOUND_CLOSE && TestTableSize(rn)) {
-			//====== G close =============
-			// av is copy of propagation vector
-			if (!inter_avg) memcpy(av,prop,3*sizeof(double));
-			ivec[0]=i;
-			ivec[1]=j;
-			ivec[2]=k;
-			// transformation of negative coordinates
-			for (ic=0;ic<3;ic++) {
-				if (ivec[ic]<0) {
-					sigV[ic]=-1;
-					av[ic]*=-1;
-					qvec[ic]*=-1;
-					ivec[ic]*=-1;
-				}
-				else sigV[ic]=1;
-			}
-			// transformation to case i>=j>=k>=0
-			// building of ord; ord[x] is x-th largest coordinate (0-th - the largest)
-			if (ivec[0]>=ivec[1]) { // i>=j
-				if (ivec[0]>=ivec[2]) { // i>=k
-					ord[0]=0;
-					if (ivec[1]>=ivec[2]) { // j>=k
-						ord[1]=1;
-						ord[2]=2;
-					}
-					else {
-						ord[1]=2;
-						ord[2]=1;
-					}
-				}
-				else {
-					ord[0]=2;
-					ord[1]=0;
-					ord[2]=1;
-				}
-			}
-			else {
-				if (ivec[0]>=ivec[2]) { // i>=k
-					ord[0]=1;
-					ord[1]=0;
-					ord[2]=2;
-				}
-				else {
-					ord[2]=0;
-					if (ivec[1]>=ivec[2]) { // j>=k
-						ord[0]=1;
-						ord[1]=2;
-					}
-					else {
-						ord[0]=2;
-						ord[1]=1;
-					}
-				}
-			}
-			// change parameters according to coordinate transforms
-			Permutate(qvec,ord);
-			if (!inter_avg) Permutate(av,ord);
-			Permutate_i(ivec,ord);
-			// compute inverse permutation
-			memcpy(invord,ord,3*sizeof(int));
-			Permutate_i(invord,ord);
-			if (invord[0]==0 && invord[1]==1 && invord[2]==2) memcpy(invord,ord,3*sizeof(int));
-			// temp = kr/24; and set some indices
-			temp=kr/24;
-			ind0=tab_index[ivec[0]][ivec[1]]+ivec[2];
-			ind1=3*ind0;
-			ind2m=6*ind0;			// cycle over tensor components
-			for (mu=0,comp=0;mu<3;mu++) for (nu=mu;nu<3;nu++,comp++) {
-				sig=sigV[mu]*sigV[nu]; // sign of some terms below
-				/* indexes for tables of different dimensions based on transformed indices mu and nu
-				 * '...munu' variables are invariant to permutations because both constituent
-				 * vectors and indices are permutated. So this variables can be used, as precomputed
-				 * above.
-				 */
-				mu1=invord[mu];
-				nu1=invord[nu];
-				/* indmunu is a number of component[mu,nu] in a symmetric matrix, but counted
-				 * differently than comp. This is {{0,1,3},{1,2,4},{3,4,5}}
-				 */
-				indmunu=mu1+nu1;
-				if (mu1==2 || nu1==2) indmunu++;
-				ind2=ind2m+indmunu;
-				ind3=3*ind2;
-				ind4=6*ind2;
-				// computing several quantities with table integrals
-				t3q=DotProd(qvec,tab3+ind1);
-				t4q=DotProd(qvec,tab4+ind3);
-				t5tr=TrSym(tab5+ind2m);
-				t6tr=TrSym(tab6+ind4);
-				if (inter_avg) {
-					// <a[mu]*a[nu]>=1/3*delta[mu,nu]
-					t5aa=ONE_THIRD*t5tr;
-					t6aa=ONE_THIRD*t6tr;
-				}
-				else {
-					t3a=DotProd(av,tab3+ind1);
-					t4a=DotProd(av,tab4+ind3);
-					t5aa=QuadForm(tab5+ind2m,av);
-					t6aa=QuadForm(tab6+ind4,av);
-				}
-				//====== computing Gc0 =====
-				/* br = delta[mu,nu]*(-I7-I9/2-kr*(i+kr)/24+2*t3q+t5tr)
-				 *    - (-3I8[mu,nu]-3I10[mu,nu]/2-qmunu*kr*(i+kr)/24+2*t4q+t6tr)
-				 */
-				br[RE]=sig*(3*(tab10[ind2]/2+tab8[ind2])-2*t4q-t6tr)+temp*qmunu[comp]*kr;
-				br[IM]=3*temp*qmunu[comp];
-				if (dmunu[comp]) {
-					br[RE]+=2*t3q+t5tr-temp*kr-tab9[ind0]/2-tab7[ind0];
-					br[IM]-=temp;
-				}
-				// br*=kd^2
-				cMultReal(kd2,br,br);
-				// br+=I1*delta[mu,nu]*(-1+ikr+kr^2)-sig*I2[mu,nu]*(-3+3ikr+kr^2)
-				br[RE]+=sig*tab2[ind2]*(3-kr2);
-				br[IM]-=sig*tab2[ind2]*3*kr;
-				if (dmunu[comp]) {
-					br[RE]+=tab1[ind0]*(kr2-1);
-					br[IM]+=tab1[ind0]*kr;
-				}
-				// Gc0=expval*br
-				cMult(expval,br,result[comp]);
-				//==== computing Gc1 ======
-				if (!inter_avg) {
-					// br=(kd*kr/24)*(qa*(delta[mu,nu]*(-2+ikr)-qmunu*(-6+ikr))-qamunu)
-					br[RE]=6*qmunu[comp];
-					br[IM]=-kr*qmunu[comp];
-					if (dmunu[comp]) {
-						br[RE]-=2;
-						br[IM]+=kr;
-					}
-					cMultReal(qa,br,br);
-					br[RE]-=qamunu[comp];
-					cMultReal(2*temp*kd,br,br);
-					//  br1=(d/r)*(delta[mu,nu]*t3h*(-1+ikr)-sig*t4h*(-3+3ikr))
-					br1[RE]=3*sig*t4a;
-					br1[IM]=-kr*br1[RE];
-					if (dmunu[comp]) {
-						br1[RE]-=t3a;
-						br1[IM]+=t3a*kr;
-					}
-					cMultReal(1/rn,br1,br1);
-					// Gc1=expval*i*m*kd*(br1+br)
-					cAdd(br,br1,Gc1);
-					cMultSelf(Gc1,m);
-					cMultReal(kd,Gc1,Gc1);
-					cMultSelf(Gc1,expval);
-					cMult_i(Gc1);
-				}
-				//==== computing Gc2 ======
-				// br=delta[mu,nu]*t5aa-3*sig*t6aa-(kr/12)*(delta[mu,nu]*(i+kr)-qmunu*(3i+kr))
-				br[RE]=-kr*qmunu[comp];
-				br[IM]=-3*qmunu[comp];
-				if (dmunu[comp]) {
-					br[RE]+=kr;
-					br[IM]+=1;
-				}
-				cMultReal(-2*temp,br,br);
-				br[RE]-=3*sig*t6aa;
-				if (dmunu[comp]) br[RE]+=t5aa;
-				// Gc2=expval*(kd^2/2)*m^2*br
-				cMult(m2,br,Gc2);
-				cMultReal(kd2/2,Gc2,Gc2);
-				cMultSelf(Gc2,expval);
-				// result = Gc0 + [ Gc1 ] + Gc2
-				if (!inter_avg) cAdd(Gc2,Gc1,Gc2);
-				cAdd(Gc2,result[comp],result[comp]);
-			}
-		}
-		else {
-			//====== Gfar (and part of Gmedian) =======
-			// temp=kd^2/24
-			temp=kd2/24;
-			// br=1-(1+m^2)*kd^2/24
-			br[RE]=1-(1+m2[RE])*temp;
-			br[IM]=-m2[IM]*temp;
-			// Gf0 + Gf2 = Gp*br
-			for (comp=0;comp<NDCOMP;comp++) cMultSelf(result[comp],br);
-			//==== compute and add Gf1 ===
-			if (!inter_avg) for (comp=0;comp<NDCOMP;comp++) {
-				/* br = {delta[mu,nu]*(3-3ikr-2kr^2+ikr^3)-qmunu*(15-15ikr-6kr^2+ikr^3)}*qa
-				 *    + qamunu*(3-3ikr-kr^2)
-				 */
-				br[RE]=(6*kr2-15)*qmunu[comp];
-				br[IM]=(15*kr-kr3)*qmunu[comp];
-				if(dmunu[comp]) {
-					br[RE]+=3-2*kr2;
-					br[IM]+=kr3-3*kr;
-				}
-				cMultReal(qa,br,br);
-				br[RE]+=(3-kr2)*qamunu[comp];
-				br[IM]-=3*kr*qamunu[comp];
-				// Gf1=expval*i*m*temp*br
-				cMult(m,br,Gf1);
-				cMultReal(temp*2/kr,Gf1,Gf1);
-				cMultSelf(Gf1,expval);
-				cMult_i(Gf1);
-				// result = Gf
-				cAdd(Gf1,result[comp],result[comp]);
-			}
-			if (kr < G_BOUND_MEDIAN) {
-				//===== G median ========
-				vSquare(qvec,q2);
-				q4=DotProd(q2,q2);
-				invrn2=invrn*invrn;
-				invrn3=invrn2*invrn;
-				invrn4=invrn2*invrn2;
-				for (mu=0,comp=0;mu<3;mu++) for (nu=mu;nu<3;nu++,comp++) {
-					// Gm0=expval*br*temp; temp is set anew
-					temp=qmunu[comp]*(33*q4-7-12*(q2[mu]+q2[nu]));
-					if (mu == nu) temp+=(1-3*q4+4*q2[mu]);
-					temp*=7*invrn4/64;
-					br[RE]=-1;
-					br[IM]=kr;
-					cMultReal(temp,br,Gm0);
-					cMultSelf(Gm0,expval);
-					if (!inter_avg) {
-						// Gm1=expval*i*m*temp; temp is set anew
-						vMult(qvec,prop,qavec);
-						temp = 3*qa*(dmunu[comp]-7*qmunu[comp]) + 6*dmunu[comp]*qvec[mu]*prop[mu]
-							 - 7*(dmunu[comp]-9*qmunu[comp])*DotProd(qavec,q2)
-							 + 3*(prop[mu]*qvec[nu]*(1-7*q2[mu])+prop[nu]*qvec[mu]*(1-7*q2[nu]));
-						temp*=kd*invrn3/48;
-						cMultReal(temp,m,Gm1);
-						cMult_i(Gm1);
-						cMultSelf(Gm1,expval);
-						// add Gm1 to Gm0
-						cAdd(Gm0,Gm1,Gm0);
-					}
-					// result = Gf + Gm0 + [ Gm1 ]
-					cAdd(Gm0,result[comp],result[comp]);
-				}
-			}
-		}
-	}
-	PRINT_GVAL;
-}
-#undef PRINT_GVAL
-
-//============================================================
-
 void InitDmatrix(void)
 /* Initializes the matrix D. D[i][j][k]=A[i1-i2][j1-j2][k1-k2]. Actually D=-FFT(G)/Ngrid.
  * Then -G.x=invFFT(D*FFT(x)) for practical implementation of FFT such that invFFT(FFT(x))=Ngrid*x.
@@ -1478,7 +928,7 @@ void InitDmatrix(void)
 	 * indexing) component-wise (in cycle over NDCOMP)
 	 */
 
-	/* fill Dmatrix with 0, this if to fill the possible gap between e.g. boxY and gridY/2;
+	/* fill Dmatrix with 0, this if to fill the possible gap between e.g. boxY and gridY/2; (and for R=0)
 	 * probably faster than using a lot of conditionals
 	 */
 	for (ind=0;ind<Dsize;ind++) Dmatrix[ind][RE]=Dmatrix[ind][IM]=0;
@@ -1489,7 +939,11 @@ void InitDmatrix(void)
 		else kcor=k;
 		for (j=jstart;j<boxY;j++) for (i=1-boxX;i<boxX;i++) {
 			index=NDCOMP*IndexD2matrix(i,j,k,nnn);
-			CalcInterTerm(i,j,kcor,Dmatrix+index);
+			/* The test for zero distance is somewhat non-optimal. However, other alternatives are not perfect either:
+			 * 1) complicate the loops to remove the zero element in the beginning (move tests to the upper level)
+			 * 2) call the function with zero - it will produce NaN. Then set this element to zero after the loop.
+			 */
+			if (i!=0 || j!=0 || kcor!=0) (*CalcInterTerm)(i,j,kcor,Dmatrix+index);
 		}
 	} // end of i,j,k loop
 	if (IFROOT) printf("Fourier transform of Dmatrix");
diff --git a/src/fft.h b/src/fft.h
index 9ce43825..3cd9804e 100644
--- a/src/fft.h
+++ b/src/fft.h
@@ -1,6 +1,6 @@
 /* File: fft.h
  * $Date::                            $
- * Descr: definitions of FFT parameters and routines
+ * Descr: definitions of FFT parameters and routines; void in sparse mode
  *
  * Copyright (C) 2006,2008,2010-2013 ADDA contributors
  * This file is part of ADDA.
@@ -16,11 +16,11 @@
  * You should have received a copy of the GNU General Public License along with ADDA. If not, see
  * <http://www.gnu.org/licenses/>.
  */
+#ifndef SPARSE
+
 #ifndef __fft_h
 #define __fft_h
 
-#ifndef ADDA_SPARSE //not needed in sparse mode
-
 #ifndef FFT_TEMPERTON
 #	define FFTW3 // FFTW3 is default
 #endif
@@ -45,6 +45,6 @@ void Free_FFT_Dmat(void);
 int fftFit(int size, int _div);
 void CheckNprocs(void);
 
-#endif // ADDA_SPARSE
-
 #endif // __fft_h
+
+#endif // !SPARSE
diff --git a/src/interaction.c b/src/interaction.c
index 4a8fd15a..bce978b2 100644
--- a/src/interaction.c
+++ b/src/interaction.c
@@ -1,227 +1,193 @@
 /* FILE : interaction.c
  * Descr: the functions used to calculate the interaction term
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2011-2013 ADDA contributors
  * This file is part of ADDA.
  *
- * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
- * General Public License as published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
+ * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  *
- * ADDA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
- * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
- * Public License for more details.
+ * ADDA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License along with ADDA. If not, see
  * <http://www.gnu.org/licenses/>.
  */
-
+#include "const.h" // keep this first
+#include "interaction.h" // corresponding headers
+// project headers
 #include "cmplx.h"
-#include "const.h"
-#include "interaction.h"
 #include "io.h"
-#include "types.h"
+#include "memory.h"
 #include "vars.h"
-#include "param.h"
 
-// defined and initialized in calculator.c
-extern const double * restrict tab1,* restrict tab2,* restrict tab3,* restrict tab4,* restrict tab5,
-	* restrict tab6,* restrict tab7,* restrict tab8,* restrict tab9,* restrict tab10;
-extern const int * restrict * restrict tab_index;
+// SEMI-GLOBAL VARIABLES
+
+// defined and initialized in make_particle.c
 extern double gridspace;
-extern double igt_lim, igt_eps;
 
-#ifndef NO_FORTRAN
-// fort/propaesplibreintadda.f
-void propaespacelibreintadda_(const double *Rij,const double *ka,const double *arretecube,
-	const double *relreq, double *result);
-#endif
+// defined and initialized in param.c
+extern double igt_lim,igt_eps;
+
+// LOCAL VARIABLES
 
-//This function pointer...
-void (*CalcInterTerm)(const int i,const int j,const int k,doublecomplex * restrict result);
-//...will point to one of the functions below
-void CalcInterTerm_opt(const int i,const int j,const int k,doublecomplex * restrict result);
-void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex * restrict result);
+// tables of integrals
+static double * restrict tab1,* restrict tab2,* restrict tab3,* restrict tab4,* restrict tab5,* restrict tab6,
+	* restrict tab7,* restrict tab8,* restrict tab9,* restrict tab10;
+/* it is preferable to declare the following as "* restrict * restrict", but it is hard to make it
+* generally compatible with Free_iMatrix function syntax.
+*/
+static int ** restrict tab_index; // matrix for indexing of table arrays
 
 #ifdef USE_SSE3
-__m128d c1, c2, c3, zo, inv_2pi, p360, prad_to_deg;
-__m128d exptbl[361];
+static __m128d c1, c2, c3, zo, inv_2pi, p360, prad_to_deg;
+static __m128d exptbl[361];
 #endif //USE_SSE3
 
-#define M_PI 3.14159265358979323846
+// EXTERNAL FUNCTIONS
 
-void InitInteraction(void) 
-/*
-	Initialize the interaction calculations
-*/
-{
-	if (IntRelation==G_POINT_DIP) {
-		CalcInterTerm = CalcInterTerm_opt;
-	} else {
-		CalcInterTerm = CalcInterTerm_complete;
-	}
-
-	#ifdef USE_SSE3
-	c1 = _mm_set_pd(1.34959795251974073996e-11,3.92582397764340914444e-14);
-	c2 = _mm_set_pd(-8.86096155697856783296e-7,-3.86632385155548605680e-9);
-	c3 = _mm_set_pd(1.74532925199432957214e-2,1.52308709893354299569e-4);
-	zo = _mm_set_pd(0.0,1.0);
-	inv_2pi = _mm_set_sd(1.0/(2*M_PI));
-	p360 = _mm_set_sd(360.0);
-	prad_to_deg = _mm_set_sd(180.0/M_PI);
-	
-	for (unsigned int i=0; i<=360; i++) {
-		double x = M_PI/180.0*(double)i; 
-		exptbl[i] = _mm_set_pd(sin(x),cos(x));
-	}
-	#endif
-}
+#ifndef NO_FORTRAN
+// fort/propaesplibreintadda.f
+void propaespacelibreintadda_(const double *Rij,const double *ka,const double *arretecube,const double *relreq,
+	double *result);
+#endif
+// sinint.c
+void cisi(double x,double *ci,double *si);
 
+// this is used for debugging, should be empty define, when not required
+#define PRINT_GVAL /*printf("%d,%d,%d: %g%+gi, %g%+gi, %g%+gi,\n%g%+gi, %g%+gi, %g%+gi\n",\
+	i,j,k,result[0][RE],result[0][IM],result[1][RE],result[1][IM],result[2][RE],result[2][IM],\
+	result[3][RE],result[3][IM],result[4][RE],result[4][IM],result[5][RE],result[5][IM]);*/
 
 #ifdef USE_SSE3
 
-static inline __m128d accImExp(double x, double c)
-/*
-   Accelerated sin-cos (or imaginary exp) routine for use in the calculation of the 
-   interaction tensor. Returns c*exp(ix) in the resultant vector. The code is adapted
-   from the CEPHES library. The idea is that we have precalculated exp(iy) for some
-   discrete values of y. Then we take the y that is nearest to our x and write
-   exp(ix)=exp(iy+(ix-iy))=exp(iy)exp(i(x-y)). We take exp(y) from the table and 
-   exp(i(x-y)) from the Taylor series. This converges very fast since |x-y| is small.
-*/
+//=====================================================================================================================
+
+static inline __m128d accImExp(double x,double c)
+/* Accelerated sin-cos (or imaginary exp) routine for use in the calculation of the interaction tensor. Returns
+ * c*exp(ix) in the resultant vector. The code is adapted from the CEPHES library. The idea is that we have
+ * precalculated exp(iy) for some discrete values of y. Then we take the y that is nearest to our x and write
+ * exp(ix)=exp(iy+(ix-iy))=exp(iy)exp(i(x-y)). We take exp(y) from the table and exp(i(x-y)) from the Taylor series.
+ * This converges very fast since |x-y| is small.
+ */
 {
 	__m128d px = _mm_set_sd(x);
 	__m128d ipx = _mm_mul_pd(px,inv_2pi);
-	int ix = _mm_cvttsd_si32(ipx); //truncation so the function only works for 0 <= x < 2*pi*2^-31
+	int ix = _mm_cvttsd_si32(ipx); // truncation so the function only works for 0 <= x < 2*pi*2^-31
 	ipx = _mm_cvtsi32_sd(ipx,ix);
 	ipx = _mm_mul_pd(p360,ipx);
 	px = _mm_mul_pd(prad_to_deg,px);
-	px = _mm_sub_pd(px,ipx); //this is x (deg) mod 360
-	
-	ix = _mm_cvtsd_si32(px); //the closest integer (rounded, not truncated)
-	__m128d scx = exptbl[ix]; //the tabulated value
-	
+	px = _mm_sub_pd(px,ipx); // this is x (deg) mod 360
+
+	ix = _mm_cvtsd_si32(px); // the closest integer (rounded, not truncated)
+	__m128d scx = exptbl[ix]; // the tabulated value
+
 	ipx = _mm_cvtsi32_sd(ipx,ix);
-	__m128d pz = _mm_sub_pd(ipx,px); //the residual -z=(ix-x)	
+	__m128d pz = _mm_sub_pd(ipx,px); // the residual -z=(ix-x)
 	__m128d py = _mm_mul_pd(pz,pz);	
-	__m128d yy = _mm_shuffle_pd(py,py,0); //now (y,y)		
+	__m128d yy = _mm_shuffle_pd(py,py,0); // now (y,y)
 	__m128d zy = _mm_shuffle_pd(yy,pz,1);	
-	
-	__m128d scz = _mm_mul_pd(c1,yy);	//taylor series approximation
+
+	__m128d scz = _mm_mul_pd(c1,yy);	// Taylor series approximation
 	scz = _mm_add_pd(c2,scz);
 	scz = _mm_mul_pd(yy,scz);
 	scz = _mm_add_pd(c3,scz);
 	scz = _mm_mul_pd(zy,scz);
 	scz = _mm_sub_pd(zo,scz);
-	scx = cmul(scz,scx); //multiply lookup and approximation
-  	return _mm_mul_pd(_mm_set1_pd(c),scx); 
+	scx = cmul(scz,scx); // multiply lookup and approximation
+	return _mm_mul_pd(_mm_set1_pd(c),scx);
 }
 
+//=====================================================================================================================
+
 void CalcInterTerm_opt(const int i,const int j,const int k,doublecomplex * restrict result)
-/* calculates interaction term between two dipoles; given integer distance vector {i,j,k}
- * (in units of d). All six components of the symmetric matrix are computed at once.
- * The elements in result are: [G11, G12, G13, G22, G23, G33]
+/* calculates interaction term between two dipoles; given integer distance vector {i,j,k} (in units of d). All six
+ * components of the symmetric matrix are computed at once. The elements in result are: [G11, G12, G13, G22, G23, G33]
+ * Only 'poi' interaction is implemented in this optimized routine
  */
 {	
-
-// this is used for debugging, should be empty define, when not required
-#define PRINT_GVAL /*printf("%d,%d,%d: %g%+gi, %g%+gi, %g%+gi,\n%g%+gi, %g%+gi, %g%+gi\n",\
-	i,j,k,result[0][RE],result[0][IM],result[1][RE],result[1][IM],result[2][RE],result[2][IM],\
-	result[3][RE],result[3][IM],result[4][RE],result[4][IM],result[5][RE],result[5][IM]);*/
-	
 	double qx = i;
 	double qy = j;
 	double qz = k;
-	
+
 	const double rn = sqrt(qx*qx + qy*qy + qz*qz);
-		
 	const double rr=rn*gridspace;
 	const double invr3=1.0/(rr*rr*rr);
 	const double kr=WaveNum*rr;	
 	const __m128d sc = accImExp(kr,invr3);
-	
 	const double invrn = 1.0/rn;
 	qx *= invrn; qy *= invrn; qz *= invrn;
 	const double qxx=qx*qx, qxy=qx*qy, qxz=qx*qz, qyy=qy*qy, qyz=qy*qz, qzz=qz*qz;
-	
 	const double kr2=kr*kr;
 	const double t1=(3-kr2), t2=-3*kr, t3=(kr2-1);
-	 
+
 	__m128d qff, im_re;
 #define INTERACT_MUL(I)\
-   im_re = cmul(sc,im_re);\
-   _mm_store_pd(result+I,im_re);\
-	
+	im_re = cmul(sc,im_re);\
+	_mm_store_pd(result[I],im_re);
+
 	const __m128d v1 = _mm_set_pd(kr,t3);
 	const __m128d v2 = _mm_set_pd(t2,t1);
-	
+
 	qff = _mm_set1_pd(qxx);
 	im_re = _mm_add_pd(v1,_mm_mul_pd(v2,qff));
 	INTERACT_MUL(0)
-	
+
 	qff = _mm_set1_pd(qxy);
 	im_re = _mm_mul_pd(v2,qff);
 	INTERACT_MUL(1)
-	
+
 	qff = _mm_set1_pd(qxz);
 	im_re = _mm_mul_pd(v2,qff);
 	INTERACT_MUL(2)
-	
+
 	qff = _mm_set1_pd(qyy);
 	im_re = _mm_add_pd(v1,_mm_mul_pd(v2,qff));
 	INTERACT_MUL(3)
-	
+
 	qff = _mm_set1_pd(qyz);
 	im_re = _mm_mul_pd(v2,qff);
 	INTERACT_MUL(4)
-	
+
 	qff = _mm_set1_pd(qzz);
 	im_re = _mm_add_pd(v1,_mm_mul_pd(v2,qff));
 	INTERACT_MUL(5)
-
 #undef INTERACT_MUL
-	
+
 	PRINT_GVAL;
 }
 
 #else //not using SSE3
 
+//=====================================================================================================================
+
 void CalcInterTerm_opt(const int i,const int j,const int k,doublecomplex * restrict result)
-/* calculates interaction term between two dipoles; given integer distance vector {i,j,k}
- * (in units of d). All six components of the symmetric matrix are computed at once.
- * The elements in result are: [G11, G12, G13, G22, G23, G33]
+/* calculates interaction term between two dipoles; given integer distance vector {i,j,k} (in units of d). All six
+ * components of the symmetric matrix are computed at once. The elements in result are: [G11, G12, G13, G22, G23, G33]
  */
 {	
 	double re,im;
-
-// this is used for debugging, should be empty define, when not required
-#define PRINT_GVAL /*printf("%d,%d,%d: %g%+gi, %g%+gi, %g%+gi,\n%g%+gi, %g%+gi, %g%+gi\n",\
-	i,j,k,result[0][RE],result[0][IM],result[1][RE],result[1][IM],result[2][RE],result[2][IM],\
-	result[3][RE],result[3][IM],result[4][RE],result[4][IM],result[5][RE],result[5][IM]);*/
-	
 	double qx = i;
 	double qy = j;
 	double qz = k;
-	
+
 	const double rn = sqrt(qx*qx + qy*qy + qz*qz);
 	const double invrn = 1.0/rn;
 	qx *= invrn; qy *= invrn; qz *= invrn;
 	const double qxx=qx*qx, qxy=qx*qy, qxz=qx*qz, qyy = qy*qy, qyz=qy*qz, qzz=qz*qz;	
-	
 	const double rr=rn*gridspace;
 	const double invr3=1.0/(rr*rr*rr);
 	const double kr=WaveNum*rr;
 	const double kr2=kr*kr;
-	
 	const double cov=cos(kr)*invr3;
 	const double siv=sin(kr)*invr3;
-	
+
 	const double t1=(3-kr2), t2=-3*kr, t3=(kr2-1);
-	
+
 #define INTERACT_MUL(I)\
-   result[I][RE] = re*cov - im*siv;\
+	result[I][RE] = re*cov - im*siv;\
 	result[I][IM] = re*siv + im*cov;
-	
+
 	re = t1*qxx + t3;
 	im = kr + t2*qxx;
 	INTERACT_MUL(0)	
@@ -240,18 +206,15 @@ void CalcInterTerm_opt(const int i,const int j,const int k,doublecomplex * restr
 	re = t1*qzz + t3;
 	im = kr + t2*qzz;
 	INTERACT_MUL(5)
-	
+
 #undef INTERACT_MUL	
-	
+
 	PRINT_GVAL;
 }
 
 #endif //USE_SSE3
 
-// sinint.c
-void cisi(double x,double *ci,double *si);
-
-//============================================================
+//=====================================================================================================================
 
 INLINE bool TestTableSize(const double rn)
 // tests if rn fits into the table; if not, returns false and produces info message
@@ -269,12 +232,11 @@ INLINE bool TestTableSize(const double rn)
 	else return true;
 }
 
-//============================================================
+//=====================================================================================================================
 
 void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex * restrict result)
-/* calculates interaction term between two dipoles; given integer distance vector {i,j,k}
- * (in units of d). All six components of the symmetric matrix are computed at once.
- * The elements in result are: [G11, G12, G13, G22, G23, G33]
+/* calculates interaction term between two dipoles; given integer distance vector {i,j,k} (in units of d). All six
+ * components of the symmetric matrix are computed at once. The elements in result are: [G11, G12, G13, G22, G23, G33]
  */
 {
 	static double rr,qvec[3],q2[3],invr3,qavec[3],av[3];
@@ -288,16 +250,6 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 	static double t3q,t3a,t4q,t4a,t5tr,t5aa,t6tr,t6aa;
 	static const bool inter_avg=true; // temporary fixed option for SO formulation
 
-// this is used for debugging, should be empty define, when not required
-#define PRINT_GVAL /*printf("%d,%d,%d: %g%+gi, %g%+gi, %g%+gi,\n%g%+gi, %g%+gi, %g%+gi\n",\
-	i,j,k,result[0][RE],result[0][IM],result[1][RE],result[1][IM],result[2][RE],result[2][IM],\
-	result[3][RE],result[3][IM],result[4][RE],result[4][IM],result[5][RE],result[5][IM]);*/ 
-
-	// self interaction; self term is computed in different subroutine
-	if (!(i || j || k)) for (comp=0;comp<NDCOMP;comp++) {
-		result[comp][RE]=result[comp][IM]=0;
-		return;
-	}
 	//====== calculate some basic constants ======
 	qvec[0]=i; // qvec is normalized below (after IGT)
 	qvec[1]=j;
@@ -341,16 +293,14 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 		cMult(br,expval,result[comp]);
 	}
 	//====== FCD (static and full) ========
-	/* speed of FCD can be improved by using faster version of sici routine, using predefined
-	 * tables, etc (e.g. as is done in GSL library). But currently extra time for this computation
-	 * is already smaller than one main iteration.
+	/* speed of FCD can be improved by using faster version of sici routine, using predefined tables, etc (e.g. as is
+	 * done in GSL library). But currently extra time for this computation is already smaller than one main iteration.
 	 */
 	if (IntRelation==G_FCD_ST) {
-		/* FCD is based on Gay-Balmaz P., Martin O.J.F. "A library for computing the filtered and
-		 * non-filtered 3D Green's tensor associated with infinite homogeneous space and surfaces",
-		 * Comp. Phys. Comm. 144:111-120 (2002), and Piller N.B. "Increasing the performance of the
-		 * coupled-dipole approximation: A spectral approach", IEEE Trans.Ant.Propag. 46(8):
-		 * 1126-1137. Here it differs by a factor of 4*pi*k^2.
+		/* FCD is based on Gay-Balmaz P., Martin O.J.F. "A library for computing the filtered and non-filtered 3D
+		 * Green's tensor associated with infinite homogeneous space and surfaces", Comp. Phys. Comm. 144:111-120
+		 * (2002), and Piller N.B. "Increasing the performance of the coupled-dipole approximation: A spectral
+		 * approach", IEEE Trans.Ant.Propag. 46(8): 1126-1137. Here it differs by a factor of 4*pi*k^2.
 		 */
 		// result = Gp*[3*Si(k_F*r)+k_F*r*cos(k_F*r)-4*sin(k_F*r)]*2/(3*pi)
 		cisi(kfr,&ci,&si);
@@ -378,8 +328,8 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 	}
 	//======= second order corrections ========
 	else if (IntRelation==G_IGT_SO) {
-		/* There is still some space for speed optimization here (e.g. move mu,nu-independent
-		 * operations out of the cycles over components).
+		/* There is still some space for speed optimization here (e.g. move mu,nu-independent operations out of the
+		 * cycles over components).
 		 */
 		kd2=kd*kd;
 		if (kr*rn < G_BOUND_CLOSE && TestTableSize(rn)) {
@@ -449,15 +399,14 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 			// cycle over tensor components
 			for (mu=0,comp=0;mu<3;mu++) for (nu=mu;nu<3;nu++,comp++) {
 				sig=sigV[mu]*sigV[nu]; // sign of some terms below
-				/* indexes for tables of different dimensions based on transformed indices mu and nu
-				 * '...munu' variables are invariant to permutations because both constituent
-				 * vectors and indices are permutated. So this variables can be used, as precomputed
-				 * above.
+				/* indexes for tables of different dimensions based on transformed indices mu and nu '...munu' variables
+				 * are invariant to permutations because both constituent vectors and indices are permutated. So this
+				 * variables can be used, as precomputed above.
 				 */
 				mu1=invord[mu];
 				nu1=invord[nu];
-				/* indmunu is a number of component[mu,nu] in a symmetric matrix, but counted
-				 * differently than comp. This is {{0,1,3},{1,2,4},{3,4,5}}
+				/* indmunu is a number of component[mu,nu] in a symmetric matrix, but counted differently than comp.
+				 * This is {{0,1,3},{1,2,4},{3,4,5}}
 				 */
 				indmunu=mu1+nu1;
 				if (mu1==2 || nu1==2) indmunu++;
@@ -519,9 +468,9 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 		}
 	}
 	else if (IntRelation==G_SO) {
-		/* There is still some space for speed optimization here (e.g. move mu,nu-independent
-		 * operations out of the cycles over components). But now extra time is equivalent to 2-3
-		 * main iterations. So first priority is to make something useful out of SO.
+		/* There is still some space for speed optimization here (e.g. move mu,nu-independent operations out of the
+		 * cycles over components). But now extra time is equivalent to 2-3 main iterations. So first priority is to
+		 * make something useful out of SO.
 		 */
 		// next line should never happen
 		if (anisotropy) LogError(ONE_POS,"Incompatibility error in CalcInterTerm");
@@ -609,15 +558,14 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 			ind2m=6*ind0;			// cycle over tensor components
 			for (mu=0,comp=0;mu<3;mu++) for (nu=mu;nu<3;nu++,comp++) {
 				sig=sigV[mu]*sigV[nu]; // sign of some terms below
-				/* indexes for tables of different dimensions based on transformed indices mu and nu
-				 * '...munu' variables are invariant to permutations because both constituent
-				 * vectors and indices are permutated. So this variables can be used, as precomputed
-				 * above.
+				/* indexes for tables of different dimensions based on transformed indices mu and nu '...munu' variables
+				 * are invariant to permutations because both constituent vectors and indices are permutated. So this
+				 * variables can be used, as precomputed above.
 				 */
 				mu1=invord[mu];
 				nu1=invord[nu];
-				/* indmunu is a number of component[mu,nu] in a symmetric matrix, but counted
-				 * differently than comp. This is {{0,1,3},{1,2,4},{3,4,5}}
+				/* indmunu is a number of component[mu,nu] in a symmetric matrix, but counted differently than comp.
+				 * This is {{0,1,3},{1,2,4},{3,4,5}}
 				 */
 				indmunu=mu1+nu1;
 				if (mu1==2 || nu1==2) indmunu++;
@@ -719,9 +667,7 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 			for (comp=0;comp<NDCOMP;comp++) cMultSelf(result[comp],br);
 			//==== compute and add Gf1 ===
 			if (!inter_avg) for (comp=0;comp<NDCOMP;comp++) {
-				/* br = {delta[mu,nu]*(3-3ikr-2kr^2+ikr^3)-qmunu*(15-15ikr-6kr^2+ikr^3)}*qa
-				 *    + qamunu*(3-3ikr-kr^2)
-				 */
+				// br = {delta[mu,nu]*(3-3ikr-2kr^2+ikr^3)-qmunu*(15-15ikr-6kr^2+ikr^3)}*qa + qamunu*(3-3ikr-kr^2)
 				br[RE]=(6*kr2-15)*qmunu[comp];
 				br[IM]=(15*kr-kr3)*qmunu[comp];
 				if(dmunu[comp]) {
@@ -759,8 +705,8 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 						// Gm1=expval*i*m*temp; temp is set anew
 						vMult(qvec,prop,qavec);
 						temp = 3*qa*(dmunu[comp]-7*qmunu[comp]) + 6*dmunu[comp]*qvec[mu]*prop[mu]
-							 - 7*(dmunu[comp]-9*qmunu[comp])*DotProd(qavec,q2)
-							 + 3*(prop[mu]*qvec[nu]*(1-7*q2[mu])+prop[nu]*qvec[mu]*(1-7*q2[nu]));
+						     - 7*(dmunu[comp]-9*qmunu[comp])*DotProd(qavec,q2)
+						     + 3*(prop[mu]*qvec[nu]*(1-7*q2[mu])+prop[nu]*qvec[mu]*(1-7*q2[nu]));
 						temp*=kd*invrn3/48;
 						cMultReal(temp,m,Gm1);
 						cMult_i(Gm1);
@@ -777,4 +723,122 @@ void CalcInterTerm_complete(const int i,const int j,const int k,doublecomplex *
 	PRINT_GVAL;
 }
 
-#undef PRINT_GVAL
+//=====================================================================================================================
+
+static double * ATT_MALLOC ReadTableFile(const char * restrict sh_fname,const int size_multiplier)
+{
+	FILE * restrict ftab;
+	double * restrict tab_n;
+	int size;
+	char fname[MAX_FNAME];
+	int i;
+
+	size=TAB_SIZE*size_multiplier;
+	memory+=size*sizeof(double);
+	if (!prognosis) {
+		// allocate memory for tab_n
+		MALLOC_VECTOR(tab_n,double,size,ALL);
+		// open file
+		SnprintfErr(ALL_POS,fname,MAX_FNAME,TAB_PATH"%s",sh_fname);
+		ftab=FOpenErr(fname,"r",ALL_POS);
+		// scan file
+		for (i=0; i<size; i++) if (fscanf(ftab,"%lf\t",&(tab_n[i]))!=1)
+			LogError(ALL_POS,"Scan error in file '%s'. Probably file is too small",fname);
+		if (!feof(ftab))
+			LogWarning(EC_WARN,ONE_POS,"File '%s' is longer than specified size (%d)",fname,size);
+		// close file
+		FCloseErr(ftab,fname,ALL_POS);
+		return tab_n;
+	}
+	else return NULL;
+}
+
+//=====================================================================================================================
+
+static void ReadTables(void)
+{
+	int i, j, ymax, Rm2, Rm2x;
+
+	TIME_TYPE tstart=GET_TIME();
+	tab1=ReadTableFile(TAB_FNAME(1),1);
+	tab2=ReadTableFile(TAB_FNAME(2),6);
+	tab3=ReadTableFile(TAB_FNAME(3),3);
+	tab4=ReadTableFile(TAB_FNAME(4),18);
+	tab5=ReadTableFile(TAB_FNAME(5),6);
+	tab6=ReadTableFile(TAB_FNAME(6),36);
+	tab7=ReadTableFile(TAB_FNAME(7),1);
+	tab8=ReadTableFile(TAB_FNAME(8),6);
+	tab9=ReadTableFile(TAB_FNAME(9),1);
+	tab10=ReadTableFile(TAB_FNAME(10),6);
+	Timing_FileIO += GET_TIME() - tstart;
+
+	if (!prognosis) {
+		// allocate memory for tab_index
+		MALLOC_IMATRIX(tab_index,1,TAB_RMAX,0,TAB_RMAX,ALL);
+		// fill tab_index
+		Rm2=TAB_RMAX*TAB_RMAX;
+		tab_index[1][0] = 0;
+		for (i=1; i<=TAB_RMAX; i++) {
+			Rm2x=Rm2-i*i;
+			ymax = MIN(i,(int)floor(sqrt(Rm2x)));
+			for (j=0; j<ymax; j++) {
+				tab_index[i][j+1] = tab_index[i][j] + MIN(j,(int)floor(sqrt(Rm2x-j*j)))+1;
+			}
+			if (i<TAB_RMAX) tab_index[i+1][0] = tab_index[i][ymax]
+			                                  + MIN(ymax,(int)floor(sqrt(Rm2x-ymax*ymax)))+1;
+		}
+	}
+	// PRINTZ("P[5,3]=%d (should be 41)\n",tab_index[5][3]);
+}
+
+//=====================================================================================================================
+
+static void FreeTables(void)
+{
+	Free_iMatrix(tab_index,1,TAB_RMAX,0);
+	Free_general(tab1);
+	Free_general(tab2);
+	Free_general(tab3);
+	Free_general(tab4);
+	Free_general(tab5);
+	Free_general(tab6);
+	Free_general(tab7);
+	Free_general(tab8);
+	Free_general(tab9);
+	Free_general(tab10);
+}
+
+//=====================================================================================================================
+
+void InitInteraction(void)
+// Initialize the interaction calculations
+{
+	if (IntRelation==G_POINT_DIP) {
+		CalcInterTerm = &CalcInterTerm_opt;
+#ifdef USE_SSE3
+		c1 = _mm_set_pd(1.34959795251974073996e-11,3.92582397764340914444e-14);
+		c2 = _mm_set_pd(-8.86096155697856783296e-7,-3.86632385155548605680e-9);
+		c3 = _mm_set_pd(1.74532925199432957214e-2,1.52308709893354299569e-4);
+		zo = _mm_set_pd(0.0,1.0);
+		inv_2pi = _mm_set_sd(1.0/(2*PI));
+		p360 = _mm_set_sd(360.0);
+		prad_to_deg = _mm_set_sd(180.0/PI);
+
+		for (unsigned int i=0; i<=360; i++) {
+			double x = (PI/180.0)*(double)i;
+			exptbl[i] = _mm_set_pd(sin(x),cos(x));
+		}
+#endif
+	}
+	else CalcInterTerm = &CalcInterTerm_complete;
+	// read tables if needed
+	if (IntRelation == G_SO || IntRelation == G_IGT_SO) ReadTables();
+}
+
+//=====================================================================================================================
+
+void FreeInteraction(void)
+// Free buffers used for interaction calculation
+{
+	if (IntRelation == G_SO || IntRelation == G_IGT_SO) FreeTables();
+}
diff --git a/src/interaction.h b/src/interaction.h
index b3a25bba..e4add71f 100644
--- a/src/interaction.h
+++ b/src/interaction.h
@@ -1,36 +1,25 @@
 /* FILE : interaction.h
  * Descr: the functions used to calculate the interaction term
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2011-2013 ADDA contributors
  * This file is part of ADDA.
  *
- * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
- * General Public License as published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
+ * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  *
- * ADDA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
- * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
- * Public License for more details.
+ * ADDA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License along with ADDA. If not, see
  * <http://www.gnu.org/licenses/>.
  */
-
 #ifndef __interaction_h
 #define __interaction_h
 
 #include "cmplx.h"
 
-extern double gridspace, WaveNum;
-
-#ifdef USE_SSE3
-extern __m128d c1, c2, c3, zo, inv_2pi, p360, prad_to_deg;
-extern __m128d exptbl[361];
-#endif //USE_SSE3
-
-extern void (*CalcInterTerm)(const int i,const int j,const int k,doublecomplex * restrict result);
-extern void InitInteraction(void);
-
-//============================================================
+void (*CalcInterTerm)(const int i,const int j,const int k,doublecomplex * restrict result);
+void InitInteraction(void);
+void FreeInteraction(void);
 
 #endif //__interaction_h
diff --git a/src/iterative.c b/src/iterative.c
index b65a400c..2bf3cd67 100644
--- a/src/iterative.c
+++ b/src/iterative.c
@@ -10,7 +10,7 @@
  *        (e.g. -int so), however they do it much slowly than usually. It is recommended then to use
  *        BiCGStab.
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -49,7 +49,7 @@ extern const TIME_TYPE tstart_CE;
 extern doublecomplex *rvec; // can't be declared restrict due to SwapPointers
 extern doublecomplex * restrict vec1,* restrict vec2,* restrict vec3,* restrict Avecbuffer;
 // defined and initialized in fft.c
-#ifndef OPENCL
+#if !defined(OPENCL) && !defined(SPARSE)
 extern doublecomplex * restrict Xmatrix; // used as storage for arrays in WKB init field
 #endif
 // defined and initialized in param.c
@@ -366,7 +366,7 @@ ITER_FUNC(BiCG_CS)
 		scalars[0].ptr=&ro_old;
 		scalars[0].size=sizeof(doublecomplex);
 	}
-	else if (ph==PHASE_INIT); // no specific initialization required
+	else if (ph==PHASE_INIT) {} // no specific initialization required
 	// main iteration cycle
 	else if (ph==PHASE_ITER) {
 		// ro_k-1=r_k-1(*).r_k-1; check for ro_k-1!=0
@@ -384,7 +384,7 @@ ITER_FUNC(BiCG_CS)
 			nIncrem10_cmplx(pvec,rvec,beta,NULL,NULL);
 		}
 		// q_k=Avecbuffer=A.p_k
-		if (niter==1 && matvec_ready); // do nothing, Avecbuffer is ready to use
+		if (niter==1 && matvec_ready) {} // do nothing, Avecbuffer is ready to use
 		else MatVec(pvec,Avecbuffer,NULL,false,&Timing_OneIterComm);
 		// mu_k=p_k.q_k; check for mu_k!=0
 		nDotProd_conj(pvec,Avecbuffer,mu,&Timing_OneIterComm);
@@ -514,7 +514,7 @@ ITER_FUNC(CGNR)
 		scalars[0].ptr=&ro_old;
 		scalars[0].size=sizeof(double);
 	}
-	else if (ph==PHASE_INIT); // no specific initialization required
+	else if (ph==PHASE_INIT) {} // no specific initialization required
 	else if (ph==PHASE_ITER) {
 		// p_1=Ah.r_0 and ro_new=ro_0=|Ah.r_0|^2
 		// since first product is with Ah , matvec_ready can't be employed
@@ -1061,7 +1061,7 @@ static const char *CalcInitField(double zero_resid)
 		nSubtr(rvec,pvec,Avecbuffer,&inprodR,&Timing_InitIterComm);
 		descr="x_0 = E_inc\n";
 	}
-#ifndef ADDA_SPARSE	//currently no support for WKB in sparse mode
+#ifndef SPARSE	//currently no support for WKB in sparse mode
 	else if (InitField==IF_WKB) {
 		if (prop[2]!=1) LogError(ONE_POS,"WKB initial field currently works only with default "
 			"incident direction of the incoming wave (along z-axis)");
@@ -1160,7 +1160,7 @@ static const char *CalcInitField(double zero_resid)
 		nSubtr(rvec,pvec,Avecbuffer,&inprodR,&Timing_InitIterComm);
 		descr="x_0 = result of WKB\n";
 	} // redundant test
-#endif //ADDA_SPARSE	
+#endif // !SPARSE
 	else LogError(ONE_POS,"Unknown method to calculate initial field (%d)",(int)InitField);
 	return descr;
 }
diff --git a/src/make_particle.c b/src/make_particle.c
index 252fbb3e..907a4480 100644
--- a/src/make_particle.c
+++ b/src/make_particle.c
@@ -3,7 +3,7 @@
  * Descr: this module initializes the dipole set, either using predefined shapes or reading from a
  *        file; includes granule generator
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -43,28 +43,33 @@
 // SEMI-GLOBAL VARIABLES
 
 // defined and initialized in param.c
-extern const int sh_Npars;
 extern const enum sh shape;
-extern const double sh_pars[];
 extern const enum sym sym_type;
 extern const double lambda;
 extern double sizeX,dpl,a_eq;
 extern const int jagged;
 extern const char *shape_fname;
 extern const char *shapename;
-extern const char *save_geom_fname;
 extern const bool volcor,save_geom;
 extern opt_index opt_sh;
+#ifndef SPARSE
+extern const int sh_Npars;
+extern const double sh_pars[];
+extern const char *save_geom_fname;
 extern const double gr_vf;
 extern double gr_d;
 extern const int gr_mat;
 extern enum shform sg_format;
 extern bool store_grans;
+#endif
 
 // defined and initialized in timing.c
-extern TIME_TYPE Timing_Particle,Timing_Granul,Timing_GranulComm;
+extern TIME_TYPE Timing_Particle;
+#ifndef SPARSE
+extern TIME_TYPE Timing_Granul,Timing_GranulComm;
+#endif
 
-// used in fft.c
+// used in interaction.c
 double gridspace; // interdipole distance (dipole size)
 
 // used in param.c
@@ -83,7 +88,9 @@ static const char geom_format_ext[]="%d %d %d %d\n";       // extended format of
  */
 static const char ddscat_format_read1[]="%*s %d %d %d %d %d %d\n";
 static const char ddscat_format_read2[]="%*s %d %d %d %d";
+#ifndef SPARSE
 static const char ddscat_format_write[]="%zu %d %d %d %d %d %d\n";
+#endif
 // ratio of scatterer volume to enclosing cube; used for dpl correction and initialization by a_eq
 static double volume_ratio;
 static double Ndip;             // total number of dipoles (in a circumscribing cube)
@@ -93,7 +100,7 @@ static FILE * restrict dipfile; // handle of dipole file
 static enum shform read_format; // format of dipole file, which is read
 static double cX,cY,cZ;         // center for DipoleCoord, it is sometimes used in PlaceGranules
 
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 
 // shape parameters
 static double coat_x,coat_y,coat_z,coat_r2;
@@ -133,11 +140,11 @@ struct segment * restrict contSeg;
 
 // temporary arrays before their real counterparts are allocated
 static unsigned char * restrict material_tmp;
-static double * restrict DipoleCoord_tmp;
 static unsigned short * restrict position_tmp;
-#endif //ADDA_SPARSE
 
-#ifndef ADDA_SPARSE //much of the functionality here is disabled in sparse mode
+#endif // !SPARSE
+
+#ifndef SPARSE //much of the functionality here is disabled in sparse mode
 
 // EXTERNAL FUNCTIONS
 
@@ -231,8 +238,6 @@ static void SaveGeometry(void)
 	Timing_FileIO+=GET_TIME()-tstart;
 }
 
-//===========================================================
-
 //==========================================================
 #define ALLOCATE_SEGMENTS(N) (struct segment *)voidVector((N)*sizeof(struct segment),ALL_POS,\
 	"contour segment");
@@ -1037,7 +1042,7 @@ static size_t PlaceGranules(void)
 
 //===========================================================
 
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 
 static void InitDipFile(const char * restrict fname,int *bX,int *bY,int *bZ,int *Nm,
 	const char **rft)
@@ -1227,17 +1232,21 @@ static void InitDipFile(const char * restrict fname,int *bX,int *bY,int *bZ,int
 //===========================================================
 
 static void ReadDipFile(const char * restrict fname)
-/* read dipole file; no consistency checks are made since they are made in InitDipFile.
- * the file is opened in InitDipFile; this function only closes the file.
+/* read dipole file; no consistency checks are made since they are made in InitDipFile. The file is opened in
+ * InitDipFile; this function only closes the file.
+ *
+ * The operation is quite different in FFT and sparse modes. In FFT mode only material is set here, while position is
+ * set in the main loop over shapes in MakeParticle(). By contrast, in sparse mode position is set here as well (since
+ * the loop in MakeParticle() is skipped altogether.
  */
 {
 	int x,y,z,x0,y0,z0,mat,scanned;
 	size_t index=0;
 	char linebuf[BUF_LINE];	
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 	// to remove possible overflows
 	size_t boxX_l=(size_t)boxX;
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 
 	TIME_TYPE tstart=GET_TIME();
 	
@@ -1262,26 +1271,25 @@ static void ReadDipFile(const char * restrict fname)
 			y0-=minY;
 			z0-=minZ;
 			// initialize box jagged*jagged*jagged instead of one dipole
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 			for (z=jagged*z0;z<jagged*(z0+1);z++) if (z>=local_z0 && z<local_z1_coer)
 				for (x=jagged*x0;x<jagged*(x0+1);x++) for (y=jagged*y0;y<jagged*(y0+1);y++) {
 					index=(z-local_z0)*boxXY+y*boxX_l+x;
 					material_tmp[index]=(unsigned char)(mat-1);
 			}
 #else
-			
 			for (z=0;z<jagged;z++) 
 			for (y=0;y<jagged;y++) 
 			for (x=0;x<jagged;x++) {
 				if ((index >= local_nvoid_d0) && (index < local_nvoid_d1)) {
-					material_full[index]=(unsigned char)(mat-1);
+					material[index-local_nvoid_d0]=(unsigned char)(mat-1);
 					position_full[3*index]=x0*jagged+x;
 					position_full[3*index+1]=y0*jagged+y;
 					position_full[3*index+2]=z0*jagged+z;
 				}
 				index++;
 			}
-#endif //ADDA_SPARSE
+#endif // SPARSE
 		}
 	}
 	
@@ -1333,7 +1341,7 @@ void InitShape(void)
 	double n_sizeX; // new value for size
 	double yx_ratio,zx_ratio;
 	double tmp1,tmp2;
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 	double tmp3;
 #endif
 	TIME_TYPE tstart;
@@ -1371,7 +1379,7 @@ void InitShape(void)
 	// initialization of global option index for error messages
 	opt=opt_sh;
 	// shape initialization
-#ifndef ADDA_SPARSE //shapes other than "read" are disabled in sparse mode
+#ifndef SPARSE //shapes other than "read" are disabled in sparse mode
 	if (shape==SH_AXISYMMETRIC) {
 		/* Axisymmetric homogeneous shape, defined by its contour in ro-z plane of the cylindrical
 		 * coordinate system. Its symmetry axis coincides with the z-axis, and the contour is read
@@ -1782,7 +1790,7 @@ void InitShape(void)
 		Nmat_need=2;
 	}	
 	else 
-#endif //ADDA_SPARSE	
+#endif // !SPARSE
 	if (shape==SH_READ) {
 		symX=symY=symZ=symR=false; // input file is assumed fully asymmetric
 		const char *rf_text;
@@ -1826,11 +1834,11 @@ void InitShape(void)
 	 *   define your own (with more informative names) in the beginning of this function.
 	 */
 	else 
-#ifndef ADDA_SPARSE 
+#ifndef SPARSE 
 		LogError(ONE_POS,"Unknown shape"); // this is mainly to remove 'uninitialized' warnings
 #else
 		LogError(ONE_POS,"Only shapes read from a file are currently supported in sparse mode"); 
-#endif //ADDA_SPARSE
+#endif // SPARSE
 
 	// check for redundancy of input data
 	if (dpl!=UNDEF) {
@@ -1848,7 +1856,7 @@ void InitShape(void)
 				"while shape '%s' sets both the particle size and the size of the grid",shapename);
 		}
 	}
-#ifndef ADDA_SPARSE //granulation disabled in sparse mode
+#ifndef SPARSE //granulation disabled in sparse mode
 	// initialize domain granulation
 	if (sh_granul) {
 		symX=symY=symZ=symR=false; // no symmetry with granules
@@ -1961,13 +1969,13 @@ void InitShape(void)
 			PrintError("Particle (boxY,Z={%d,%d}) does not fit into specified boxY,Z={%d,%d}",
 				n_boxY,n_boxZ,boxY,boxZ);
 	}
-#ifndef ADDA_SPARSE //this check is not needed in sparse mode
+#ifndef SPARSE //this check is not needed in sparse mode
 	// initialize number of dipoles; first check that it fits into size_t type
 	double tmp=((double)boxX)*((double)boxY)*((double)boxZ);
 	if (tmp > SIZE_MAX) LogError(ONE_POS,"Total number of dipoles in the circumscribing "
 		"box (%.0f) is larger than supported by size_t type on this system (%zu). If possible, "
 		"please recompile ADDA in 64-bit mode.",tmp,SIZE_MAX);
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 	Ndip=boxX*boxY*boxZ;
 	// initialize maxiter; not very realistic
 	if (maxiter==UNDEF) maxiter=MIN(INT_MAX,3*Ndip);
@@ -1991,11 +1999,10 @@ void MakeParticle(void)
 // creates a particle; initializes all dipoles counts, dpl, gridspace
 {
 	
-	size_t index;
+	size_t index,dip,i3;
 	TIME_TYPE tstart;
-#ifndef ADDA_SPARSE
 	int i;
-	size_t dip;
+#ifndef SPARSE
 	double jcX,jcY,jcZ; // center for jagged
 	size_t local_nRows_tmp;
 	int j,k,ns;	
@@ -2007,7 +2014,7 @@ void MakeParticle(void)
 	int mat;
 	unsigned short us_tmp;
 	TIME_TYPE tgran;
-#endif //ADDA_SPARSE
+#endif // !SPARSE
 
 	/* TO ADD NEW SHAPE
 	 * Add here all intermediate variables, which are used only inside this function. You may as
@@ -2020,7 +2027,7 @@ void MakeParticle(void)
 	cY=(boxY-1)/2.0;
 	cZ=(boxZ-1)/2.0;
 		
-#ifndef ADDA_SPARSE //shapes other than "read" are disabled in sparse mode
+#ifndef SPARSE //shapes other than "read" are disabled in sparse mode
 	index=0;	
 	// assumed that box's are even
 	jcX=jcY=jcZ=jagged/2.0;
@@ -2031,7 +2038,6 @@ void MakeParticle(void)
 	 * they will be reallocated afterwards (when local_nRows is known).
 	 */
 	MALLOC_VECTOR(material_tmp,uchar,local_Ndip,ALL);
-	MALLOC_VECTOR(DipoleCoord_tmp,double,local_nRows_tmp,ALL);
 	MALLOC_VECTOR(position_tmp,ushort,local_nRows_tmp,ALL);
 
 	for(k=local_z0;k<local_z1_coer;k++)
@@ -2195,49 +2201,35 @@ void MakeParticle(void)
 				 * only in this part of the code), either use 'tmp1'-'tmp3' or define your own (with
 				 * more informative names) in the beginning of this function.
 				 */
-
 				position_tmp[3*index]=(unsigned short)i;
 				position_tmp[3*index+1]=(unsigned short)j;
 				position_tmp[3*index+2]=(unsigned short)k;
 				// afterwards multiplied by gridspace
-				DipoleCoord_tmp[3*index]=i-cX;
-				DipoleCoord_tmp[3*index+1]=j-cY;
-				DipoleCoord_tmp[3*index+2]=k-cZ;
 				material_tmp[index]=(unsigned char)mat;
 				index++;
 			} // End box loop
-#endif //ADDA_SPARSE
-
-#ifdef ADDA_SPARSE
-	MALLOC_VECTOR(material_full,uchar,nvoid_Ndip,ALL);
-	MALLOC_VECTOR(position_full,int,nvoid_Ndip*3,ALL);
-	memory+=(sizeof(char)+sizeof(int)*3)*nvoid_Ndip;
-
-	//for sparse mode, nvoid_Ndip is defined in InitDipFile
-	//so we actually run SetupLocalD before reading the file
-	//(which requires local_nvoid_d0 and local_nvoid_d1 to be defined)
-	SetupLocalD();
+#else // SPARSE
+	// local_nvoid_d0 and local_nvoid_d1 are set earlier in ParSetup()
 	local_nvoid_Ndip=local_nvoid_d1-local_nvoid_d0;
-	local_Ndip = local_nvoid_Ndip;
-#endif
-
+	local_Ndip=local_nvoid_Ndip;
+	MALLOC_VECTOR(material,uchar,local_nvoid_Ndip,ALL);
+	MALLOC_VECTOR(position_full,int,nvoid_Ndip*3,ALL);
+	memory+=3*sizeof(int)*nvoid_Ndip+sizeof(char)*local_nvoid_Ndip;
+#endif // SPARSE
 	if (shape==SH_READ) ReadDipFile(shape_fname);
 	// initialization of mat_count and dipoles counts
-	
-#ifndef ADDA_SPARSE
 	for(i=0;i<=Nmat;i++) mat_count[i]=0;
+#ifdef SPARSE
+	for(dip=0;dip<local_Ndip;dip++) mat_count[material[dip]]++;
+	MyInnerProduct(mat_count,sizet_type,Nmat+1,NULL);
+#else
 	for(dip=0;dip<local_Ndip;dip++) mat_count[material_tmp[dip]]++;
 	local_nvoid_Ndip=local_Ndip-mat_count[Nmat];
 	SetupLocalD();
 	MyInnerProduct(mat_count,sizet_type,Nmat+1,NULL);
-	if ((nvoid_Ndip=Ndip-mat_count[Nmat])==0)
-		LogError(ONE_POS,"All dipoles of the scatterer are void");
-#endif //ADDA_SPARSE
-
-
-	
-	if (nvoid_Ndip==0)
-		LogError(ONE_POS,"All dipoles of the scatterer are void");
+	nvoid_Ndip=Ndip-mat_count[Nmat];
+#endif // !SPARSE
+	if (nvoid_Ndip==0) LogError(ONE_POS,"All dipoles of the scatterer are void");
 	local_nRows=3*local_nvoid_Ndip;
 	// initialize dpl and gridspace
 	volcor_used=(volcor && (volume_ratio!=UNDEF));
@@ -2268,7 +2260,7 @@ void MakeParticle(void)
 	ka_eq = WaveNum*a_eq;
 	inv_G = 1/(PI*a_eq*a_eq);
 	
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 	// granulate one domain, if needed
 	if (sh_granul) {
 		tgran=GET_TIME();
@@ -2296,61 +2288,50 @@ void MakeParticle(void)
 		mat_count[gr_mat]-=mat_count[Nmat-1];
 		Timing_Granul=GET_TIME()-tgran;
 	}
-#endif //ADDA_SPARSE
-	
 	/* allocate main particle arrays, using precise local_nRows even when prognosis is used to enable
 	 * save_geom afterwards.
 	 */
-MALLOC_VECTOR(material,uchar,local_nvoid_Ndip,ALL);
-MALLOC_VECTOR(DipoleCoord,double,local_nRows,ALL);
-#ifndef ADDA_SPARSE	
+	MALLOC_VECTOR(material,uchar,local_nvoid_Ndip,ALL);
 	MALLOC_VECTOR(position,ushort,local_nRows,ALL);
-	memory+=(3*(sizeof(short int)+sizeof(double))+sizeof(char))*local_nvoid_Ndip;
-#else
-	MALLOC_VECTOR(position,int,local_nRows,ALL);
-	memory+=(3*(sizeof(int)+sizeof(double))+sizeof(char))*local_nvoid_Ndip;
-	
-	memcpy(material, &(material_full[local_nvoid_d0]), local_nvoid_Ndip*sizeof(*material_full));
-	memcpy(position, &(position_full[3*local_nvoid_d0]), local_nRows*sizeof(*position_full));      
-
-	for (index=0; index<local_nvoid_Ndip; index++) {
-		DipoleCoord[3*index] = (position[3*index] - cX) * gridspace;
-		DipoleCoord[3*index+1] = (position[3*index+1] - cY) * gridspace;
-		DipoleCoord[3*index+2] = (position[3*index+2] - cZ) * gridspace;
-	}
-#endif
-
-#ifndef ADDA_SPARSE	
+	memory+=(3*sizeof(short int)+sizeof(char))*local_nvoid_Ndip;
 	// copy nontrivial part of arrays
 	index=0;
 	for (dip=0;dip<local_Ndip;dip++) if (material_tmp[dip]<Nmat) {
 		material[index]=material_tmp[dip];
-		// DipoleCoord=gridspace*DipoleCoord_tmp
-		vMultScal(gridspace,DipoleCoord_tmp+3*dip,DipoleCoord+3*index);
 		memcpy(position+3*index,position_tmp+3*dip,3*sizeof(short int));
 		index++;
 	}
 
 	// free temporary memory
 	Free_general(material_tmp);
-	Free_general(DipoleCoord_tmp);
 	Free_general(position_tmp);
 	if (shape==SH_AXISYMMETRIC) {
 		for (ns=0;ns<contNseg;ns++) FreeContourSegment(contSeg+ns);
 		Free_general(contSegRoMin);
 		Free_general(contSegRoMax);
 	}
-#endif
+#else
+	position=position_full + 3*local_nvoid_d0;
+#endif // SPARSE
+	// initialize DipoleCoord
+	MALLOC_VECTOR(DipoleCoord,double,local_nRows,ALL);
+	memory+=3*sizeof(double)*local_nvoid_Ndip;
+	for (index=0; index<local_nvoid_Ndip; index++) {
+		i3=3*index;
+		DipoleCoord[i3] = (position[i3]-cX)*gridspace;
+		DipoleCoord[i3+1] = (position[i3+1]-cY)*gridspace;
+		DipoleCoord[i3+2] = (position[i3+2]-cZ)*gridspace;
+	}
 
 	// save geometry
 	if (save_geom)
-#ifndef ADDA_SPARSE 
+#ifndef SPARSE 
 		SaveGeometry();
 #else
-		LogWarning(EC_WARN,ONE_POS,"Saving the geometry is not supported in sparse mode");
-#endif //ADDA_SPARSE
+		LogError(ONE_POS,"Saving the geometry is not supported in sparse mode");
+#endif // SPARSE
 
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 	/* adjust z-axis of position vector, to speed-up matrix-vector multiplication a little bit;
 	 * after this point 'position(z)' is taken relative to the local_z0.
 	 */
@@ -2360,19 +2341,18 @@ MALLOC_VECTOR(DipoleCoord,double,local_nRows,ALL);
 	}
 	local_Nz_unif=position[3*local_nvoid_Ndip-1]+1;
 	local_z0_unif=local_z0; // TODO: should be changed afterwards
-#endif
+#endif // !SPARSE
 
 	box_origin_unif[0]=-gridspace*cX;
 	box_origin_unif[1]=-gridspace*cY;
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 	box_origin_unif[2]=gridspace*(local_z0_unif-cZ);
 #else
 	box_origin_unif[2]=-gridspace*cZ;
-	
-	InitArraySync();
-	SyncPosition(position_full);
-	SyncMaterial(material_full);
-#endif //ADDA_SPARSE
+#	ifdef PARALLEL
+	AllGather(NULL,position_full,int3_type,NULL);
+#	endif
+#endif // SPARSE
 	
 	Timing_Particle += GET_TIME() - tstart;
 }
diff --git a/src/matvec.c b/src/matvec.c
index 39e48dc6..b5820ccd 100644
--- a/src/matvec.c
+++ b/src/matvec.c
@@ -3,7 +3,7 @@
  * Descr: calculate local matrix vector product of decomposed interaction matrix with r_k or p_k,
  *        using a FFT based convolution algorithm
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -25,32 +25,32 @@
 #include "fft.h"
 #include "function.h"
 #include "io.h"
-#include "sparse_ops.h"
 #include "interaction.h"
 #include "linalg.h"
+#include "oclcore.h"
 #include "prec_time.h"
+#include "sparse_ops.h"
 #include "vars.h"
 // system headers
 #include <stdio.h>
 #include <string.h>
 
-#ifdef OPENCL
-#	include "oclcore.h"
-#endif
-
 // SEMI-GLOBAL VARIABLES
 
+#ifdef SPARSE
+// defined and initialized in calculator.c
+extern doublecomplex * restrict arg_full;
+#elif !defined(OPENCL)
 // defined and initialized in fft.c
-#ifndef OPENCL
 extern const doublecomplex * restrict Dmatrix;
 extern doublecomplex * restrict Xmatrix,* restrict slices,* restrict slices_tr;
-#endif
 extern const size_t DsizeY,DsizeZ,DsizeYZ;
+#endif // !SPARSE && !OPENCL
 
 // defined and initialized in timing.c
 extern size_t TotalMatVec;
 
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 #ifndef OPENCL // the following inline functions are not used in OpenCL or sparse mode
 
 //============================================================
@@ -102,11 +102,11 @@ INLINE size_t IndexDmatrix_mv(size_t x,size_t y,size_t z,const bool transposed)
 	return(NDCOMP*(x*DsizeYZ+z*DsizeY+y));
 }
 #endif // OPENCL
-#endif // ADDA_SPARSE
+#endif // SPARSE
 
 //============================================================
 
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 void MatVec (doublecomplex * restrict argvec,    // the argument vector
              doublecomplex * restrict resultvec, // the result vector
              double *inprod,         // the resulting inner product
@@ -474,11 +474,12 @@ void MatVec (doublecomplex * restrict argvec,    // the argument vector
 	TotalMatVec++;
 }
 
-#else //ADDA_SPARSE is defined
+#else // SPARSE is defined
+
+//============================================================
 
-/* The sparse MatVec is implemented completely separately from the non-sparse version.
- * Although there is some code duplication, this probably makes the both versions easier
- * to maintain.
+/* The sparse MatVec is implemented completely separately from the non-sparse version. Although there is some code
+ * duplication, this probably makes the both versions easier to maintain.
 */
 void MatVec (doublecomplex * restrict argvec,    // the argument vector
              doublecomplex * restrict resultvec, // the result vector
@@ -487,52 +488,33 @@ void MatVec (doublecomplex * restrict argvec,    // the argument vector
              TIME_TYPE *comm_timing) // this variable is incremented by communication time
 {	
 	const bool ipr = (inprod != NULL);
-
-	if (her) {
-		for (size_t j=0; j<local_nRows; j++) {
-			argvec[j][IM] = -argvec[j][IM];		 
-		}
-	}
-	
-	for (size_t j=0; j<local_nvoid_Ndip; j++) {
-		CcMul(argvec,arg_full+3*local_nvoid_d0,j);
-	}
-	SyncArgvec();	
-	
-	for (size_t i=0; i<local_nvoid_Ndip; i++) {
-		const unsigned int i3 = 3*i;		
+	size_t i,j,i3;
+
+	if (her) for (j=0; j<local_nRows; j++) argvec[j][IM] = -argvec[j][IM];
+	for (j=0; j<local_nvoid_Ndip; j++) CcMul(argvec,arg_full+3*local_nvoid_d0,j);
+#	ifdef PARALLEL
+	AllGather(NULL,arg_full,cmplx3_type,comm_timing);
+#	endif
+	for (i=0; i<local_nvoid_Ndip; i++) {
+		i3 = 3*i;
 		resultvec[i3][RE]=resultvec[i3][IM]=0.0;
 		resultvec[i3+1][RE]=resultvec[i3+1][IM]=0.0;
 		resultvec[i3+2][RE]=resultvec[i3+2][IM]=0.0;
-		for (size_t j=0; j<local_nvoid_d0+i; j++) {			
-			AijProd(arg_full, resultvec, i, j);
-		}		
-		for (size_t j=local_nvoid_d0+i+1; j<nvoid_Ndip; j++) {
-			AijProd(arg_full, resultvec, i, j);
-		}
+		for (j=0; j<local_nvoid_d0+i; j++) AijProd(arg_full,resultvec,i,j);
+		for (j=local_nvoid_d0+i+1; j<nvoid_Ndip; j++) AijProd(arg_full,resultvec,i,j);
 	}		
-	
-	for (size_t i=0; i<local_nvoid_Ndip; i++) {	
-		DiagProd(argvec, resultvec, i);		 
+	for (i=0; i<local_nvoid_Ndip; i++) DiagProd(argvec,resultvec,i);
+	if (her) for (i=0; i<local_nRows; i++) {
+		resultvec[i][IM] = -resultvec[i][IM];
+		argvec[i][IM] = -argvec[i][IM];
 	}
-	
-	if (her) {
-		for (size_t i=0; i<local_nRows; i++) {
-			resultvec[i][IM] = -resultvec[i][IM];
-			argvec[i][IM] = -argvec[i][IM];		
-		}
-	}	
-		
 	if (ipr) {
 		*inprod = 0.0;
-		for (size_t i=0; i<local_nRows; i++) {
-			*inprod += resultvec[i][RE]*resultvec[i][RE] + resultvec[i][IM]*resultvec[i][IM];		 
-		}		
+		for (i=0; i<local_nRows; i++)
+			*inprod += resultvec[i][RE]*resultvec[i][RE] + resultvec[i][IM]*resultvec[i][IM];
 		MyInnerProduct(inprod,double_type,1,comm_timing);
 	}
-	
 	TotalMatVec++;
-	
 }
 
-#endif //ADDA_SPARSE
+#endif // SPARSE
diff --git a/src/ocl/Makefile b/src/ocl/Makefile
index e3de38e2..b136f447 100644
--- a/src/ocl/Makefile
+++ b/src/ocl/Makefile
@@ -35,27 +35,31 @@
 #  LDFLAGS += -Wl,--enable-stdcall-fixup
 #endif
 
-ifneq ($(filter CLFFT_APPLE,$(OPTIONS)),)
-  ifeq ($(filter NO_CPP,$(OPTIONS)),)
-    CPPSOURCE += fft_execute.cpp fft_setup.cpp fft_kernelstring.cpp
-    CFLAGS    += -DCLFFT_APPLE
-  else
-    $(error Apple clFFT (CLFFT_APPLE) is implemented in C++, hence incompatible with NO_CPP)
-  endif
-else # AMD clFFT flags
-  # Depending on a particular clAmdFft installation one may need to manually specify paths to its headers and libraries.
-  # Path should be absolute or relative to the location of this Makefile.
-  #CFLAGS  += -I"C:\Program Files (x86)\AMD\clAmdFft\include"
-  #LDFLAGS += -L"C:\Program Files (x86)\AMD\clAmdFft\lib64\import"
+ifneq ($(filter SPARSE,$(OPTIONS)),)
+  $(error OPENCL version of sparse mode is not yet available)
+else # the following is only for non-sparse mode
+  ifneq ($(filter CLFFT_APPLE,$(OPTIONS)),)
+    ifeq ($(filter NO_CPP,$(OPTIONS)),)
+      CPPSOURCE += fft_execute.cpp fft_setup.cpp fft_kernelstring.cpp
+      CFLAGS    += -DCLFFT_APPLE
+    else
+      $(error Apple clFFT (CLFFT_APPLE) is implemented in C++, hence incompatible with NO_CPP)
+    endif
+  else # AMD clFFT flags
+    # Depending on a particular clAmdFft installation one may need to manually specify paths to its headers and 
+    # libraries. Path should be absolute or relative to the location of this Makefile.
+    #CFLAGS  += -I"C:\Program Files (x86)\AMD\clAmdFft\include"
+    #LDFLAGS += -L"C:\Program Files (x86)\AMD\clAmdFft\lib64\import"
   
-  LDFLAGS += -lclAmdFft.Runtime
+    LDFLAGS += -lclAmdFft.Runtime
   
-  # Uncomment the following line if you experience problems during linking, like
-  # "...libclAmdFft.Runtime.so: undefined reference to `clRetainContext@OPENCL_1.0'"
-  # This is caused by Nvidia OpenCL libraries on Linux (hopefully it will disappear in next releases of this library)
-  # The drawback of this workaround is that a warning would appear at each execution of adda_ocl. If you don't like it,
-  # provide another libOpenCL.so (e.g. that from AMD) for linkig.
-  #LDFLAGS +=-Wl,--unresolved-symbols=ignore-in-shared-libs
+    # Uncomment the following line if you experience problems during linking, like
+    # "...libclAmdFft.Runtime.so: undefined reference to `clRetainContext@OPENCL_1.0'"
+    # This is caused by Nvidia OpenCL libraries on Linux (hopefully it will disappear in next releases of this library)
+    # The drawback of this workaround is that a warning would appear at each execution of adda_ocl. If you don't like 
+    # it, provide another libOpenCL.so (e.g. that from AMD) for linkig.
+    #LDFLAGS +=-Wl,--unresolved-symbols=ignore-in-shared-libs
+  endif
 endif
 
 #===================================================================================================
diff --git a/src/oclcore.c b/src/oclcore.c
index af7bdd14..c7f4a001 100644
--- a/src/oclcore.c
+++ b/src/oclcore.c
@@ -2,7 +2,7 @@
  * $Date::                            $
  * Descr: core parts of OpenCL code
  *
- * Copyright (C) 2010-2012 ADDA contributors
+ * Copyright (C) 2010-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -73,9 +73,8 @@ struct string {
 // The kernel source is either loaded from oclkernels.cl at runtime or included at compile time
 //#define OCL_READ_SOURCE_RUNTIME
 
-// For some reason Eclipse points out a syntax error in the following block. Just ignore it.
 #ifndef OCL_READ_SOURCE_RUNTIME
-	const char stringifiedSourceCL[]=
+	const char stringifiedSourceCL[]=""
 	// the following is a pure string generated automatically from oclkernels.cl at compile time
 #	include "ocl/oclkernels.clstr"
 	;
diff --git a/src/oclcore.h b/src/oclcore.h
index 6f06a8d6..3749d65a 100644
--- a/src/oclcore.h
+++ b/src/oclcore.h
@@ -1,8 +1,8 @@
 /* File: oclcore.h
  * $Date::                            $
- * Descr: all common OpenCL variables and functions
+ * Descr: all common OpenCL variables and functions; void in non-OpenCL mode
  *
- * Copyright (C) 2010-2012 ADDA contributors
+ * Copyright (C) 2010-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -16,6 +16,7 @@
  * You should have received a copy of the GNU General Public License along with ADDA. If not, see
  * <http://www.gnu.org/licenses/>.
  */
+#ifdef OPENCL
 
 #ifndef __oclcore_h
 #define __oclcore_h
@@ -73,3 +74,5 @@ INLINE void CheckCLErr(const cl_int err,ERR_LOC_DECL,const char * restrict msg)
 }
 
 #endif // __oclcore_h
+
+#endif // OPENCL
diff --git a/src/param.c b/src/param.c
index eaf59905..b0bd394f 100644
--- a/src/param.c
+++ b/src/param.c
@@ -26,6 +26,7 @@
 #include "fft.h"
 #include "function.h"
 #include "io.h"
+#include "oclcore.h"
 #include "os.h"
 #include "parbas.h"
 #include "vars.h"
@@ -38,11 +39,8 @@
 #include <string.h>
 #include <time.h>
 
-#ifdef OPENCL
-#	include "oclcore.h"
-#	ifdef CLFFT_AMD
-#		include <clAmdFft.h> // for version information
-#	endif
+#ifdef CLFFT_AMD
+#	include <clAmdFft.h> // for version information
 #endif
 
 // definitions for file locking
@@ -80,10 +78,10 @@ extern const char beam_descr[];
 // defined and initialized in make_particle.c
 extern const bool volcor_used;
 extern const char *sh_form_str1,*sh_form_str2;
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 extern const int gr_N;
 extern const double gr_vf_real;
-#endif //ADDA_SPARSE
+#endif //SPARSE
 extern const size_t mat_count[];
 
 // used in CalculateE.c
@@ -110,15 +108,16 @@ const char *scat_grid_parms;      // name of file with parameters of scattering
 double prop_0[3];                 // initial incident direction (in laboratory reference frame)
 double incPolX_0[3],incPolY_0[3]; // initial incident polarizations (in lab RF)
 enum scat ScatRelation;           // type of formulae for scattering quantities
-// used in fft.c
-double igt_lim; // limit (threshold) for integration in IGT
-double igt_eps; // relative error of integration in IGT
+
 // used in GenerateB.c
 int beam_Npars;
 double beam_pars[MAX_N_BEAM_PARMS]; // beam parameters
 opt_index opt_beam;                 // option index of beam option used
 const char *beam_fnameY;            // names of files, defining the beam (for two polarizations)
 const char *beam_fnameX;
+// used in interaction.c
+double igt_lim; // limit (threshold) for integration in IGT
+double igt_eps; // relative error of integration in IGT
 // used in io.c
 char logfname[MAX_FNAME]=""; // name of logfile
 // used in iterative.c
@@ -237,6 +236,7 @@ static const struct subopt_struct beam_opt[]={
 	{NULL,NULL,NULL,0,0}
 };
 static const struct subopt_struct shape_opt[]={
+#ifndef SPARSE
 	{"axisymmetric","<filename>","Axisymmetric homogeneous shape, defined by its contour in "
 		"ro-z plane of the cylindrical coordinate system. Its symmetry axis coincides with the "
 		"z-axis, and the contour is read from file.",FNAME_ARG,SH_AXISYMMETRIC},
@@ -279,10 +279,13 @@ static const struct subopt_struct shape_opt[]={
 		"b, and diameter at the position of the maximum width c. The surface is described by "
 		"ro^4+2S*ro^2*z^2+z^4+P*ro^2+Q*z^2+R=0, ro^2=x^2+y^2, P,Q,R,S are determined by the "
 		"described parameters.",3,SH_RBC},
+#endif // !SPARSE
 	{"read","<filename>","Read a particle geometry from file <filename>",FNAME_ARG,SH_READ},
+#ifndef SPARSE
 	{"sphere","","Homogeneous sphere",0,SH_SPHERE},
 	{"spherebox","<d_sph/Dx>","Sphere (diameter d_sph) in a cube (size Dx, first domain)",1,
 		SH_SPHEREBOX},
+#endif // !SPARSE
 	/* TO ADD NEW SHAPE
 	 * add a row to this list in alphabetical order. It contains:
 	 * shape name (used in command line), usage string, help string, possible number of float
@@ -337,7 +340,9 @@ PARSE_FUNC(eq_rad);
 #ifdef OPENCL
 PARSE_FUNC(gpu);
 #endif
+#ifndef SPARSE
 PARSE_FUNC(granul);
+#endif
 PARSE_FUNC(grid);
 PARSE_FUNC(h) ATT_NORETURN;
 PARSE_FUNC(init_field);
@@ -357,17 +362,23 @@ PARSE_FUNC(pol);
 PARSE_FUNC(prognosis);
 PARSE_FUNC(prop);
 PARSE_FUNC(recalc_resid);
+#ifndef SPARSE
 PARSE_FUNC(save_geom);
+#endif
 PARSE_FUNC(scat);
 PARSE_FUNC(scat_grid_inp);
 PARSE_FUNC(scat_matr);
+#ifndef SPARSE
 PARSE_FUNC(sg_format);
+#endif
 PARSE_FUNC(shape);
 PARSE_FUNC(size);
 PARSE_FUNC(store_beam);
 PARSE_FUNC(store_dip_pol);
 PARSE_FUNC(store_force);
+#ifndef SPARSE
 PARSE_FUNC(store_grans);
+#endif
 PARSE_FUNC(store_int_field);
 PARSE_FUNC(store_scat_grid);
 PARSE_FUNC(sym);
@@ -422,11 +433,13 @@ static struct opt_struct options[]={
 		"only for OpenCL version of ADDA, running on a system with several GPUs.\n"
 		"Default: 0",1,NULL},
 #endif
+#ifndef SPARSE
 	{PAR(granul),"<vol_frac> <diam> [<dom_number>]","Specifies that one particle domain should be "
 		"randomly filled with spherical granules with specified diameter <diam> and volume "
 		"fraction <vol_frac>. Domain number to fill is given by the last optional argument. "
 		"Algorithm may fail for volume fractions > 30-50%.\n"
 		"Default <dom_number>: 1",UNDEF,NULL},
+#endif // !SPARSE
 	{PAR(grid),"<nx> [<ny> <nz>]","Sets dimensions of the computation grid. Arguments should be "
 		"even integers. In most cases <ny> and <nz> can be omitted (they are automatically "
 		"determined by <nx> based on the proportions of the scatterer). This command line option "
@@ -440,10 +453,15 @@ static struct opt_struct options[]={
 		"preceding dash). For some options (e.g. '-beam' or '-shape') specific help on a "
 		"particular suboption <subopt> may be shown.\n"
 		"Example: shape coated",UNDEF,NULL},
-	{PAR(init_field),"{auto|zero|inc|wkb}","Sets prescription to calculate initial (starting) "
-		"field for the iterative solver. 'zero' is a zero vector, 'inc' - equal to the incident "
-		"field, 'wkb' - from Wentzel-Kramers-Brillouin approximation, 'auto' - automatically "
-		"choose from 'zero' and 'inc' based on the lower residual value.\n"
+	{PAR(init_field),"{auto|inc|wkb|zero}",
+		"Sets prescription to calculate initial (starting) field for the iterative solver.\n"
+		"'auto' - automatically choose from 'zero' and 'inc' based on the lower residual value.\n"
+		"'inc' - equal to the incident field,\n"
+		"'wkb' - from Wentzel-Kramers-Brillouin approximation,\n"
+#ifdef SPARSE
+		"!!! 'wkb' is not operational in sparse mode\n"
+#endif
+		"'zero' is a zero vector,\n"
 		"Default: auto",1,NULL},
 	{PAR(int),"{fcd|fcd_st|igt [<lim> [<prec>]]|igt_so|poi|so}",
 		"Sets prescription to calculate the interaction term.\n"
@@ -459,6 +477,9 @@ static struct opt_struct options[]={
 		"'igt_so' - approximate evaluation of IGT using second order of kd approximation.\n"
 		"'poi' - (the simplest) interaction between point dipoles.\n"
 		"'so' - under development and incompatible with '-anisotr'.\n"
+#ifdef SPARSE
+		"!!! All options except 'poi' incur a significant slowing down in sparse mode.\n"
+#endif
 		"Default: poi",UNDEF,NULL},
 	{PAR(iter),"{bicg|bicgstab|cgnr|csym|qmr|qmr2}","Sets the iterative solver.\n"
 		"Default: qmr",1,NULL},
@@ -529,12 +550,14 @@ static struct opt_struct options[]={
 		"Normalization (to the unity vector) is performed automatically.\n"
 		"Default: 0 0 1",3,NULL},
 	{PAR(recalc_resid),"","Recalculate residual at the end of iterative solver.",0,NULL},
+#ifndef SPARSE
 	{PAR(save_geom),"[<filename>]","Save dipole configuration to a file <filename> (a path "
 		"relative to the output directory). Can be used with '-prognosis'.\n"
 		"Default: <type>.geom \n"
 		"(<type> is a first argument to the '-shape' option; '_gran' is added if '-granul' option "
 		"is used; file extension can differ depending on argument of '-sg_format' option).",
 		UNDEF,NULL},
+#endif // !SPARSE
 	{PAR(scat),"{dr|fin|igt_so|so}","Sets prescription to calculate scattering quantities.\n"
 		"'dr' - (by Draine) standard formulation for point dipoles\n"
 		"'fin' - slightly different one, based on a radiative correction for a finite dipole.\n"
@@ -548,12 +571,14 @@ static struct opt_struct options[]={
 		"amplitude) should be saved to file. Amplitude matrix is never integrated (in combination "
 		"with '-orient avg' or '-phi_integr').\n"
 		"Default: muel",1,NULL},
+#ifndef SPARSE
 	{PAR(sg_format),"{text|text_ext|ddscat6|ddscat7}","Specifies format for saving geometry files. "
 		"First two are ADDA default formats for single- and multi-domain particles respectively. "
 		"'text' is automatically changed to 'text_ext' for multi-domain particles. Two DDSCAT "
 		"formats correspond to its shape options 'FRMFIL' (version 6) and 'FROM_FILE' (version 7) "
 		"and output of 'calltarget' utility.\n"
 		"Default: text",1,NULL},
+#endif // !SPARSE
 		/* TO ADD NEW FORMAT OF SHAPE FILE
 		 * Modify string constants after 'PAR(sg_format)': add new argument to list {...} and
 		 * add its description to the next string.
@@ -572,7 +597,9 @@ static struct opt_struct options[]={
 	{PAR(store_dip_pol),"","Save dipole polarizations to a file",0,NULL},
 	{PAR(store_force),"","Calculate the radiation force on each dipole. Implies '-Cpr'",
 		0,NULL},
+#ifndef SPARSE
 	{PAR(store_grans),"","Save granule coordinates (placed by '-granul' option) to a file",0,NULL},
+#endif
 	{PAR(store_int_field),"","Save internal fields to a file",0,NULL},
 	{PAR(store_scat_grid),"","Calculate Mueller matrix for a grid of scattering angles and save it "
 		"to a file.",0,NULL},
@@ -722,7 +749,7 @@ INLINE void TestNarg(const int Narg,const int need)
 			NargError(Narg,buf);
 		}
 	} // otherwise special cases are considered, encoded by negative values
-	else if (need==UNDEF); // do nothing
+	else if (need==UNDEF) {} // do nothing
 	else if (need==FNAME_ARG) {
 		if (Narg!=1) NargError(Narg,"1");
 	}
@@ -901,6 +928,7 @@ PARSE_FUNC(beam)
 	int i,j,need;
 	bool found;
 
+	if (Narg==0) PrintErrorHelp("Suboption (argument) is missing for '-beam' option");
 	Narg--;
 	found=false;
 	i=-1;
@@ -997,6 +1025,7 @@ PARSE_FUNC(gpu)
 	TestNonNegative_i(gpuInd,"GPU index");
 }
 #endif
+#ifndef SPARSE
 PARSE_FUNC(granul)
 {
 	if (Narg!=2 && Narg!=3) NargError(Narg,"2 or 3");
@@ -1012,6 +1041,7 @@ PARSE_FUNC(granul)
 	gr_mat--; // converted to usual indexing starting from 0
 	sh_granul=true;
 }
+#endif
 PARSE_FUNC(grid)
 {
 	if (Narg!=1 && Narg!=3) NargError(Narg,"1 or 3");
@@ -1059,6 +1089,10 @@ PARSE_FUNC(h)
 						while (options[i].sub[++j].name!=NULL)
 							printf("  %s %s\n",options[i].sub[j].name,options[i].sub[j].usage);
 						printf("Type '%s -h %s <subopt>' for details\n",exename,options[i].name);
+#ifdef SPARSE
+						if (strcmp(options[i].name,"shape")==0)
+							printf("!!! Most of the shape options are disabled in sparse mode\n");
+#endif
 					}
 				}
 				found=true;
@@ -1071,6 +1105,9 @@ PARSE_FUNC(h)
 			       "Available options:\n",exename,exeusage);
 			for (i=0;i<LENGTH(options);i++) printf("  -%s %s\n",options[i].name,options[i].usage);
 			printf("Type '%s -h <opt>' for details\n",exename);
+#ifdef SPARSE
+			printf("!!! A number of options are disabled in sparse mode\n");
+#endif
 		}
 	}
 	// exit
@@ -1081,19 +1118,18 @@ PARSE_FUNC(init_field)
 	if (strcmp(argv[1],"auto")==0) InitField=IF_AUTO;
 	else if (strcmp(argv[1],"zero")==0) InitField=IF_ZERO;
 	else if (strcmp(argv[1],"inc")==0) InitField=IF_INC;
-	else if (strcmp(argv[1],"wkb")==0) InitField=IF_WKB;
+	else if (strcmp(argv[1],"wkb")==0)
+#ifdef SPARSE
+		PrintErrorHelp("Initial field 'wkb' is not supported in sparse mode");
+#else
+		InitField=IF_WKB;
+#endif
 	else NotSupported("Initial field prescription",argv[1]);
 }
 PARSE_FUNC(int)
 {
 	double tmp;
 	
-#ifdef ADDA_SPARSE //only point dipoles in sparse mode
-	if (strcmp(argv[1],"poi")!=0) {
-		NotSupported("Interaction term prescription (sparse mode)",argv[1]);
-	}
-#endif
-
 	if (Narg<1 || Narg>3) NargError(Narg,"from 1 to 3");
 	if (strcmp(argv[1],"fcd")==0) IntRelation=G_FCD;
 	else if (strcmp(argv[1],"fcd_st")==0) IntRelation=G_FCD_ST;
@@ -1254,12 +1290,14 @@ PARSE_FUNC(recalc_resid)
 {
 	recalc_resid=true;
 }
+#ifndef SPARSE
 PARSE_FUNC(save_geom)
 {
 	if (Narg>1) NargError(Narg,"0 or 1");
 	save_geom=true;
 	if (Narg==1) save_geom_fname=ScanStrError(argv[1],MAX_FNAME);
 }
+#endif
 PARSE_FUNC(scat)
 {
 	if (strcmp(argv[1],"dr")==0) ScatRelation=SQ_DRAINE;
@@ -1286,6 +1324,7 @@ PARSE_FUNC(scat_matr)
 	else if (strcmp(argv[1],"none")==0) store_mueller=store_ampl=false;
 	else NotSupported("Scattering matrix specifier",argv[1]);
 }
+#ifndef SPARSE
 PARSE_FUNC(sg_format)
 {
 	if (strcmp(argv[1],"text")==0) sg_format=SF_TEXT;
@@ -1303,11 +1342,13 @@ PARSE_FUNC(sg_format)
 	 */
 	else NotSupported("Geometry format",argv[1]);
 }
+#endif
 PARSE_FUNC(shape)
 {
 	int i,j,need;
 	bool found;
 
+	if (Narg==0) PrintErrorHelp("Suboption (argument) is missing for '-shape' option");
 	Narg--;
 	found=false;
 	i=-1;
@@ -1360,10 +1401,12 @@ PARSE_FUNC(store_force)
 	store_force = true;
 	calc_mat_force = true;
 }
+#ifndef SPARSE
 PARSE_FUNC(store_grans)
 {
 	store_grans=true;
 }
+#endif
 PARSE_FUNC(store_int_field)
 {
 	store_int_field=true;
@@ -1521,6 +1564,12 @@ PARSE_FUNC(V)
 #endif
 #ifdef CLFFT_APPLE
 		"CLFFT_APPLE, "
+#endif
+#ifdef SPARSE
+		"SPARSE, "
+#endif
+#ifdef USE_SSE3
+		"USE_SSE3, "
 #endif
 		"";
 		printf("Extra build options: ");
@@ -1857,6 +1906,9 @@ void VariablesInterconnect(void)
 	if (sizeX!=UNDEF && a_eq!=UNDEF) PrintError("'-size' and '-eq_rad' can not be used together");
 	if (calc_mat_force && beamtype!=B_PLANE)
 		PrintError("Currently radiation forces can not be calculated for non-plane incident wave");
+#ifdef SPARSE
+	if (shape==SH_SPHERE) PrintError("Sparse mode requires shape to be read from file (-shape read ...)");
+#endif
 	// scale boxes by jagged; should be completely robust to overflows
 #define JAGGED_BOX(a) if (a!=UNDEF) { \
 	if ((BOX_MAX/(size_t)jagged)<(size_t)a) \
@@ -2030,12 +2082,12 @@ void PrintInfo(void)
 		fprintf(logfile,"lambda: "GFORM"\n",lambda);
 		fprintf(logfile,"shape: ");
 		fprintf(logfile,"%s"GFORM"%s\n",sh_form_str1,sizeX,sh_form_str2);
-#ifndef ADDA_SPARSE
+#ifndef SPARSE
 		if (sh_granul) fprintf(logfile,
 			"  domain %d is filled with %d granules of diameter "GFORMDEF"\n"
 			"    volume fraction: specified - "GFORMDEF", actual - "GFORMDEF"\n",
 			gr_mat+1,gr_N,gr_d,gr_vf,gr_vf_real);
-#endif //ADDA_SPARSE
+#endif // SPARSE
 		fprintf(logfile,"box dimensions: %ix%ix%i\n",boxX,boxY,boxZ);
 		if (anisotropy) {
 			fprintf(logfile,"refractive index (diagonal elements of the tensor):\n");
@@ -2151,8 +2203,10 @@ void PrintInfo(void)
 		fprintf(logfile,"FFTW3\n");
 #elif defined(FFT_TEMPERTON)
 		fprintf(logfile,"by C.Temperton\n");
+#elif defined(SPARSE)
+		fprintf(logfile,"none (sparse mode)\n");
 #endif
-#ifdef OPENCL
+#if defined(OPENCL) && !defined(SPARSE)
 		fprintf(logfile,"OpenCL FFT algorithm: ");
 #	ifdef CLFFT_AMD
 		fprintf(logfile,"by AMD\n");
diff --git a/src/sparse_ops.h b/src/sparse_ops.h
index 26de2469..f9624580 100644
--- a/src/sparse_ops.h
+++ b/src/sparse_ops.h
@@ -1,186 +1,176 @@
 /* File: sparse_ops.h
- * Descr: low-level routines for the sparse mode ADDA
+ * Descr: low-level routines for the sparse mode ADDA; void in non-sparse mode
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2011-2013 ADDA contributors
  * This file is part of ADDA.
  *
- * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
- * General Public License as published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
+ * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  *
- * ADDA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
- * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
- * Public License for more details.
+ * ADDA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License along with ADDA. If not, see
  * <http://www.gnu.org/licenses/>.
  */
+#ifdef SPARSE
 
-#ifndef __aijprod_h
-#define __aijprod_h
+#ifndef __sparse_ops_h
+#define __sparse_ops_h
 
 #include "cmplx.h"
 #include "interaction.h"
 #include "vars.h"
-   
-#ifdef ADDA_SPARSE
 
 #ifdef USE_SSE3
 
-inline static void CcMul(doublecomplex * restrict argvec_src, 
-                         doublecomplex * restrict argvec_dest, const int j)
-/*
-   Takes the j'th block in argvec_src, multiplies by cc_sqrt at that 
-   position and stores the result in argvec_dest.
-*/
+//=====================================================================================================================
+
+static inline void CcMul(doublecomplex * restrict argvec_src,doublecomplex * restrict argvec_dest, const size_t j)
+// Takes the j'th block in argvec_src, multiplies by cc_sqrt at that position and stores the result in argvec_dest.
 {
-   const int j3 = j*3;
-   *(__m128d *)&(argvec_dest[j3]) = cmul(*(__m128d *)&(argvec_src[j3]),
-                                         *(__m128d *)&(cc_sqrt[material[j]][0]));
-   *(__m128d *)&(argvec_dest[j3+1]) = cmul(*(__m128d *)&(argvec_src[j3+1]),
-                                           *(__m128d *)&(cc_sqrt[material[j]][1]));
-   *(__m128d *)&(argvec_dest[j3+2]) = cmul(*(__m128d *)&(argvec_src[j3+2]),
-                                           *(__m128d *)&(cc_sqrt[material[j]][2]));
+	const size_t j3=j*3;
+	*(__m128d *)&(argvec_dest[j3]) = cmul(*(__m128d *)&(argvec_src[j3]),
+		*(__m128d *)&(cc_sqrt[material[j]][0]));
+	*(__m128d *)&(argvec_dest[j3+1]) = cmul(*(__m128d *)&(argvec_src[j3+1]),
+		*(__m128d *)&(cc_sqrt[material[j]][1]));
+	*(__m128d *)&(argvec_dest[j3+2]) = cmul(*(__m128d *)&(argvec_src[j3+2]),
+		*(__m128d *)&(cc_sqrt[material[j]][2]));
 }
 
-inline static void AijProd(doublecomplex * restrict argvec, 
-                           doublecomplex * restrict resultvec,
-                           const int i, const int j)
-/*
-   Handles the multiplication of the i'th block of
-   argvec with the G_ij block of the G-matrix, and adds the result
-   to the j'th block of resultvec.
-*/
+//=====================================================================================================================
+
+static inline void AijProd(doublecomplex * restrict argvec,doublecomplex * restrict resultvec,const size_t i,
+	const size_t j)
+/* Handles the multiplication of the i'th block of argvec with the G_ij block of the G-matrix, and adds the result
+ * to the j'th block of resultvec.
+ */
 {	
-   doublecomplex iterm[6];
-	const unsigned int i3 = 3*i, j3 = 3*j;
-   
-   CalcInterTerm(position[i3]-position_full[j3], position[i3+1]-position_full[j3+1], 
-	              position[i3+2]-position_full[j3+2], iterm);
-
-   __m128d res, tmp;
- 
-   const __m128d argX = _mm_load_pd((double *)(argvec+j3));
-   const __m128d argY = _mm_load_pd((double *)(argvec+j3+1));
-   const __m128d argZ = _mm_load_pd((double *)(argvec+j3+2));
-   
-   res = cmul(argX, *(__m128d *)&(iterm[0]));
-   tmp = cmul(argY, *(__m128d *)&(iterm[1]));
-   res = cadd(tmp,res);
-   tmp = cmul(argZ, *(__m128d *)&(iterm[2]));
-   res = cadd(tmp,res);   
-   *(__m128d *)&(resultvec[i3]) = cadd(res, *(__m128d *)&(resultvec[i3]));
-   
-   res = cmul(argX, *(__m128d *)&(iterm[1]));
-   tmp = cmul(argY, *(__m128d *)&(iterm[3]));
-   res = cadd(tmp,res);
-   tmp = cmul(argZ, *(__m128d *)&(iterm[4]));
-   res = cadd(tmp,res);   
-   *(__m128d *)&(resultvec[i3+1]) = cadd(res, *(__m128d *)&(resultvec[i3+1]));
-   
-   res = cmul(argX, *(__m128d *)&(iterm[2]));
-   tmp = cmul(argY, *(__m128d *)&(iterm[4]));
-   res = cadd(tmp,res);
-   tmp = cmul(argZ, *(__m128d *)&(iterm[5]));
-   res = cadd(tmp,res);   
-   *(__m128d *)&(resultvec[i3+2]) = cadd(res, *(__m128d *)&(resultvec[i3+2]));
+	doublecomplex iterm[6];
+	const size_t i3 = 3*i, j3 = 3*j;
+
+	(*CalcInterTerm)(position[i3]-position_full[j3], position[i3+1]-position_full[j3+1],
+		position[i3+2]-position_full[j3+2], iterm);
+
+	__m128d res, tmp;
+
+	const __m128d argX = _mm_load_pd((double *)(argvec+j3));
+	const __m128d argY = _mm_load_pd((double *)(argvec+j3+1));
+	const __m128d argZ = _mm_load_pd((double *)(argvec+j3+2));
+
+	res = cmul(argX, *(__m128d *)&(iterm[0]));
+	tmp = cmul(argY, *(__m128d *)&(iterm[1]));
+	res = cadd(tmp,res);
+	tmp = cmul(argZ, *(__m128d *)&(iterm[2]));
+	res = cadd(tmp,res);
+	*(__m128d *)&(resultvec[i3]) = cadd(res, *(__m128d *)&(resultvec[i3]));
+
+	res = cmul(argX, *(__m128d *)&(iterm[1]));
+	tmp = cmul(argY, *(__m128d *)&(iterm[3]));
+	res = cadd(tmp,res);
+	tmp = cmul(argZ, *(__m128d *)&(iterm[4]));
+	res = cadd(tmp,res);
+	*(__m128d *)&(resultvec[i3+1]) = cadd(res, *(__m128d *)&(resultvec[i3+1]));
+
+	res = cmul(argX, *(__m128d *)&(iterm[2]));
+	tmp = cmul(argY, *(__m128d *)&(iterm[4]));
+	res = cadd(tmp,res);
+	tmp = cmul(argZ, *(__m128d *)&(iterm[5]));
+	res = cadd(tmp,res);
+	*(__m128d *)&(resultvec[i3+2]) = cadd(res, *(__m128d *)&(resultvec[i3+2]));
 }
 
-inline static void DiagProd(doublecomplex * restrict argvec, doublecomplex * restrict resultvec, 
-                            const int i) 
-/*
-   Multiplies the result in the i'th block of resultvec by cc_sqrt at that block,
-   subtracts the result from the i'th block of argvec, and stores the result in
-   the i'th block if resultvec. 
-*/
+//=====================================================================================================================
+
+static inline void DiagProd(doublecomplex * restrict argvec,doublecomplex * restrict resultvec,const size_t i)
+/* Multiplies the result in the i'th block of resultvec by cc_sqrt at that block, subtracts the result from the i'th
+ * block of argvec, and stores the result in the i'th block if resultvec.
+ */
 {
-   const int i3 = i*3; 
-   
-   const __m128d tmp1 = cmul(*(__m128d *)&(resultvec[i3]),*(__m128d *)&(cc_sqrt[material[i]][0]));
-   const __m128d tmp2 = cmul(*(__m128d *)&(resultvec[i3+1]),*(__m128d *)&(cc_sqrt[material[i]][1]));
-   const __m128d tmp3 = cmul(*(__m128d *)&(resultvec[i3+2]),*(__m128d *)&(cc_sqrt[material[i]][2]));
-   
-   *(__m128d *)&(resultvec[i3]) = _mm_sub_pd(*(__m128d *)&(argvec[i3]),tmp1);
-   *(__m128d *)&(resultvec[i3+1]) = _mm_sub_pd(*(__m128d *)&(argvec[i3+1]),tmp2);
-   *(__m128d *)&(resultvec[i3+2]) = _mm_sub_pd(*(__m128d *)&(argvec[i3+2]),tmp3);
+	const size_t i3 = i*3;
+
+	const __m128d tmp1 = cmul(*(__m128d *)&(resultvec[i3]),*(__m128d *)&(cc_sqrt[material[i]][0]));
+	const __m128d tmp2 = cmul(*(__m128d *)&(resultvec[i3+1]),*(__m128d *)&(cc_sqrt[material[i]][1]));
+	const __m128d tmp3 = cmul(*(__m128d *)&(resultvec[i3+2]),*(__m128d *)&(cc_sqrt[material[i]][2]));
+
+	*(__m128d *)&(resultvec[i3]) = _mm_sub_pd(*(__m128d *)&(argvec[i3]),tmp1);
+	*(__m128d *)&(resultvec[i3+1]) = _mm_sub_pd(*(__m128d *)&(argvec[i3+1]),tmp2);
+	*(__m128d *)&(resultvec[i3+2]) = _mm_sub_pd(*(__m128d *)&(argvec[i3+2]),tmp3);
 }
 
 #else //SSE3 not used
 
-inline static void CcMul(doublecomplex * restrict argvec_src, 
-                         doublecomplex * restrict argvec_dest, const int j)
-/*
-   Takes the j'th block in argvec_src, multiplies by cc_sqrt at that 
-   position and stores the result in argvec_dest.
-*/                         
+//=====================================================================================================================
+
+static inline void CcMul(doublecomplex * restrict argvec_src,doublecomplex * restrict argvec_dest, const size_t j)
+// Takes the j'th block in argvec_src, multiplies by cc_sqrt at that position and stores the result in argvec_dest.
 {
-   const int j3 = j*3;
-   cMult(argvec_src[j3],cc_sqrt[material_full[j]][0],argvec_dest[j3]);
-   cMult(argvec_src[j3+1],cc_sqrt[material_full[j]][1],argvec_dest[j3+1]);
-   cMult(argvec_src[j3+2],cc_sqrt[material_full[j]][2],argvec_dest[j3+2]);                                                               
+	const size_t j3 = j*3;
+	cMult(argvec_src[j3],cc_sqrt[material[j]][0],argvec_dest[j3]);
+	cMult(argvec_src[j3+1],cc_sqrt[material[j]][1],argvec_dest[j3+1]);
+	cMult(argvec_src[j3+2],cc_sqrt[material[j]][2],argvec_dest[j3+2]);
 }
 
-inline static void AijProd(doublecomplex * restrict argvec, 
-                           doublecomplex * restrict resultvec,
-                           const int i, const int j)
-/*
-   Handles the multiplication of the i'th block of
-   argvec with the G_ij block of the G-matrix, and adds the result
-   to the j'th block of resultvec.
-*/                           
-{	
-	static doublecomplex tmp1, resX, resY, resZ;
+//=====================================================================================================================
+
+static inline void AijProd(doublecomplex * restrict argvec,doublecomplex * restrict resultvec,const size_t i,
+	const size_t j)
+/* Handles the multiplication of the i'th block of argvec with the G_ij block of the G-matrix, and adds the result
+ * to the j'th block of resultvec.
+ */
+{
+	static doublecomplex tmp1,resX,resY,resZ;
 	static doublecomplex iterm[6];
-	const unsigned int i3 = 3*i, j3 = 3*j;
-	
-	//D("%d %d %d %d %d %d %d %d", i, j, position[3*i], position[3*i+1], position[3*i+2], position[3*j], position[3*j+1], position[3*j+2]);
-	CalcInterTerm(position[i3]-position_full[j3], position[i3+1]-position_full[j3+1], 
-	              position[i3+2]-position_full[j3+2], iterm);
-	
+	const size_t i3=3*i,j3=3*j;
+
+	//D("%d %d %d %d %d %d %d %d",i,j,position[3*i],position[3*i+1],position[3*i+2],
+	//	position[3*j],position[3*j+1],position[3*j+2]);
+	(*CalcInterTerm)(position[i3]-position_full[j3],position[i3+1]-position_full[j3+1],
+		position[i3+2]-position_full[j3+2],iterm);
+
 	cMult(argvec[j3],iterm[0],resX);
 	cMult(argvec[j3+1],iterm[1],tmp1);
 	cAdd(resX,tmp1,resX);
 	cMult(argvec[j3+2],iterm[2],tmp1);
 	cAdd(resX,tmp1,resX);
-	
+
 	cMult(argvec[j3],iterm[1],resY);
 	cMult(argvec[j3+1],iterm[3],tmp1);
 	cAdd(resY,tmp1,resY);
 	cMult(argvec[j3+2],iterm[4],tmp1);
 	cAdd(resY,tmp1,resY);
-	
+
 	cMult(argvec[j3],iterm[2],resZ);
 	cMult(argvec[j3+1],iterm[4],tmp1);
 	cAdd(resZ,tmp1,resZ);
 	cMult(argvec[j3+2],iterm[5],tmp1);
 	cAdd(resZ,tmp1,resZ);
-	
+
 	cAdd(resX,resultvec[i3],resultvec[i3]);
 	cAdd(resY,resultvec[i3+1],resultvec[i3+1]);
 	cAdd(resZ,resultvec[i3+2],resultvec[i3+2]);
 }
 
-inline static void DiagProd(doublecomplex * restrict argvec, doublecomplex * restrict resultvec, 
-                            const int i) 
-/*
-   Multiplies the result in the i'th block of resultvec by cc_sqrt at that block,
-   subtracts the result from the i'th block of argvec, and stores the result in
-   the i'th block if resultvec.
-*/
+//=====================================================================================================================
+
+static inline void DiagProd(doublecomplex * restrict argvec,doublecomplex * restrict resultvec,const size_t i)
+/* Multiplies the result in the i'th block of resultvec by cc_sqrt at that block, subtracts the result from the i'th
+ * block of argvec, and stores the result in the i'th block if resultvec.
+ */
 {
-   const int i3 = i*3;
-   static doublecomplex tmp1, tmp2, tmp3;
-   
-   cMult(resultvec[i3],cc_sqrt[material[i]][0],tmp1);
-   cMult(resultvec[i3+1],cc_sqrt[material[i]][1],tmp2);
-   cMult(resultvec[i3+2],cc_sqrt[material[i]][2],tmp3);
-   cSubtr(argvec[i3], tmp1, resultvec[i3]);
-   cSubtr(argvec[i3+1], tmp2, resultvec[i3+1]);
-   cSubtr(argvec[i3+2], tmp3, resultvec[i3+2]);
+	const size_t i3 = i*3;
+	static doublecomplex tmp1, tmp2, tmp3;
+
+	cMult(resultvec[i3],cc_sqrt[material[i]][0],tmp1);
+	cMult(resultvec[i3+1],cc_sqrt[material[i]][1],tmp2);
+	cMult(resultvec[i3+2],cc_sqrt[material[i]][2],tmp3);
+	cSubtr(argvec[i3], tmp1, resultvec[i3]);
+	cSubtr(argvec[i3+1], tmp2, resultvec[i3+1]);
+	cSubtr(argvec[i3+2], tmp3, resultvec[i3+2]);
 }
 
-#endif //USE_SSE3
+#endif // !USE_SSE3
 
-#endif //ADDA_SPARSE
+#endif // __sparse_ops_h
 
-#endif //__aijprod_h
+#endif // SPARSE
diff --git a/src/timing.c b/src/timing.c
index 6ff82f71..63722540 100644
--- a/src/timing.c
+++ b/src/timing.c
@@ -2,7 +2,7 @@
  * $Date::                            $
  * Descr: basic timing and statistics routines
  *
- * Copyright (C) 2006,2008-2012 ADDA contributors
+ * Copyright (C) 2006,2008-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -95,7 +95,7 @@ void InitTiming(void)
 	TotalIter=TotalMatVec=TotalEval=TotalEFieldPlane=0;
 	Timing_EField=Timing_FileIO=Timing_IntField=Timing_ScatQuan=Timing_Integration=0;
 	Timing_ScatQuanComm=Timing_InitDmComm=0;
-#ifdef ADDA_SPARSE
+#ifdef SPARSE
 	Timing_Dm_Init=Timing_Granul=Timing_FFT_Init=Timing_GranulComm=0;
 #endif	
 }
@@ -147,14 +147,16 @@ void FinalStatistics(void)
 				"    init OpenCL          "FFORMT"\n",TO_SEC(Timing_OCL_Init));
 
 #endif
+#ifndef SPARSE
 			fprintf(logfile,
 				"    init Dmatrix         "FFORMT"\n",TO_SEC(Timing_Dm_Init));
-#ifdef PARALLEL
+#	ifdef PARALLEL
 			fprintf(logfile,
 				"      communication:       "FFORMT"\n",TO_SEC(Timing_InitDmComm));
-#endif
+#	endif
 			fprintf(logfile,
 				"    FFT setup:           "FFORMT"\n",TO_SEC(Timing_FFT_Init));
+#endif // !SPARSE
 		}
 		fprintf(logfile,
 			"    make particle:       "FFORMT"\n",TO_SEC(Timing_Particle));
diff --git a/src/vars.c b/src/vars.c
index e027cd14..0ce81ddb 100644
--- a/src/vars.c
+++ b/src/vars.c
@@ -6,7 +6,7 @@
  *        source files are called 'semi-global' and not listed here. They are defined in one file
  *        and referenced with 'extern' in another one.
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -135,7 +135,7 @@ TIME_TYPE Timing_EField,      // time for calculating scattered fields
           Timing_Integration, // time for all integrations (with precomputed values)
           tstart_main;        // starting time of the program (after MPI_Init in parallel)
           
-#ifndef ADDA_SPARSE //These variables are exclusive to the FFT mode
+#ifndef SPARSE //These variables are exclusive to the FFT mode
 
 unsigned short * restrict position; /* position of the dipoles; in the very end of make_particle()
                                      * z-components are adjusted to be relative to the local_z0
@@ -168,14 +168,9 @@ size_t local_x0,local_x1,local_Nx; /* starting, ending x for current processor a
 
 #else //These variables are exclusive to the sparse mode
 
-int * restrict position;   // no reason to restrict this to short in sparse mode
+int *position; // no reason to restrict this to short in sparse mode; actually it points to a part of position_full
 //in sparse mode, all coordinates must be available to each process
-int * restrict position_full;  
+int * restrict position_full;
 
-unsigned char * restrict material_full;
-doublecomplex * restrict arg_full;
-
-double local_f0, local_f1;
-
-#endif //ADDA_SPARSE
+#endif //SPARSE
 
diff --git a/src/vars.h b/src/vars.h
index 5ee8a82e..d6733419 100644
--- a/src/vars.h
+++ b/src/vars.h
@@ -6,7 +6,7 @@
  *        source files are called 'semi-global' and not listed here. They are defined in one file
  *        and referenced with 'extern' in another one.
  *
- * Copyright (C) 2006-2012 ADDA contributors
+ * Copyright (C) 2006-2013 ADDA contributors
  * This file is part of ADDA.
  *
  * ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU
@@ -89,7 +89,7 @@ extern size_t local_Ndip,local_nvoid_Ndip,local_nRows,local_nvoid_d0,local_nvoid
 extern time_t wt_start,last_chp_wt;
 extern TIME_TYPE Timing_EField,Timing_FileIO,Timing_Integration,tstart_main;
 
-#ifndef ADDA_SPARSE //These variables are exclusive to the FFT mode
+#ifndef SPARSE //These variables are exclusive to the FFT mode
 
 extern unsigned short * restrict position;
 
@@ -105,14 +105,9 @@ extern size_t local_Nz,local_x0,local_x1,local_Nx;
 
 #else //These variables are exclusive to the sparse mode
 
-extern int * restrict position;
-extern double * restrict DipoleCoord_full;
+extern int *position;
 extern int * restrict position_full;
-extern unsigned char * restrict material_full;
-extern doublecomplex * restrict arg_full;
 
-extern double local_f0, local_f1;
-
-#endif //ADDA_SPARSE
+#endif //SPARSE
 
 #endif // __vars_h
diff --git a/tests/2exec/README b/tests/2exec/README
index 3525fc7d..5d002a5f 100644
--- a/tests/2exec/README
+++ b/tests/2exec/README
@@ -1,15 +1,13 @@
-Utils for automatic cross-testing different ADDA executables (by comparing the simulation results).
-The main part is a bash script 'comp2exec'. Under Windows one may use MSYS to run it. This script 
-contains comments in the head and throughout the file to explain its usage. In particular, one may
-specify the particular executables to cross test (a few typical pairs can be selected by command 
-line options to this script).
+Utils for automatic cross-testing different ADDA executables (by comparing the simulation results). The main part is a
+bash script 'comp2exec'. Under Windows one may use MSYS to run it. This script contains comments in the head and
+throughout the file to explain its usage. In particular, one may specify the particular executables to cross test (a few
+typical pairs can be selected by command line options to this script).
 
-The particular tests to perform are specified by file 'suite', by a list of command lines with 
-certain additions and abbreviations (see comments in the file). Currently, this file provides an
-extensive list of tests for testing future ADDA versions against a stable release. Some of this
-tests probably need to be removed for more specific tests, since they will produce errors. 
-A different file (instead of 'suite') can also be used, if specified by command line option to
+The particular tests to perform are specified by file 'suite', by a list of command lines with certain additions and
+abbreviations (see comments in the file). Currently, this file provides an extensive list of tests for testing future
+ADDA versions against a stable release. Some of this tests probably need to be removed for more specific tests, since
+they will produce errors. A different file (instead of 'suite') can also be used, if specified by command line option to
 'comp2exec'.
 
-Another important part is script 'diff_numeric.awk', which compares two files with given numerical
-tolerance. It may also need modification to fit a specific application.
\ No newline at end of file
+Another important part is script 'diff_numeric.awk', which compares two files with given numerical tolerance. It may
+also need modification to fit a specific application.
diff --git a/tests/2exec/comp2exec b/tests/2exec/comp2exec
index 8edfebb2..2e4cd985 100755
--- a/tests/2exec/comp2exec
+++ b/tests/2exec/comp2exec
@@ -1,16 +1,15 @@
 #!/bin/bash
-# First parameter is seq (default), mpi, mpi_seq, ocl, or ocl_seq. 
-# One-word mode compares corresponding current version with previous stable version (e.g. 1.0)
-# - some differences are always expected.
-# Two-word mode compares two current versions corresponding to different modes (e.g. mpi vs. seq)
-# - only minor differences are generally expected.
+# First parameter is seq (default), mpi, mpi_seq, ocl, or ocl_seq.
+# One-word mode compares corresponding current version with previous stable version (e.g. 1.0) - some differences are
+# always expected. Two-word mode compares two current versions corresponding to different modes (e.g. mpi vs. seq) -
+# only minor differences are generally expected.
 # Second (if given) specifies file with a test suite (default - "suite").
-# Third (if given) specifies bash pattern (best if quoted). Tests are started only from the command 
-# line that matches it. (convenient to restart the test suite from a break point)
+# Third (if given) specifies bash pattern (best if quoted). Tests are started only from the command line that matches
+# it. (convenient to restart the test suite from a break point)
 #
 # Look below for "!!!", which mark the places where adjustments are probably need to be done
 
-#---------------- Set parameters and define internal functions ---------------------------------
+#---------------- Set parameters and define internal functions ---------------------------------------------------------
 
 # Location of sample input files (not needed if all input files are already present)
 INPUTDIR="./../../input"
@@ -19,25 +18,53 @@ ADDASEQ="./../../src/seq/adda"
 ADDAMPI="./../../src/mpi/adda_mpi"
 ADDAOCL="./../../src/ocl/adda_ocl"
 # MPI command prefix
-MPIRUN="mpiexec -n 2"
+MPIRUN="mpiexec -n 4"
+
+# Set the following flag to ignore differences related to different FFT methods, such as FFT grid sizes and memory.
+# Also useful for comparison of sparse version with the standard one
+#FFTCOMP=1
+
+# SPARSE indicates that sparse mode is compared with sparse. SPARSE_STANDARD - that sparse is compared with standard
+# (automatically implies FFTCOMP above). Use at most one of them
+#SPARSE=1
+#SPARSE_STANDARD=1
+
+if [ -n "$SPARSE_STANDARD" ]; then
+  FFTCOMP=1
+fi
+
+if [ -n "$SPARSE" ]; then
+  DEFSUITE=suite_sparse
+  SEQREF="./adda_sparse" # !!! This should be adjusted
+  MPIREF="./adda_mpi_sparse" # !!! This should be adjusted
+  OCLREF="./adda_ocl_sparse" # !!! This should be adjusted
+else
+  if [ -n "$SPARSE_STANDARD" ]; then
+    DEFSUITE=suite_sparse
+  else
+    DEFSUITE=suite
+  fi
+  SEQREF="./adda_1.1" # !!! This should be adjusted
+  MPIREF="./adda_mpi_1.1.exe" # !!! This should be adjusted
+  OCLREF="./adda_ocl_1.1" # !!! This should be adjusted
+fi
 
 MODE=${1:-seq}
-DEFSUITE=suite
 SUITEFILE=${2:-$DEFSUITE}
 
 if [ $MODE == "seq" ]; then
-  EXECREF="./adda"  # !!! This should be adjusted
+  EXECREF=$SEQREF
   EXECTEST=$ADDASEQ
   IGERRREF=1
 elif [ $MODE == "mpi" ]; then
-  EXECREF="$MPIRUN ./adda_mpi" # !!! This should be adjusted
+  EXECREF="$MPIRUN $MPIREF"
   EXECTEST="$MPIRUN $ADDAMPI"
   IGERRREF=1
 elif [ $MODE == "mpi_seq" ]; then
   EXECREF=$ADDASEQ
   EXECTEST="$MPIRUN $ADDAMPI"
 elif [ $MODE == "ocl" ]; then
-  EXECREF="./adda_ocl_amd" # !!! This should be adjusted
+  EXECREF=$OCLREF
   EXECTEST=$ADDAOCL
   IGERRREF=1
 elif [ $MODE == "ocl_seq" ]; then
@@ -48,12 +75,8 @@ else
   exit 1
 fi
 
-# Set the following flag to ignore differences related to different FFT methods, such as FFT grid sizes and memory
-#FFTCOMP=1
-
-# Whether errors in running reference version of ADDA should be ignored. Useful for comparing with 
-# older versions, which lack all the tested functionality. This variable is set above for some 
-# modes.
+# Whether errors in running reference version of ADDA should be ignored. Useful for comparing with older versions, which
+# lack all the tested functionality. This variable is set above for some modes.
 # !!! It can also be explicitly set here.
 #IGERRREF=1
 
@@ -67,8 +90,8 @@ TMPREF=ref.tmp # temporary files for text processing
 TMPTEST=test.tmp
 
 function cleanfile {
-  # Processes file $1 and stores the result in $2. Basic step is removing lines matching regexp $3.
-  # Additional optional step is cutting the end of the file starting from line matching regexp $4.
+  # Processes file $1 and stores the result in $2. Basic step is removing lines matching regexp $3. Additional optional
+  # step is cutting the end of the file starting from line matching regexp $4.
   if [ -n "$4" ]; then
     awk "BEGIN{p=1} /$4/{p=0} p" $1 | grep -v -E -e "$3" > $2
   else
@@ -76,21 +99,20 @@ function cleanfile {
   fi
 }
 function numdiff {
-  # Performs comparison of two files, ignoring differences in numerical values smaller than 
-  # absolute and relative tolerances, specified by variables atol and rtol (-log of real value).
-  # The main logic is inside awk script 'diff_numeric'.
+  # Performs comparison of two files, ignoring differences in numerical values smaller than absolute and relative
+  # tolerances, specified by variables atol and rtol (-log of real value). The main logic is inside awk script
+  # 'diff_numeric'.
   diff $1 $2 | awk -f diff_numeric.awk -v abs_tol=1e-$atol -v rel_tol=1e-$rtol -  >&2
 }
 function numigndiff {
-  # Same as numdiff, but additionally ignores certain differences in input files - see description
-  # of function cleanfile.
+  # Same as numdiff, but additionally ignores certain differences in input files - see description of function cleanfile
   cleanfile $1 $TMPREF "$3" "$4"
   cleanfile $2 $TMPTEST "$3" "$4"
   diff $TMPREF $TMPTEST | awk -f diff_numeric.awk -v abs_tol=1e-$atol -v rel_tol=1e-$rtol -  >&2
 }
 function igndiff {
-  # Performs comparison of two files, ignoring differences in lines matching regexp $3 and all the
-  # differences after the line matching regexp $4.
+  # Performs comparison of two files, ignoring differences in lines matching regexp $3 and all the differences after the
+  # line matching regexp $4.
   cleanfile $1 $TMPREF "$3" "$4"
   cleanfile $2 $TMPTEST "$3" "$4"
   diff $TMPREF $TMPTEST >&2
@@ -102,10 +124,9 @@ function asmin {
   fi
 }
 function mycmp {
-  # Determines which differences are considered significant, depending on files. Use of $ at the
-  # end of ignore patterns seems to be not portable, since it does not work on Windows because of
-  # different EOL style. So we skip it for now.
-  
+  # Determines which differences are considered significant, depending on files. Use of $ at the end of ignore patterns
+  # seems to be not portable, since it does not work on Windows because of different EOL style. So we skip it for now.
+
   # First, set default numerical accuracy or adjust it based on cmdline
   if [[ "$cmdline" == -granul\ * ]]; then
     asmin atol 1
@@ -124,10 +145,15 @@ function mycmp {
     if [ $MODE == "mpi_seq" ]; then
       IGNORE="$IGNORE|^(M|Total m|Maximum m|Additional m)emory usage"
     elif [ $MODE == "ocl_seq" ]; then
-      IGNORE="$IGNORE|^Using OpenCL device|^Device memory|^Searching for OpenCL devices|^Initializing (clFFT|FFTW3)|^(M|Total m|OpenCL m)emory usage"  
+      # double definition to wrap line
+	  IGNORE="$IGNORE|^Using OpenCL device|^Device memory|^Searching for OpenCL devices|^Initializing (clFFT|FFTW3)"
+      IGNORE="$IGNORE|^(M|Total m|OpenCL m)emory usage"
     fi
     if [ -n "$FFTCOMP" ]; then
-      IGNORE="$IGNORE|^(M|Total m|OpenCL m)emory usage"
+      IGNORE="$IGNORE|^(M|Total m|OpenCL m|Maximum m)emory usage"
+    fi
+    if [ -n "$SPARSE_STANDARD" ]; then
+      IGNORE="$IGNORE|^Calculating Green's function|^Fourier transform of Dmatrix|^Initializing (clFFT|FFTW3)"
     fi
     if [[ $MODE == "mpi" || $MODE == "mpi_seq" ]]; then
       CUT="^Error posting writev, " # due to typical random errors of MPICH under Windows
@@ -148,10 +174,10 @@ function mycmp {
     if [ $MODE == "mpi_seq" ]; then
       IGNORE="$IGNORE|^The program was run on: |^(M|Total m|Maximum m|Additional m)emory usage"
     elif [ $MODE == "ocl_seq" ]; then
-      IGNORE="$IGNORE|^Using OpenCL device|^Device memory|^OpenCL FFT algorithm: by |^(M|Total m|OpenCL m)emory usage"  
+      IGNORE="$IGNORE|^Using OpenCL device|^Device memory|^OpenCL FFT algorithm: |^(M|Total m|OpenCL m)emory usage"
     fi
     if [ -n "$FFTCOMP" ]; then
-      IGNORE="$IGNORE|^(|OpenCL )FFT algorithm: by |The FFT grid is: [0-9]+x[0-9]+x[0-9]+|^(M|Total m|OpenCL m)emory usage"
+      IGNORE="$IGNORE|^(|OpenCL )FFT algorithm: |^The FFT grid is: |^(M|Total m|OpenCL m|Maximum m)emory usage"
     fi
     CUT="^Total wall time: "
     asmin rtol 4
@@ -163,15 +189,15 @@ function mycmp {
     asmin rtol 5
     numdiff $1 $2
   elif [[ "$base" == log_int_* || "$base" == "log_orient_avg" ]]; then
-    asmin atol 13
+    asmin atol 12
     numdiff $1 $2
   elif [[ "$base" == "granules" ]]; then #compare only some comments and total number of lines
     if [ `wc -l < $1` == `wc -l < $2` ]; then
       igndiff $1 $2 "^([^#]|#generated by ADDA v\.)"
-    else 
+    else
       echo "Different number of granules"
       return 1
-    fi  
+    fi
   elif [[ "$base" == *.geom || "$base" == *.dat ]]; then
     igndiff $1 $2 "generated by ADDA v\."
   else
@@ -183,17 +209,16 @@ function mydiff {
   if !(mycmp $1 $2); then
     echo "!!! Difference between files '$1' and '$2'" >&2
     # !!! This should be adjusted
-    # It is recommended to put here a GUI diff program, which allows quick estimate of the 
-    # importance of differences (e.g. when there are differences in minor digits of many numbers).
-    # However, most effort should be put in improving mycmp, so that calling the function below 
-    # will indicate significant error by itself. The following line can be commented out at all, 
-    # since function mycmp will produce (significant) diff of compared files to stderr.
+    # It is recommended to put here a GUI diff program, which allows quick estimate of the importance of differences
+    # (e.g. when there are differences in minor digits of many numbers). However, most effort should be put in improving
+    # mycmp, so that calling the function below will indicate significant error by itself. The following line can be
+    # also commented out, since function mycmp will produce (significant) diff of compared files to stderr.
     # Options include, for example: tortoisemerge, meld, vimdiff
     tortoisemerge $1 $2
   fi
 }
 
-#---------------- Prepare input files -----------------------------------------
+#---------------- Prepare input files ----------------------------------------------------------------------------------
 
 NEEDEDFILES="scat_params.dat avg_params.dat alldir_params.dat"
 NEEDEDDIRS="tables"
@@ -210,16 +235,16 @@ for dir in $NEEDEDDIRS; do
   fi
 done
 
-#---------------- Run comparison ----------------------------------------------
+#---------------- Run comparison ---------------------------------------------------------------------------------------
 
 # This in combination with stdin redirection to ADDA calls below redirects current stdin directly
 # to ADDA call. While the stdin of the code block (while...done) is kept intact. This is especially
-# relevant to mpi version, since then ADDA is executed with mpiexec, which creates forks and a big 
+# relevant to mpi version, since then ADDA is executed with mpiexec, which creates forks and a big
 # mess with stdin.
 exec 3<&0
 imax=-1
 # initialize skipping of lines up to pattern $3
-if [ -n "$3" ]; then 
+if [ -n "$3" ]; then
   skip=1
 else
   skip=0
@@ -249,7 +274,7 @@ while read -r cmpfiles cmdline; do
         echo -e "\nERROR while running \n$runref\nsee $SOREF" >&2
         exit 1
       fi
-    else 
+    else
       refok=1
     fi
     # test run
diff --git a/tests/2exec/diff_numeric.awk b/tests/2exec/diff_numeric.awk
index 9416a104..3c9f9b87 100644
--- a/tests/2exec/diff_numeric.awk
+++ b/tests/2exec/diff_numeric.awk
@@ -1,8 +1,6 @@
-# Processes the diff of two files and ignores the small errors in numbers -
-# either small absolute or relative errors - controlled by 'abs_tol' and 'rel_tol'.
-# These values can be either specified outside by e.g. '-v abs_tol=1e-10' or assume
-# default values specified below.
-# Numbers should be separated from other text by spaces, '=', ',', or brackets -
+# Processes the diff of two files and ignores the small errors in numbers - either small absolute or relative errors -
+# controlled by 'abs_tol' and 'rel_tol'. These values can be either specified outside by e.g. '-v abs_tol=1e-10' or
+# assume default values specified below. Numbers should be separated from other text by spaces, '=', ',', or brackets -
 # controlled by 'FS'. Output happens only if differences are found.
 
 BEGIN {
@@ -23,15 +21,6 @@ function relerr(a,b,  c) {
 	return (c==0) ? 0 : abs(a-b)/c
 }
 
-/^[0-9]/ {
-	if (i1>i2) {
-		printf "First file contains unmatched line:\n%s\n", ref[i2+1]
-		exit 3
-	}
-	i1=0
-	i2=0
-}
-
 /^< / {
 	i1++
 	ref[i1]=substr($0,3)
@@ -55,9 +44,26 @@ function relerr(a,b,  c) {
 				exit 2
 			}
 			else if (abs(p1[j]-p2[j])>abs_tol && relerr(p1[j],p2[j])>rel_tol) {
+				# wrapping the following line produces some errors on Windows, so keep it like this
 				printf "Significant difference between numbers: '%s' and '%s' (abs_tol='%s', rel_tol='%s')\n", p1[j], p2[j], abs_tol, rel_tol
 				exit 1
 			}
 		}
 	}
+}
+
+/^[0-9]/ {
+	if (i1>i2) {
+		printf "First file contains unmatched line:\n%s\n", ref[i2+1]
+		exit 3
+	}
+	i1=0
+	i2=0
+}
+
+END {
+	if (i1>i2) {
+		printf "First file contains unmatched line:\n%s\n", ref[i2+1]
+		exit 3
+	}
 }
\ No newline at end of file
diff --git a/tests/2exec/ellipsoid.geom b/tests/2exec/ellipsoid.geom
index d8d91072..98c0807b 100644
--- a/tests/2exec/ellipsoid.geom
+++ b/tests/2exec/ellipsoid.geom
@@ -1,283 +1,203 @@
-#generated by ADDA v.1.0a3
+#generated by ADDA v.1.1
 #shape: 'ellipsoid'
-#box size: 16x8x4
-6 1 0
-7 1 0
-8 1 0
-9 1 0
+#box size: 8x4x12
+3 1 0
+4 1 0
+3 2 0
 4 2 0
-5 2 0
-6 2 0
-7 2 0
-8 2 0
-9 2 0
-10 2 0
-11 2 0
-3 3 0
-4 3 0
-5 3 0
-6 3 0
-7 3 0
-8 3 0
-9 3 0
-10 3 0
-11 3 0
-12 3 0
-3 4 0
-4 4 0
-5 4 0
-6 4 0
-7 4 0
-8 4 0
-9 4 0
-10 4 0
-11 4 0
-12 4 0
-4 5 0
-5 5 0
-6 5 0
-7 5 0
-8 5 0
-9 5 0
-10 5 0
-11 5 0
-6 6 0
-7 6 0
-8 6 0
-9 6 0
-5 0 1
-6 0 1
-7 0 1
-8 0 1
-9 0 1
-10 0 1
 2 1 1
 3 1 1
 4 1 1
 5 1 1
-6 1 1
-7 1 1
-8 1 1
-9 1 1
-10 1 1
-11 1 1
-12 1 1
-13 1 1
-1 2 1
 2 2 1
 3 2 1
 4 2 1
 5 2 1
-6 2 1
-7 2 1
-8 2 1
-9 2 1
-10 2 1
-11 2 1
-12 2 1
-13 2 1
-14 2 1
-0 3 1
-1 3 1
-2 3 1
-3 3 1
-4 3 1
-5 3 1
-6 3 1
-7 3 1
-8 3 1
-9 3 1
-10 3 1
-11 3 1
-12 3 1
-13 3 1
-14 3 1
-15 3 1
-0 4 1
-1 4 1
-2 4 1
-3 4 1
-4 4 1
-5 4 1
-6 4 1
-7 4 1
-8 4 1
-9 4 1
-10 4 1
-11 4 1
-12 4 1
-13 4 1
-14 4 1
-15 4 1
-1 5 1
-2 5 1
-3 5 1
-4 5 1
-5 5 1
-6 5 1
-7 5 1
-8 5 1
-9 5 1
-10 5 1
-11 5 1
-12 5 1
-13 5 1
-14 5 1
-2 6 1
-3 6 1
-4 6 1
-5 6 1
-6 6 1
-7 6 1
-8 6 1
-9 6 1
-10 6 1
-11 6 1
-12 6 1
-13 6 1
-5 7 1
-6 7 1
-7 7 1
-8 7 1
-9 7 1
-10 7 1
-5 0 2
-6 0 2
-7 0 2
-8 0 2
-9 0 2
-10 0 2
+3 0 2
+4 0 2
+1 1 2
 2 1 2
 3 1 2
 4 1 2
 5 1 2
 6 1 2
-7 1 2
-8 1 2
-9 1 2
-10 1 2
-11 1 2
-12 1 2
-13 1 2
 1 2 2
 2 2 2
 3 2 2
 4 2 2
 5 2 2
 6 2 2
-7 2 2
-8 2 2
-9 2 2
-10 2 2
-11 2 2
-12 2 2
-13 2 2
-14 2 2
-0 3 2
-1 3 2
-2 3 2
 3 3 2
 4 3 2
-5 3 2
-6 3 2
-7 3 2
-8 3 2
-9 3 2
-10 3 2
-11 3 2
-12 3 2
-13 3 2
-14 3 2
-15 3 2
-0 4 2
-1 4 2
-2 4 2
-3 4 2
-4 4 2
-5 4 2
-6 4 2
-7 4 2
-8 4 2
-9 4 2
-10 4 2
-11 4 2
-12 4 2
-13 4 2
-14 4 2
-15 4 2
-1 5 2
-2 5 2
-3 5 2
-4 5 2
-5 5 2
-6 5 2
-7 5 2
-8 5 2
-9 5 2
-10 5 2
-11 5 2
-12 5 2
-13 5 2
-14 5 2
-2 6 2
-3 6 2
-4 6 2
-5 6 2
-6 6 2
-7 6 2
-8 6 2
-9 6 2
-10 6 2
-11 6 2
-12 6 2
-13 6 2
-5 7 2
-6 7 2
-7 7 2
-8 7 2
-9 7 2
-10 7 2
+2 0 3
+3 0 3
+4 0 3
+5 0 3
+1 1 3
+2 1 3
+3 1 3
+4 1 3
+5 1 3
 6 1 3
-7 1 3
-8 1 3
-9 1 3
+1 2 3
+2 2 3
+3 2 3
 4 2 3
 5 2 3
 6 2 3
-7 2 3
-8 2 3
-9 2 3
-10 2 3
-11 2 3
+2 3 3
 3 3 3
 4 3 3
 5 3 3
-6 3 3
-7 3 3
-8 3 3
-9 3 3
-10 3 3
-11 3 3
-12 3 3
-3 4 3
-4 4 3
-5 4 3
-6 4 3
-7 4 3
-8 4 3
-9 4 3
-10 4 3
-11 4 3
-12 4 3
-4 5 3
-5 5 3
-6 5 3
-7 5 3
-8 5 3
-9 5 3
-10 5 3
-11 5 3
-6 6 3
-7 6 3
-8 6 3
-9 6 3
+2 0 4
+3 0 4
+4 0 4
+5 0 4
+0 1 4
+1 1 4
+2 1 4
+3 1 4
+4 1 4
+5 1 4
+6 1 4
+7 1 4
+0 2 4
+1 2 4
+2 2 4
+3 2 4
+4 2 4
+5 2 4
+6 2 4
+7 2 4
+2 3 4
+3 3 4
+4 3 4
+5 3 4
+1 0 5
+2 0 5
+3 0 5
+4 0 5
+5 0 5
+6 0 5
+0 1 5
+1 1 5
+2 1 5
+3 1 5
+4 1 5
+5 1 5
+6 1 5
+7 1 5
+0 2 5
+1 2 5
+2 2 5
+3 2 5
+4 2 5
+5 2 5
+6 2 5
+7 2 5
+1 3 5
+2 3 5
+3 3 5
+4 3 5
+5 3 5
+6 3 5
+1 0 6
+2 0 6
+3 0 6
+4 0 6
+5 0 6
+6 0 6
+0 1 6
+1 1 6
+2 1 6
+3 1 6
+4 1 6
+5 1 6
+6 1 6
+7 1 6
+0 2 6
+1 2 6
+2 2 6
+3 2 6
+4 2 6
+5 2 6
+6 2 6
+7 2 6
+1 3 6
+2 3 6
+3 3 6
+4 3 6
+5 3 6
+6 3 6
+2 0 7
+3 0 7
+4 0 7
+5 0 7
+0 1 7
+1 1 7
+2 1 7
+3 1 7
+4 1 7
+5 1 7
+6 1 7
+7 1 7
+0 2 7
+1 2 7
+2 2 7
+3 2 7
+4 2 7
+5 2 7
+6 2 7
+7 2 7
+2 3 7
+3 3 7
+4 3 7
+5 3 7
+2 0 8
+3 0 8
+4 0 8
+5 0 8
+1 1 8
+2 1 8
+3 1 8
+4 1 8
+5 1 8
+6 1 8
+1 2 8
+2 2 8
+3 2 8
+4 2 8
+5 2 8
+6 2 8
+2 3 8
+3 3 8
+4 3 8
+5 3 8
+3 0 9
+4 0 9
+1 1 9
+2 1 9
+3 1 9
+4 1 9
+5 1 9
+6 1 9
+1 2 9
+2 2 9
+3 2 9
+4 2 9
+5 2 9
+6 2 9
+3 3 9
+4 3 9
+2 1 10
+3 1 10
+4 1 10
+5 1 10
+2 2 10
+3 2 10
+4 2 10
+5 2 10
+3 1 11
+4 1 11
+3 2 11
+4 2 11
diff --git a/tests/2exec/sphere.geom b/tests/2exec/sphere.geom
new file mode 100644
index 00000000..76c32bfb
--- /dev/null
+++ b/tests/2exec/sphere.geom
@@ -0,0 +1,283 @@
+#generated by ADDA v.1.0
+#shape: 'sphere'
+#box size: 8x8x8
+3 2 0
+4 2 0
+2 3 0
+3 3 0
+4 3 0
+5 3 0
+2 4 0
+3 4 0
+4 4 0
+5 4 0
+3 5 0
+4 5 0
+2 1 1
+3 1 1
+4 1 1
+5 1 1
+1 2 1
+2 2 1
+3 2 1
+4 2 1
+5 2 1
+6 2 1
+1 3 1
+2 3 1
+3 3 1
+4 3 1
+5 3 1
+6 3 1
+1 4 1
+2 4 1
+3 4 1
+4 4 1
+5 4 1
+6 4 1
+1 5 1
+2 5 1
+3 5 1
+4 5 1
+5 5 1
+6 5 1
+2 6 1
+3 6 1
+4 6 1
+5 6 1
+3 0 2
+4 0 2
+1 1 2
+2 1 2
+3 1 2
+4 1 2
+5 1 2
+6 1 2
+1 2 2
+2 2 2
+3 2 2
+4 2 2
+5 2 2
+6 2 2
+0 3 2
+1 3 2
+2 3 2
+3 3 2
+4 3 2
+5 3 2
+6 3 2
+7 3 2
+0 4 2
+1 4 2
+2 4 2
+3 4 2
+4 4 2
+5 4 2
+6 4 2
+7 4 2
+1 5 2
+2 5 2
+3 5 2
+4 5 2
+5 5 2
+6 5 2
+1 6 2
+2 6 2
+3 6 2
+4 6 2
+5 6 2
+6 6 2
+3 7 2
+4 7 2
+2 0 3
+3 0 3
+4 0 3
+5 0 3
+1 1 3
+2 1 3
+3 1 3
+4 1 3
+5 1 3
+6 1 3
+0 2 3
+1 2 3
+2 2 3
+3 2 3
+4 2 3
+5 2 3
+6 2 3
+7 2 3
+0 3 3
+1 3 3
+2 3 3
+3 3 3
+4 3 3
+5 3 3
+6 3 3
+7 3 3
+0 4 3
+1 4 3
+2 4 3
+3 4 3
+4 4 3
+5 4 3
+6 4 3
+7 4 3
+0 5 3
+1 5 3
+2 5 3
+3 5 3
+4 5 3
+5 5 3
+6 5 3
+7 5 3
+1 6 3
+2 6 3
+3 6 3
+4 6 3
+5 6 3
+6 6 3
+2 7 3
+3 7 3
+4 7 3
+5 7 3
+2 0 4
+3 0 4
+4 0 4
+5 0 4
+1 1 4
+2 1 4
+3 1 4
+4 1 4
+5 1 4
+6 1 4
+0 2 4
+1 2 4
+2 2 4
+3 2 4
+4 2 4
+5 2 4
+6 2 4
+7 2 4
+0 3 4
+1 3 4
+2 3 4
+3 3 4
+4 3 4
+5 3 4
+6 3 4
+7 3 4
+0 4 4
+1 4 4
+2 4 4
+3 4 4
+4 4 4
+5 4 4
+6 4 4
+7 4 4
+0 5 4
+1 5 4
+2 5 4
+3 5 4
+4 5 4
+5 5 4
+6 5 4
+7 5 4
+1 6 4
+2 6 4
+3 6 4
+4 6 4
+5 6 4
+6 6 4
+2 7 4
+3 7 4
+4 7 4
+5 7 4
+3 0 5
+4 0 5
+1 1 5
+2 1 5
+3 1 5
+4 1 5
+5 1 5
+6 1 5
+1 2 5
+2 2 5
+3 2 5
+4 2 5
+5 2 5
+6 2 5
+0 3 5
+1 3 5
+2 3 5
+3 3 5
+4 3 5
+5 3 5
+6 3 5
+7 3 5
+0 4 5
+1 4 5
+2 4 5
+3 4 5
+4 4 5
+5 4 5
+6 4 5
+7 4 5
+1 5 5
+2 5 5
+3 5 5
+4 5 5
+5 5 5
+6 5 5
+1 6 5
+2 6 5
+3 6 5
+4 6 5
+5 6 5
+6 6 5
+3 7 5
+4 7 5
+2 1 6
+3 1 6
+4 1 6
+5 1 6
+1 2 6
+2 2 6
+3 2 6
+4 2 6
+5 2 6
+6 2 6
+1 3 6
+2 3 6
+3 3 6
+4 3 6
+5 3 6
+6 3 6
+1 4 6
+2 4 6
+3 4 6
+4 4 6
+5 4 6
+6 4 6
+1 5 6
+2 5 6
+3 5 6
+4 5 6
+5 5 6
+6 5 6
+2 6 6
+3 6 6
+4 6 6
+5 6 6
+3 2 7
+4 2 7
+2 3 7
+3 3 7
+4 3 7
+5 3 7
+2 4 7
+3 4 7
+4 4 7
+5 4 7
+3 5 7
+4 5 7
diff --git a/tests/2exec/sphere4.geom b/tests/2exec/sphere4.geom
new file mode 100644
index 00000000..4aa65c6f
--- /dev/null
+++ b/tests/2exec/sphere4.geom
@@ -0,0 +1,35 @@
+#generated by ADDA v.1.0
+#shape: 'sphere'
+#box size: 4x4x4
+1 1 0
+2 1 0
+1 2 0
+2 2 0
+1 0 1
+2 0 1
+0 1 1
+1 1 1
+2 1 1
+3 1 1
+0 2 1
+1 2 1
+2 2 1
+3 2 1
+1 3 1
+2 3 1
+1 0 2
+2 0 2
+0 1 2
+1 1 2
+2 1 2
+3 1 2
+0 2 2
+1 2 2
+2 2 2
+3 2 2
+1 3 2
+2 3 2
+1 1 3
+2 1 3
+1 2 3
+2 2 3
diff --git a/tests/2exec/suite b/tests/2exec/suite
index e6d6a002..40c52b4d 100644
--- a/tests/2exec/suite
+++ b/tests/2exec/suite
@@ -1,7 +1,6 @@
-#------------------------- Variable definitons ---------------------------------
-# All variable names should be enclosed in ';' and start the line - its value
-# is assigned to the rest of the line. It can be defined through other variables
-# defined below. Each variable must be defined only once.
+#------------------------- Variable definitons -------------------------------------------------------------------------
+# All variable names should be enclosed in ';' and start the line - its value is assigned to the rest of the line. It
+# can be defined through other variables defined below. Each variable must be defined only once.
 
 # default for most tests
 ;mgn; ;m; ;g; ;n;
@@ -22,13 +21,13 @@
 ;n; -ntheta 5
 ;p; -prop 1 2 3
 ;se; -shape ellipsoid 0.5 1.5
-#----------------------------- List of tests -----------------------------------
-# The format is the following: '<list,of,files,to,compare> <cmdline>'
-# the first one is coma-separated list of files to compare or 'all' (which compares 
-# all produced files. <cmdline> is everything after the first space and it is passed
+#----------------------------- List of tests ---------------------------------------------------------------------------
+# The format is the following: '<list,of,files,to,compare> <cmdline>' the first one is coma-separated list of files to
+# compare or 'all' (which compares all produced files. <cmdline> is everything after the first space and it is passed
 # directly to ADDA.
 
 all
+
 # testing of different grids - relevant for FFT methods
 # to remove redundant warnings for ocl_seq, only (2,3,5) numbers are used
 all -grid 2 ;smn;
@@ -97,8 +96,8 @@ all -eps 10 ;mgn;
 all -h eq_rad
 all -eq_rad 1 ;mgn;
 
-# it is hard to make meaningful comparison of stdout and log for random placement of granules
-# however, optical properties are compared using rather large tolerances
+# It is hard to make meaningful comparison of stdout and log for random placement of granules. However, optical
+# properties are compared using rather large tolerances
 all -h granul
 CrossSec-Y,CrossSec-X,mueller -granul 0.2 0.5 2 -size 8 -shape coated 0.5 ;3m; ;n;
 CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n;
@@ -208,7 +207,7 @@ all -h scat_matr
 all -scat_matr muel ;mgn;
 all -scat_matr ampl ;mgn;
 all -scat_matr both ;mgn;
-all -scat_matr none ;m; ;g; 
+all -scat_matr none ;m; ;g;
 
 all -h shape
 all -h shape axisymmetric
@@ -253,7 +252,7 @@ all -shape sphere ;mgn;
 all -h shape spherebox
 all -shape spherebox 0.5 ;2mgn;
 
-all -h size 
+all -h size
 all -size 8 ;mgn;
 
 all -h store_beam
@@ -276,7 +275,7 @@ all -store_int_field ;se; ;mgn;
 all -h store_scat_grid
 all -store_scat_grid ;sep; ;mgn;
 
-all -h sym 
+all -h sym
 all -sym auto ;mgn;
 all -sym no ;mgn;
 all -sym enf ;mgn;
diff --git a/tests/2exec/suite_sparse b/tests/2exec/suite_sparse
new file mode 100644
index 00000000..4b67bed8
--- /dev/null
+++ b/tests/2exec/suite_sparse
@@ -0,0 +1,275 @@
+#------------------------- Variable definitons -------------------------------------------------------------------------
+# All variable names should be enclosed in ';' and start the line - its value is assigned to the rest of the line. It
+# can be defined through other variables defined below. Each variable must be defined only once.
+
+# default for most tests
+;mgn; ;m; ;g; ;n;
+# default for two-domain particles
+;2mgn; ;2m; ;g; ;n;
+# for large computations (e.g. orientation averaging)
+;mg4n; ;m; -shape read sphere4.geom -sym enf ;n;
+# default addition to make particle completely non-symmetric
+;sep; ;se; ;p;
+# default addition for non-spherical shapes
+;mn; ;m; ;n;
+
+# Elementary variables
+;m; -m 1.1 0.1
+;2m; -m 1.1 0.1 1.2 0.2
+;3m; -m 1.05 0.1 1.1 0.2 1.2 0.3
+;g; -shape read sphere.geom -sym enf
+;n; -ntheta 5
+;p; -prop 1 2 3
+;se; -shape read ellipsoid.geom
+
+#----------------------------- List of tests ---------------------------------------------------------------------------
+# The format is the following: '<list,of,files,to,compare> <cmdline>' the first one is coma-separated list of files to
+# compare or 'all' (which compares all produced files. <cmdline> is everything after the first space and it is passed
+# directly to ADDA.
+
+all ;g;
+
+all -h alldir_inp
+all -alldir_inp adp.dat -Csca ;mgn;
+
+all -h anisotr
+all -anisotr ;3m; ;g; ;n;
+
+all -h asym
+all -asym ;sep; ;mn;
+
+all -h beam
+all -h beam plane
+all -beam plane ;mgn;
+all -h beam lminus
+all -beam lminus 2 1 2 3 ;mgn;
+all -h beam davis3
+all -beam davis3 2 1 2 3 ;mgn;
+all -h beam barton5
+all -beam barton5 2 1 2 3 ;mgn;
+all -h beam read
+all -beam read IncBeam-Y IncBeam-X ;se; ;mn;
+
+all -h chpoint
+all -chpoint 1s -eps 3 ;mgn;
+all -h chp_type
+all -chp_type normal -chpoint 1s -eps 3 ;mgn;
+all -chp_dir chp_tmp -chp_type regular -chpoint 1s -eps 3 ;mgn;
+all -h chp_dir
+all -chp_dir chp_tmp -chp_type always -eps 3 ;mgn;
+all -h chp_load
+all -chp_dir chp_tmp -chp_load ;mgn;
+
+all -h Cpr
+all -Cpr ;sep; ;mn;
+
+all -h Csca
+all -Csca ;se; ;mn;
+
+all -h dir
+
+all -h dpl
+all -dpl 20 ;mgn;
+
+all -h eps
+all -eps 10 ;mgn;
+
+all -h eq_rad
+all -eq_rad 1 ;mgn;
+
+# It is hard to make meaningful comparison of stdout and log for random placement of granules. However, optical
+# properties are compared using rather large tolerances
+#all -h granul
+#CrossSec-Y,CrossSec-X,mueller -granul 0.2 0.5 2 -size 8 -shape coated 0.5 ;3m; ;n;
+#CrossSec-Y,CrossSec-X,mueller -granul 0.2 2 2 -size 8 -shape coated 0.5 ;3m; ;n;
+
+all -h grid
+#all -grid 4 6 8 ;m; ;n;
+
+all -h
+all -h h
+
+all -h init_field
+all -init_field auto ;mgn;
+all -init_field zero ;mgn;
+all -init_field inc ;mgn;
+#all -init_field wkb ;mgn;
+
+all -h int
+all -int fcd ;mgn;
+all -int fcd_st ;mgn;
+all -int igt ;mg4n;
+all -int igt 3 ;mg4n;
+all -int igt 3 0.01 ;mg4n;
+all -int igt_so ;mgn;
+all -int poi ;mgn;
+all -int so ;mgn;
+
+all -h iter
+all -iter bicg ;mgn;
+all -iter bicgstab ;mgn;
+all -iter cgnr ;mgn;
+all -iter csym ;mgn;
+all -iter qmr ;mgn;
+all -iter qmr2 ;mgn;
+
+all -h jagged
+all -jagged 2 ;mg4n;
+
+all -h lambda
+all -lambda 1 ;mgn;
+
+all -h m
+all -m 1.2 0.2 ;g; ;n;
+
+all -h maxiter
+all -maxiter 5 ;mgn;
+
+all -h no_reduced_fft
+all -no_reduced_fft ;mgn;
+
+all -h no_vol_cor
+all -no_vol_cor -size 3 ;mgn;
+
+all -h ntheta
+all -ntheta 10 ;m; ;g;
+
+all -h opt
+all -opt speed ;mgn;
+all -opt mem ;mgn;
+
+all -h orient
+all -orient 10 20 30 ;se; ;mn;
+all -orient avg ;se; ;mn;
+all -orient avg ap.dat ;se; ;mn;
+
+all -h phi_integr
+all -phi_integr 31 ;sep; ;mn;
+
+all -h pol
+all -pol cldr ;p; ;mgn;
+all -pol cm ;mgn;
+all -pol dgf ;mgn;
+all -pol fcd ;mgn;
+all -pol igt_so ;mgn;
+all -pol ldr ;p; ;mgn;
+all -pol ldr avgpol ;p; ;mgn;
+all -pol rrc ;mgn;
+all -pol so ;p; ;mgn;
+
+all -h prognosis
+all -prognosis ;g;
+
+all -h prop
+all -prop 5 1 3 ;se; ;mn;
+
+all -h recalc_resid
+all -recalc_resid ;mgn;
+
+#all -h save_geom
+#all -save_geom -shape read coated.geom -prognosis
+#all -h sg_format
+#all -save_geom ;se; -prognosis -sg_format text
+#all -save_geom ;se; -prognosis -sg_format text_ext
+#all -save_geom ;se; -prognosis -sg_format ddscat6
+#all -save_geom ;se; -prognosis -sg_format ddscat7
+
+all -h scat
+all -scat dr ;mgn;
+all -scat fin ;mgn;
+all -scat igt_so ;mgn;
+all -scat so ;mgn;
+
+all -h scat_grid_inp
+all -scat_grid_inp sp.dat ;mgn;
+
+all -h scat_matr
+all -scat_matr muel ;mgn;
+all -scat_matr ampl ;mgn;
+all -scat_matr both ;mgn;
+all -scat_matr none ;m; ;g; 
+
+all -h shape
+#all -h shape axisymmetric
+#all -shape axisymmetric 196.txt ;mgn;
+#all -h shape bicoated
+#all -shape bicoated 3 0.5 ;2mgn;
+#all -h shape biellipsoid
+#all -shape biellipsoid 0.5 1.5 0.75 0.5 1.5 ;2mgn;
+#all -h shape bisphere
+#all -shape bisphere 2 ;mgn;
+#all -h shape box
+#all -shape box ;mgn;
+#all -shape box 0.5 1.5 ;mgn;
+#all -h shape capsule
+#all -shape capsule 1.5 ;mgn;
+#all -h shape chebyshev
+#all -shape chebyshev 0.3 5 ;mgn;
+#all -h shape coated
+#all -shape coated 0.5 ;2mgn;
+#all -shape coated 0.4 0.1 0.15 0.2 ;2mgn;
+#all -h shape cylinder
+#all -shape cylinder 1 ;mgn;
+#all -h shape egg
+#all -shape egg 0.5 0.2 ;mgn;
+#all -h shape ellipsoid
+#all -shape ellipsoid 0.25 2 ;mgn;
+#all -h shape line
+#all -shape line -grid 16 ;m; ;n;
+#all -h shape plate
+#all -shape plate 0.5 ;mgn;
+#all -h shape prism
+#all -shape prism 5 1.5 ;mgn;
+#all -h shape rbc
+#all -shape rbc 0.3 0.1 0.3 ;mgn;
+all -h shape read
+all -shape read ellipsoid.geom ;m; ;n;
+all -shape read coated.geom ;2m; ;n;
+all -shape read ell_ddscat6.dat ;m; ;n;
+all -shape read ell_ddscat7.dat ;m; ;n;
+#all -h shape sphere
+#all -shape sphere ;mgn;
+#all -h shape spherebox
+#all -shape spherebox 0.5 ;2mgn;
+
+all -h size 
+all -size 8 ;mgn;
+
+all -h store_beam
+all -store_beam ;se; ;mn;
+
+all -h store_dip_pol
+all -store_dip_pol ;se; ;mn;
+
+all -h store_force
+all -store_force -Cpr ;sep; ;mn;
+all -store_force ;mgn;
+
+#all -h store_grans
+#granules -store_grans -granul 0.2 1 -size 4 ;2mgn;
+
+all -h store_int_field
+all -store_int_field ;se; ;mn;
+
+all -h store_scat_grid
+all -store_scat_grid ;sep; ;mn;
+
+all -h sym 
+all -sym auto ;mn; -shape read sphere.geom
+all -sym no ;mn; -shape read sphere.geom
+all -sym enf ;mn; -shape read sphere.geom
+all -sym auto ;sep; ;mn;
+all -sym no ;sep; ;mn;
+all -sym enf ;sep; ;mn;
+
+all -h test
+all -test ;mgn;
+
+all -h V
+all -V
+
+all -h vec
+all -vec ;sep; ;mn;
+
+all -h yz
+all -yz -store_scat_grid ;mgn;