🎉 introducing 2d and LGF kernel support

vortexlab-uclouvain · Nov 24, 2019 · 3b1ae21 · 3b1ae21
2 parents 60352eb + 7eb3075
commit 3b1ae21
Show file tree

Hide file tree

Showing 297 changed files with 3,351 additions and 1,472 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,8 @@ prof
 .vscode
 doc/html
 doc/latex
+*.d
+*.in
 *.x
 *.xlsx
 *.log

diff --git a/Makefile b/Makefile
@@ -68,6 +68,13 @@ ifneq (,$(findstring -DHAVE_METIS,$(CXXFLAGS)))
 	LIB+= -L$(METIS_LIB) -lmetis  -Wl,-rpath,$(METIS_LIB)
 endif
 
+#-----------------------------------------------------------------------------
+# LGF SPECIAL CASE
+# The LGF kernel data (*.ker) ships in the `kernel` folder of the source tree.
+# The install targets copy it into $(PREFIX)/include, and KERNEL_PATH is
+# defined to that install location.
+LGF_PATH=$(abspath $(PREFIX)/include)
+DEF += -DKERNEL_PATH=${LGF_PATH}
+# Glob the source tree (relative to the directory make runs in), not the
+# install prefix: $(PREFIX)/kernel only exists when PREFIX is the source dir,
+# so the list would be empty for any other prefix and the install would fail.
+LGF_DATA := $(wildcard kernel/*.ker)
+
 #-----------------------------------------------------------------------------
 ## add the wanted folders - common folders
 SRC := $(notdir $(wildcard $(SRC_DIR)/*.cpp))
@@ -129,13 +136,15 @@ install_dynamic: lib_dynamic
 	@cp $(TARGET_LIB_A2A).so $(PREFIX)/lib
 	@cp $(TARGET_LIB_NB).so $(PREFIX)/lib
 	@cp $(API) $(PREFIX)/include
+	@cp $(LGF_DATA) $(PREFIX)/include
 
 install_static: lib_static 
 	@mkdir -p $(PREFIX)/lib
 	@mkdir -p $(PREFIX)/include
 	@cp $(TARGET_LIB_A2A).a $(PREFIX)/lib
 	@cp $(TARGET_LIB_NB).a $(PREFIX)/lib
 	@cp $(API) $(PREFIX)/include
+	@cp $(LGF_DATA) $(PREFIX)/include
 
 # for a standard installation, do the dynamic link	
 install: info install_static
@@ -163,6 +172,7 @@ info: logo
 	$(info compil. flags = $(CXXFLAGS) $(INC) $(DEF) -fPIC -MMD)
 	$(info linker flags = -shared $(LDFLAGS))
 	$(info using arch file = $(ARCH_FILE) )
+	$(info LGF path = $(LGF_PATH) )
 	$(info ------------)
 	$(info FFTW:)
 	$(info - include: -I$(FFTW_INC) )
@@ -177,6 +187,7 @@ info: logo
 	$(info - OBJ A2A = $(OBJ_A2A))
 	$(info - OBJ NB = $(OBJ_NB))
 	$(info - DEP = $(DEP))
+	$(info - LGF_DATA = $(LGF_DATA))
 	$(info ------------)
 
 .NOTPARALLEL: logo

diff --git a/README.md b/README.md
@@ -15,13 +15,21 @@ For the list of all the contributors to the development of FLUPS, description an
 If you use FLUPS, please cite it as follows in your publications:
 - Caprace et al., **FLUPS - A Fourier-based Library of Unbounded Poisson Solvers**, SIAM Journal on Scientific Computing, 2019 (under review)
 
+### Why should you use FLUPS?
+- You can solve the Poisson equation on rectangular, uniformly distributed 2D/3D grids;
+- You can use any boundary conditions, including truly unbounded boundary conditions and semi-unbounded conditions;
+- You can solve the same Poisson problem many times at low cost using precomputed Green's function and communication patterns;
+- You can use threads and/or MPI to speed up the execution;
+- You can use the built-in profiler to optimize the execution speed;
+- You can use any part of the library on its own, especially the pre-computed communications and the FFTs;
+- You can apply filters or do any computation you want while in the Fourier space.
 
 ### Installation
 
 FLUPS is a C++ library, with an API in C.
-The compilation of FLUPS was tested with Intel compilers and GCC.
+The compilation of FLUPS was tested with Intel compilers and GCC.
 
-#### 1. Dependencies
+#### Dependencies
 First, you need to install the dependencies, typically using the following configuration commands (for the intel compilers)
 - FFTW (> v3.3.8) in the `fftw_prefix` dir:
 ```shell
@@ -31,9 +39,8 @@ CC=icc CXX=icpc FC=ifort ./configure --prefix=fftw_prefix --enable-mpi --enable-
 ```shell
 CC=mpiicc CXX=mpiicpc FC=mpif90 ./configure --prefix=hdf5_prefix --enable-build-mode=production --enable-parallel
 ```
-- METIS (> v5.1.0) - only if compiling with `REORDER_RANKS`
 
-#### 2. The Library
+#### Compilation
 You now need to create an architecture/compiler-dependent file in `make_arch` to define `CXX`, `CXXFLAGS`, `FFTWDIR` and `HDF5DIR`.
 For example:
 ```makefile
@@ -57,15 +64,21 @@ HDF5_LIB := ${HDF5_DIR}/lib
 HDF5_INC := ${HDF5_DIR}/include
 ```
 By default, the Makefile looks for `-lfftw3_openmp -lfftw3` and `-lhdf5`. You can override this by changing the variables `FFTW_LIBNAME` and `HDF5_LIBNAME` in your arch file.
-
-Then you need to reference the created configuration file and the prefix you wish to :
-```shell
-export ARCH_FILE=make_arch/my_arch_dependent_file
+For example:
+```makefile
+FFTW_LIBNAME := -lfftw3_omp -lfftw3
+HDF5_LIBNAME := -lhdf5_openmpi
 ```
 
+Then you need to reference the created configuration file (using `ARCH_FILE`) and the prefix in which you wish to install the library (using `PREFIX`).
+You can either `export` the variables or reference them later while calling the Makefile.
+If no prefix is given, `make install` uses the current working directory to install the library.
+
 Finally, go to the main folder and type the compilation command.
-- Check the compilation details before doing the installation
+- Check the compilation details before doing the installation
 ```shell
+export ARCH_FILE=make_arch/my_arch_dependent_file
+export PREFIX=/my/lib/prefix
 make info
 ## or
 ARCH_FILE=make_arch/my_arch_dependent_file PREFIX=/my/lib/prefix make info
@@ -77,31 +90,59 @@ make install
 ARCH_FILE=make_arch/my_arch_dependent_file PREFIX=/my/lib/prefix make install
 ```
 
-#### 3. Documentation
+:warning: you must **install** the library. Indeed, we copy some data required by the solver.
+If you wish to keep everything local, simply do not give a prefix and the current directory will be selected.
+
+#### Documentation
 
-The documentation is built with Doxygen.
-To build the documentation, please go to the `./doc` subfolder and type `doxygen`.
+The documentation is built using Doxygen.
+To build the documentation, go to the `./doc` subfolder and type `doxygen`.
 
-#### 4. Compilation flags
+#### Available compilation flags
 Here is an exhaustive list of the compilation flags that can be used to change the behavior of the code. To use `MY_FLAG`, simply add `-DMY_FLAG` to the variable `CXXFLAGS` in your `make_arch`.
 - `DUMP_DBG`: if specified, the solver will I/O fields using the HDF5 library.
 - `COMM_NONBLOCK`: if specified, the code will use the non-blocking communication pattern instead of the all to all version.
 - `PERF_VERBOSE`: requires an extensive I/O on the communication pattern used. For performance tuning and debugging purpose only.
 - `NDEBUG`: use this flag to bypass various checks inside the library
 - `PROF`: allows you to use the built-in profiler to have a detailed view of the timing in each part of the solve. Make sure you have created a folder ```./prof``` next to your executable.
 - `REORDER_RANKS`: try to reorder the MPI ranks based on the precomputed communication graph, using a call to MPI_Dist_graph. We recommend the use of this feature when the number of processes > 128 and the nodes are allocated exclusively for your application, especially on fully unbounded domains.
-- `HAVE_METIS`: in combination with REORDER_RANKS, use METIS instead of MPI_Dist_graph to partition the call graph based on the allocated ressources
+- `HAVE_METIS`: in combination with REORDER_RANKS, use METIS instead of MPI_Dist_graph to partition the call graph based on the allocated resources. You must hence install METIS for this functionality.
 
 :warning: You may also change the memory alignment and the FFTW planner flag in the `flups.h` file.
 
 ### How to use a solver?
 
 #### Detailed reference
+The scientific background of the library is explained in "Caprace et al., **FLUPS - A Fourier-based Library of Unbounded Poisson Solvers**, SIAM Journal on Scientific Computing, 2019 (under review)".
 
-The scientific background of the library is explained in "Caprace et al., **FLUPS - A Fourier-based Library of Unbounded Poisson Solvers**, SIAM Journal on Scientific Computing, 2019 (under review)"
+A detailed description of the API is provided in the documentation (@ref flups.h), as well as many implementation details.
+
+#### Memory layout
+In this project we chose to lay out the memory the **Fortran** way, even though the code is C/C++.
+So, the memory is stored as a single array of size `n[0] * n[1] * n[2]`.
+The fastest-varying index is the one along `n[0]`, then `n[1]` and finally `n[2]`.
+
+We chose this layout so that the 3D code can be reused in a 2D framework.
+Indeed, having the last dimension as the slowest-varying index does not penalize the way the loops are written.
 
-For the detailed specifications of the API, have a look at @ref flups.h .
+As an example, here is how we access the memory:
+
+```cpp
+double* data =(double*) flups_malloc(n[0] * n[1] * n[2] * sizeof(double));
+
+for(int iz=0; iz<n[2]; iz++){
+    for(int iy=0; iy<n[1]; iy++){
+        for(int ix=0; ix<n[0]; ix++){
+            // n[0] is the fastest rotating index
+            const int id = iz*n[1]*n[0] + iy * n[0] + ix;
+
+            data[id] = 1.0 ;
+        }
+    }
+}
 
+flups_free(data);
+```
 
 #### FLUPS in a nutshell
 To use the solver, you first need to create a topology
@@ -112,40 +153,39 @@ int  nproc[3]  = {2, 1, 3};      // 6 procs; 2 x 1 x 3
 bool isComplex = false;          // real data
 
 // no specific alignement => we put a value of 1
-Topology *topo = new Topology(axis, nglob, nproc, isComplex,NULL,1, MPI_COMM_WORLD);
+FLUPS_Topology *topo = flups_topo_new(axis, nglob, nproc, isComplex, NULL, 1, MPI_COMM_WORLD);
 
 // define additional quantities: domain length and grid spacing in each direction
 // (both are 3-component arrays, one entry per dimension)
 double L[3] = {1.0, 2.0, 1.0};
 double h[3] = {L[0] / nglob[0], L[1] / nglob[1], L[2] / nglob[2]};
 ```
 
-Then, you can define a new solver and it's boundary condition
+Then, you can define a new solver and its boundary condition
 ```cpp
 // define the solver
-const BoundaryType mybc[3][2] = {{UNB, UNB}, {EVEN, ODD}, {UNB, EVEN}};  // BC in X,Y,Z
-Solver *      mysolver   = new Solver(topo, mybc, h, L);
+const FLUPS_BoundaryType mybc[3][2] = {{UNB, UNB}, {EVEN, ODD}, {UNB, EVEN}};  // BC in X,Y,Z
+FLUPS_Solver *mysolver = flups_init(topo, mybc, h, L,prof);
 
 // setup the solver
-mysolver->set_GreenType(HEJ2);
-mysolver->setup(false);
+flups_set_greenType(mysolver,typeGreen);
+flups_setup(mysolver,false);
 ```
 
 To solve a field `rhs` that has been defined on the topology, use
 ```cpp
-mysolver->solve(rhs, rhs, SRHS);
+flups_solve(mysolver,rhs, rhs, SRHS);
 ```
 
-Then, destroy the solver
+Then, destroy the solver and the created topology
 ```
-delete (mysolver);
+flups_cleanup(mysolver);
+flups_topo_free(topo);
 ```
 
 #### Advanced usage
-
 Examples of usage of FLUPS in C programs are provided in the `./sample` subfolder.
 
 #### Memory footprint
-
 For the recommended configuration of 128^3 unknowns per processor in the fully unbounded case, we have measured the memory usage of FLUPS on a 2000-core run:
 - the all to all version uses ~530MB (0.253kB/unknown)
 - the non-blocking version uses ~560MB (0.267kB/unknown)
@@ -157,17 +197,17 @@ For 1.5Go, max 168
 21*8 
 7*24-->
 
-**CAUTION**
-FLUPS was nerver tested above 1024^3 unknowns per core.
+:warning: FLUPS was never tested above 1024^3 unknowns per core.
 
 ### Implementation details and developers guide
 #### C++ use
 We use the C++ language in a very limited way, on purpose.
 The features used are the object-oriented layout and some useful features of the standard library.
 
 #### Conventions
-
-- Put a ```BEGIN_FUNC;``` at the begining of each function
+- Put a ```BEGIN_FUNC;``` at the beginning and an ```END_FUNC;``` at the end of each function
+- Use ```FLUPS_INFO``` for verbosity (several levels available), ```FLUPS_CHECK``` for assertions and ```FLUPS_ERROR``` for error management
+- Use the ```flups_malloc``` and ```flups_free``` functions to allocate/free memory
 - how to name an action? ```action_mySuperFunction``` where ```action``` = ```set```, ```get```, ```execute```, ```switch```, ```cmpt```
 - how to name a function? ```mySuperFunction```
 - how to name a class? ```MyClass```
@@ -183,31 +223,6 @@ Set then the value:
 
 Inspired from https://clang.llvm.org/docs/ClangFormatStyleOptions.html (*Configurable Format Style Options* section)
 
-#### Memory layout
-In this project we choose to handle the memory in a **Fortran** way of doing iven if we are in C/C++.
-So, the memory is aligned as a single row of size `n[0] * n[1] * n[2]`.
-The fastest rotating index is set to be `n[0]` then `n[1]` and finally `n[2]`.
-
-We have chosen this way of doing to reuse the 3D code in a 2D framework.
-Indeed having the last dimension in the slower rotating index does not penalize the loops writting.
-
-As an example, we here is how we access the memory
-
-```cpp
-double* data =(double*) flups_malloc(n[0] * n[1] * n[2] * sizeof(double));
-
-for(int iz=0; iz<n[2]; iz++){
-    for(int iy=0; iy<n[1]; iy++){
-        for(int ix=0; ix<n[0]; ix++){
-            // n[0] is the fastest rotating index
-            const int id = iz*n[1]*n[0] + iy * n[0] + ix;
-
-            data[id] = 1.0 ;
-        }
-    }
-}
-```
-
 #### Debugging
 
 FLUPS can be compiled with different levels of verbosity. The following compilation flags are accepted:

diff --git a/kernel/LGF_2d_sym_acc12_32.ker b/kernel/LGF_2d_sym_acc12_32.ker
diff --git a/kernel/LGF_3d_sym_acc12_64.ker b/kernel/LGF_3d_sym_acc12_64.ker
diff --git a/samples/compareP3DFFT++/main_compare++.cpp b/samples/compareP3DFFT++/main_compare++.cpp
@@ -260,24 +260,24 @@ int main(int argc, char *argv[]) {
     // //-------------------------------------------------------------------------
     // /** - allocate rhs and solution */
     // //-------------------------------------------------------------------------
-
-    printf("[FLUPS] topo IN glob : %d %d %d \n",topoIn->nglob(0),topoIn->nglob(1),topoIn->nglob(2));
-    printf("[FLUPS] topo IN loc : %d*%d*%d = %d (check: %d %d %d)\n",topoIn->nmem(0),topoIn->nmem(1),topoIn->nmem(2),topoIn->memsize(),topoIn->nloc(0),topoIn->nloc(1),topoIn->nloc(2));
-    printf("[FLUPS] topo OUT glob : %d %d %d \n",topoSpec->nglob(0),topoSpec->nglob(1),topoSpec->nglob(2));
-    printf("[FLUPS] topo OUT loc  : nmem: %d*%d*%d nf:%d (nloc: %d %d %d)  \n",topoSpec->nmem(0),topoSpec->nmem(1),topoSpec->nmem(2),topoSpec->nf(),topoSpec->nloc(0),topoSpec->nloc(1),topoSpec->nloc(2));
+    if(rank == 0) {
+        printf("[FLUPS] topo IN glob : %d %d %d \n",topoIn->nglob(0),topoIn->nglob(1),topoIn->nglob(2));
+        printf("[FLUPS] topo IN loc : %d*%d*%d = %d (check: %d %d %d)\n",topoIn->nmem(0),topoIn->nmem(1),topoIn->nmem(2),topoIn->memsize(),topoIn->nloc(0),topoIn->nloc(1),topoIn->nloc(2));
+        printf("[FLUPS] topo OUT glob : %d %d %d \n",topoSpec->nglob(0),topoSpec->nglob(1),topoSpec->nglob(2));
+        printf("[FLUPS] topo OUT loc  : nmem: %d*%d*%d nf:%d (nloc: %d %d %d)  \n",topoSpec->nmem(0),topoSpec->nmem(1),topoSpec->nmem(2),topoSpec->nf(),topoSpec->nloc(0),topoSpec->nloc(1),topoSpec->nloc(2));
 
 #ifndef SKIP_P3D
-    printf("[P3DFFT++] topo IN glob  : %d %d %d  \n",gdimsIN[0],gdimsIN[1],gdimsIN[2]);
-    printf("[P3DFFT++] topo IN loc   : %d %d %d (is: %d %d %d) \n",P3DnlocIN[0],P3DnlocIN[1],P3DnlocIN[2],glob_startIN[0],glob_startIN[1],glob_startIN[2]);
-    printf("[P3DFFT++] topo OUT glob : %d %d %d  \n",gdimsOUT[0],gdimsOUT[1],gdimsOUT[2]);
-    printf("[P3DFFT++] topo OUT loc  : %d %d %d (is: %d %d %d) \n",P3DnlocOUT[0],P3DnlocOUT[1],P3DnlocOUT[2],glob_startOUT[0],glob_startOUT[1],glob_startOUT[2]);
+        printf("[P3DFFT++] topo IN glob  : %d %d %d  \n",gdimsIN[0],gdimsIN[1],gdimsIN[2]);
+        printf("[P3DFFT++] topo IN loc   : %d %d %d (is: %d %d %d) \n",P3DnlocIN[0],P3DnlocIN[1],P3DnlocIN[2],glob_startIN[0],glob_startIN[1],glob_startIN[2]);
+        printf("[P3DFFT++] topo OUT glob : %d %d %d  \n",gdimsOUT[0],gdimsOUT[1],gdimsOUT[2]);
+        printf("[P3DFFT++] topo OUT loc  : %d %d %d (is: %d %d %d) \n",P3DnlocOUT[0],P3DnlocOUT[1],P3DnlocOUT[2],glob_startOUT[0],glob_startOUT[1],glob_startOUT[2]);
 #endif
 
-
-    printf("I am going to allocate FLUPS: %d (inside FLUPS: %d)\n",FLUmemsizeIN,FLUmemsizeOUT);
+        printf("I am going to allocate FLUPS: %d (inside FLUPS: %d)\n",FLUmemsizeIN,FLUmemsizeOUT);
 #ifndef SKIP_P3D    
-    printf("                        P3D: %d (out %d C) \n",P3DmemsizeIN,P3DmemsizeOUT);
+        printf("                        P3D: %d (out %d C) \n",P3DmemsizeIN,P3DmemsizeOUT);
 #endif
+    }
 
 
     double *rhsFLU   = (double *)fftw_malloc(sizeof(double) * FLUmemsizeIN);

diff --git a/samples/compareP3DFFT++/run/zenobe_kernel.sh b/samples/compareP3DFFT++/run/zenobe_kernel.sh
@@ -36,12 +36,6 @@ MY_SIZE_Z=$((${MY_SIZE}*${LZ}))
 echo "launching  mpirun -n ${NCPUS} -genv OMP_NUM_THREADS=${MY_NTH} ./${EXE} -np ${MY_NY} ${MY_NZ} -res ${MY_SIZE_X} ${MY_SIZE_Y} ${MY_SIZE_Z} -ni 100 >> stdout_${PBS_JOBID}"
 mpirun -n ${NCPUS} -genv OMP_NUM_THREADS=${MY_NTH} ./${EXE} -np ${MY_NY} ${MY_NZ} -res ${MY_SIZE_X} ${MY_SIZE_Y} ${MY_SIZE_Z} -ni 100 >> stdout_${PBS_JOBID}
 
-echo "============================== FLUPS ONLY, WITHOUT METIS ==========================================" >> stdout_${PBS_JOBID}
-
-echo "launching  mpirun -n ${NCPUS} -genv OMP_NUM_THREADS=${MY_NTH} ./${EXE} -np ${MY_NY} ${MY_NZ} -res ${MY_SIZE_X} ${MY_SIZE_Y} ${MY_SIZE_Z} -ni 100 >> stdout_${PBS_JOBID}"
-mpirun -n ${NCPUS} -genv OMP_NUM_THREADS=${MY_NTH} ./${EXE}_noP3D -np ${MY_NY} ${MY_NZ} -res ${MY_SIZE_X} ${MY_SIZE_Y} ${MY_SIZE_Z} -ni 100 >> stdout_${PBS_JOBID}
-
-
 ################## 
 echo "End time : " $(date)
 echo "----------------- Computation over, bye bye! ----"

diff --git a/samples/compareP3DFFT++/run/zenobe_weakscaling_a2a.sh b/samples/compareP3DFFT++/run/zenobe_weakscaling_a2a.sh
@@ -9,7 +9,7 @@ VER=a2a
 EXE=flups_vs_p3dfft++_${VER}
 
 ######### WEAK -> increase the number of CPU and the size
-SCRATCH=/SCRATCH/acad/examples/dcaprace/flupsVSp3dfft3_weak_$VER
+SCRATCH=/SCRATCH/acad/examples/dcaprace/flupsVSp3dfft3_weak_${VER}_V4
 
 # clean the validation dir
 # rm -rf ${SCRATCH}
@@ -18,7 +18,6 @@ mkdir -p $SCRATCH/data
 mkdir -p $SCRATCH/prof
 # copy the needed info
 cp $HOME_FLUPS/$EXE $SCRATCH
-cp $HOME_FLUPS/${EXE}_noP3D $SCRATCH
 cp $HOME_FLUPS/run/zenobe_kernel.sh $SCRATCH
 
 cd $SCRATCH
@@ -38,18 +37,22 @@ cd $SCRATCH
 # qsub -q large -v EXE=${EXE},MY_NY=32,MY_NZ=32,LX=8,LY=8,LZ=16,MY_SIZE=64,MY_NTH=1, -l select=256:ncpus=4:mem=10500mb:mpiprocs=4:ompthreads=1 ./zenobe_kernel.sh
 
 #####################   size = 128^3/proc  #################################
-# cpu = 64
+## CANNOT DO cpu=96,192,384... due to P3D !
+
+# cpu = 128 (-> actually allocating 144)
 # same on large
-qsub -q large -v EXE=${EXE},MY_NY=8,MY_NZ=16,LX=4,LY=4,LZ=8,MY_SIZE=128,MY_NTH=1, -l select=32:ncpus=4:mem=10500mb:mpiprocs=4:ompthreads=1 ./zenobe_kernel.sh
+qsub -q large -v EXE=${EXE},MY_NY=8,MY_NZ=16,LX=4,LY=4,LZ=8,MY_SIZE=128,MY_NTH=1, -l select=6:ncpus=24:mem=63000mb:mpiprocs=24:ompthreads=1 ./zenobe_kernel.sh
 
-# cpu = 256
-qsub -q large -v EXE=${EXE},MY_NY=16,MY_NZ=16,LX=4,LY=8,LZ=8,MY_SIZE=128,MY_NTH=1, -l select=64:ncpus=4:mem=10500mb:mpiprocs=4:ompthreads=1 ./zenobe_kernel.sh
+# cpu = 256 (->264)
+qsub -q large -v EXE=${EXE},MY_NY=16,MY_NZ=16,LX=4,LY=8,LZ=8,MY_SIZE=128,MY_NTH=1, -l select=11:ncpus=24:mem=63000mb:mpiprocs=24:ompthreads=1 ./zenobe_kernel.sh
 
-# cpu = 512
-qsub -q large -v EXE=${EXE},MY_NY=16,MY_NZ=32,LX=8,LY=8,LZ=8,MY_SIZE=128,MY_NTH=1, -l select=128:ncpus=4:mem=10500mb:mpiprocs=4:ompthreads=1 ./zenobe_kernel.sh
+# cpu = 512 (->528)
+qsub -q large -v EXE=${EXE},MY_NY=16,MY_NZ=32,LX=8,LY=8,LZ=8,MY_SIZE=128,MY_NTH=1, -l select=22:ncpus=24:mem=63000mb:mpiprocs=24:ompthreads=1 ./zenobe_kernel.sh
 
-# cpu = 1024
-qsub -q large -v EXE=${EXE},MY_NY=32,MY_NZ=32,LX=8,LY=8,LZ=16,MY_SIZE=128,MY_NTH=1, -l select=256:ncpus=4:mem=10500mb:mpiprocs=4:ompthreads=1 ./zenobe_kernel.sh
+# cpu = 1024 (->1032)
+qsub -q large -v EXE=${EXE},MY_NY=32,MY_NZ=32,LX=8,LY=8,LZ=16,MY_SIZE=128,MY_NTH=1, -l select=43:ncpus=24:mem=63000mb:mpiprocs=24:ompthreads=1 ./zenobe_kernel.sh
 
+# cpu = 2048 (->2064)
+qsub -q large -v EXE=${EXE},MY_NY=32,MY_NZ=64,LX=8,LY=16,LZ=16,MY_SIZE=128,MY_NTH=1, -l select=86:ncpus=24:mem=63000mb:mpiprocs=24:ompthreads=1 ./zenobe_kernel.sh
 
 #end of file
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,8 @@ prof @@
     .vscode
     doc/html
     doc/latex
+    *.d
+    *.in
     *.x
     *.xlsx
     *.log
@@ Expand Down @@