From 307f6f173c41203e64fc5ff53a892e1dd2dfcac9 Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@umd.edu>
Date: Tue, 20 Jun 2023 17:31:15 -0400
Subject: [PATCH 1/3] Add working OpenACC port of XSBench

---
 .gitignore               |   2 +-
 openacc/GridInit.c       | 177 ++++++++++++++
 openacc/Main.c           | 115 +++++++++
 openacc/Makefile         | 108 +++++++++
 openacc/Materials.c      | 118 ++++++++++
 openacc/Simulation.c     | 402 +++++++++++++++++++++++++++++++
 openacc/XSbench_header.h | 131 +++++++++++
 openacc/XSutils.c        |  48 ++++
 openacc/io.c             | 494 +++++++++++++++++++++++++++++++++++++++
 9 files changed, 1594 insertions(+), 1 deletion(-)
 create mode 100644 openacc/GridInit.c
 create mode 100644 openacc/Main.c
 create mode 100644 openacc/Makefile
 create mode 100644 openacc/Materials.c
 create mode 100644 openacc/Simulation.c
 create mode 100644 openacc/XSbench_header.h
 create mode 100644 openacc/XSutils.c
 create mode 100644 openacc/io.c

diff --git a/.gitignore b/.gitignore
index 08f0e1b3..d6ff91a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
+*~
 *.o
-*/XSBench
diff --git a/openacc/GridInit.c b/openacc/GridInit.c
new file mode 100644
index 00000000..3787dff4
--- /dev/null
+++ b/openacc/GridInit.c
@@ -0,0 +1,177 @@
+#include "XSbench_header.h"
+
+SimulationData grid_init_do_not_profile( Inputs in, int mype )
+{
+	// Structure to hold all allocated simuluation data arrays
+	SimulationData SD;
+
+	// Keep track of how much data we're allocating
+	size_t nbytes = 0;
+
+	// Set the initial seed value
+	uint64_t seed = 42;	
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Nuclide Grids
+	////////////////////////////////////////////////////////////////////
+	
+	if(mype == 0) printf("Intializing nuclide grids...\n");
+
+	// First, we need to initialize our nuclide grid. This comes in the form
+	// of a flattened 2D array that hold all the information we need to define
+	// the cross sections for all isotopes in the simulation. 
+	// The grid is composed of "NuclideGridPoint" structures, which hold the
+	// energy level of the grid point and all associated XS data at that level.
+	// An array of structures (AOS) is used instead of
+	// a structure of arrays, as the grid points themselves are accessed in 
+	// a random order, but all cross section interaction channels and the
+	// energy level are read whenever the gridpoint is accessed, meaning the
+	// AOS is more cache efficient.
+	
+	// Initialize Nuclide Grid
+	SD.length_nuclide_grid = in.n_isotopes * in.n_gridpoints;
+	SD.nuclide_grid     = (NuclideGridPoint *) malloc( SD.length_nuclide_grid * sizeof(NuclideGridPoint));
+	assert(SD.nuclide_grid != NULL);
+	nbytes += SD.length_nuclide_grid * sizeof(NuclideGridPoint);
+	for( int i = 0; i < SD.length_nuclide_grid; i++ )
+	{
+		SD.nuclide_grid[i].energy        = LCG_random_double(&seed);
+		SD.nuclide_grid[i].total_xs      = LCG_random_double(&seed);
+		SD.nuclide_grid[i].elastic_xs    = LCG_random_double(&seed);
+		SD.nuclide_grid[i].absorbtion_xs = LCG_random_double(&seed);
+		SD.nuclide_grid[i].fission_xs    = LCG_random_double(&seed);
+		SD.nuclide_grid[i].nu_fission_xs = LCG_random_double(&seed);
+	}
+
+	// Sort so that each nuclide has data stored in ascending energy order.
+	for( int i = 0; i < in.n_isotopes; i++ )
+		qsort( &SD.nuclide_grid[i*in.n_gridpoints], in.n_gridpoints, sizeof(NuclideGridPoint), NGP_compare);
+	
+	// error debug check
+	/*
+	for( int i = 0; i < in.n_isotopes; i++ )
+	{
+		printf("NUCLIDE %d ==============================\n", i);
+		for( int j = 0; j < in.n_gridpoints; j++ )
+			printf("E%d = %lf\n", j, SD.nuclide_grid[i * in.n_gridpoints + j].energy);
+	}
+	*/
+	
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Acceleration Structure
+	////////////////////////////////////////////////////////////////////
+	
+	if( in.grid_type == NUCLIDE )
+	{
+		SD.length_unionized_energy_array = 0;
+		SD.length_index_grid = 0;
+	}
+	
+	if( in.grid_type == UNIONIZED )
+	{
+		if(mype == 0) printf("Intializing unionized grid...\n");
+
+		// Allocate space to hold the union of all nuclide energy data
+		SD.length_unionized_energy_array = in.n_isotopes * in.n_gridpoints;
+		SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
+		assert(SD.unionized_energy_array != NULL );
+		nbytes += SD.length_unionized_energy_array * sizeof(double);
+
+		// Copy energy data over from the nuclide energy grid
+		for( int i = 0; i < SD.length_unionized_energy_array; i++ )
+			SD.unionized_energy_array[i] = SD.nuclide_grid[i].energy;
+
+		// Sort unionized energy array
+		qsort( SD.unionized_energy_array, SD.length_unionized_energy_array, sizeof(double), double_compare);
+
+		// Allocate space to hold the acceleration grid indices
+		SD.length_index_grid = SD.length_unionized_energy_array * in.n_isotopes;
+		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
+		assert(SD.index_grid != NULL);
+		nbytes += SD.length_index_grid * sizeof(int);
+
+		// Generates the double indexing grid
+		int * idx_low = (int *) calloc( in.n_isotopes, sizeof(int));
+		assert(idx_low != NULL );
+		double * energy_high = (double *) malloc( in.n_isotopes * sizeof(double));
+		assert(energy_high != NULL );
+
+		for( int i = 0; i < in.n_isotopes; i++ )
+			energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + 1].energy;
+
+		for( long e = 0; e < SD.length_unionized_energy_array; e++ )
+		{
+			double unionized_energy = SD.unionized_energy_array[e];
+			for( long i = 0; i < in.n_isotopes; i++ )
+			{
+				if( unionized_energy < energy_high[i]  )
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+				else if( idx_low[i] == in.n_gridpoints - 2 )
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+				else
+				{
+					idx_low[i]++;
+					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
+					energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;	
+				}
+			}
+		}
+
+		free(idx_low);
+		free(energy_high);
+	}
+
+	if( in.grid_type == HASH )
+	{
+		if(mype == 0) printf("Intializing hash grid...\n");
+		SD.length_unionized_energy_array = 0;
+		SD.length_index_grid  = in.hash_bins * in.n_isotopes;
+		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int)); 
+		assert(SD.index_grid != NULL);
+		nbytes += SD.length_index_grid * sizeof(int);
+
+		double du = 1.0 / in.hash_bins;
+
+		// For each energy level in the hash table
+                #pragma omp parallel for
+		for( long e = 0; e < in.hash_bins; e++ )
+		{
+			double energy = e * du;
+
+			// We need to determine the bounding energy levels for all isotopes
+			for( long i = 0; i < in.n_isotopes; i++ )
+			{
+				SD.index_grid[e * in.n_isotopes + i] = grid_search_nuclide( in.n_gridpoints, energy, SD.nuclide_grid + i * in.n_gridpoints, 0, in.n_gridpoints-1);
+			}
+		}
+	}
+
+	////////////////////////////////////////////////////////////////////
+	// Initialize Materials and Concentrations
+	////////////////////////////////////////////////////////////////////
+	if(mype == 0) printf("Intializing material data...\n");
+	
+	// Set the number of nuclides in each material
+	SD.num_nucs  = load_num_nucs(in.n_isotopes);
+	SD.length_num_nucs = 12; // There are always 12 materials in XSBench
+
+	// Intialize the flattened 2D grid of material data. The grid holds
+	// a list of nuclide indices for each of the 12 material types. The
+	// grid is allocated as a full square grid, even though not all
+	// materials have the same number of nuclides.
+	SD.mats = load_mats(SD.num_nucs, in.n_isotopes, &SD.max_num_nucs);
+	SD.length_mats = SD.length_num_nucs * SD.max_num_nucs;
+
+	// Intialize the flattened 2D grid of nuclide concentration data. The grid holds
+	// a list of nuclide concentrations for each of the 12 material types. The
+	// grid is allocated as a full square grid, even though not all
+	// materials have the same number of nuclides.
+	SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
+	SD.length_concs = SD.length_mats;
+
+	if(mype == 0) printf("Intialization complete. Allocated %.0lf MB of data.\n", nbytes/1024.0/1024.0 );
+
+	return SD;
+
+}
diff --git a/openacc/Main.c b/openacc/Main.c
new file mode 100644
index 00000000..1d4d4b15
--- /dev/null
+++ b/openacc/Main.c
@@ -0,0 +1,115 @@
+#include "XSbench_header.h"
+
+#ifdef MPI
+#include<mpi.h>
+#endif
+
+int main( int argc, char* argv[] )
+{
+	// =====================================================================
+	// Initialization & Command Line Read-In
+	// =====================================================================
+	int version = 20;
+	int mype = 0;
+	double omp_start, omp_end;
+	int nprocs = 1;
+	unsigned long long verification;
+
+	#ifdef MPI
+	MPI_Status stat;
+	MPI_Init(&argc, &argv);
+	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+	MPI_Comm_rank(MPI_COMM_WORLD, &mype);
+	#endif
+
+	// Process CLI Fields -- store in "Inputs" structure
+	Inputs in = read_CLI( argc, argv );
+
+	// Set number of OpenMP Threads
+	//omp_set_num_threads(in.nthreads); 
+
+	// Print-out of Input Summary
+	if( mype == 0 )
+		print_inputs( in, nprocs, version );
+
+	// =====================================================================
+	// Prepare Nuclide Energy Grids, Unionized Energy Grid, & Material Data
+	// This is not reflective of a real Monte Carlo simulation workload,
+	// therefore, do not profile this region!
+	// =====================================================================
+	
+	SimulationData SD;
+
+	// If read from file mode is selected, skip initialization and load
+	// all simulation data structures from file instead
+	if( in.binary_mode == READ )
+		SD = binary_read(in);
+	else
+		SD = grid_init_do_not_profile( in, mype );
+
+	// If writing from file mode is selected, write all simulation data
+	// structures to file
+	if( in.binary_mode == WRITE && mype == 0 )
+		binary_write(in, SD);
+
+
+	// =====================================================================
+	// Cross Section (XS) Parallel Lookup Simulation
+	// This is the section that should be profiled, as it reflects a 
+	// realistic continuous energy Monte Carlo macroscopic cross section
+	// lookup kernel.
+	// =====================================================================
+
+	if( mype == 0 )
+	{
+		printf("\n");
+		border_print();
+		center_print("SIMULATION", 79);
+		border_print();
+	}
+
+	// Start Simulation Timer
+	omp_start = omp_get_wtime();
+
+	// Run simulation
+	if( in.simulation_method == EVENT_BASED )
+	{
+		if( in.kernel_id == 0 )
+			verification = run_event_based_simulation(in, SD, mype);
+		else
+		{
+			printf("Error: No kernel ID %d found!\n", in.kernel_id);
+			exit(1);
+		}
+	}
+	else
+	{
+		printf("History-based simulation not implemented in OpenMP offload code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
+		exit(1);
+	}
+
+	if( mype == 0)	
+	{	
+		printf("\n" );
+		printf("Simulation complete.\n" );
+	}
+
+	// End Simulation Timer
+	omp_end = omp_get_wtime();
+
+	// =====================================================================
+	// Output Results & Finalize
+	// =====================================================================
+
+	// Final Hash Step
+	verification = verification % 999983;
+
+	// Print / Save Results and Exit
+	int is_invalid_result = print_results( in, mype, omp_end-omp_start, nprocs, verification );
+
+	#ifdef MPI
+	MPI_Finalize();
+	#endif
+
+	return is_invalid_result;
+}
diff --git a/openacc/Makefile b/openacc/Makefile
new file mode 100644
index 00000000..51a0bbd4
--- /dev/null
+++ b/openacc/Makefile
@@ -0,0 +1,108 @@
+#===============================================================================
+# User Options
+#===============================================================================
+
+COMPILER    = pgi
+OPTIMIZE    = yes
+DEBUG       = yes
+PROFILE     = no
+MPI         = no
+OPENACC     = no
+
+#===============================================================================
+# Program name & source code list
+#===============================================================================
+
+program = XSBench
+
+source = \
+Main.c \
+io.c \
+Simulation.c \
+GridInit.c \
+XSutils.c \
+Materials.c
+
+obj = $(source:.c=.o)
+
+#===============================================================================
+# Sets Flags
+#===============================================================================
+
+# Standard Flags
+CFLAGS := -std=gnu99 -Wall
+
+# Linker Flags
+LDFLAGS = -lm
+
+# PGI Compiler
+ifeq ($(COMPILER),pgi)
+  CC = pgcc
+  CFLAGS = -c99 -Minform=warn -mp -Minfo=mp
+  ifeq ($(OPTIMIZE),yes)
+    CFLAGS += -fast
+  endif
+  ifeq ($(VEC_INFO),yes)
+    CFLAGS += -Minfo=vect
+  endif
+  # OpenACC (assumes NVIDA GPU)
+  ifeq ($(OPENACC),yes)
+    CFLAGS += -acc -Minfo=accel -gpu=cc80
+  endif
+endif
+
+# GNU Compiler
+ifeq ($(COMPILER),gnu)
+  CC = gcc
+  CFLAGS += -std=gnu99 -Wall -fopenmp
+  # Optimization
+  ifeq ($(OPTIMIZE),yes)
+    CFLAGS += -O3
+    # Compiler Vectorization (needs -O3 flag) information
+    ifeq ($(VEC_INFO),yes)
+      CFLAGS += -ftree-vectorizer-verbose=6
+    endif
+  endif
+endif
+
+# Debug Flags
+ifeq ($(DEBUG),yes)
+  CFLAGS += -g
+  LDFLAGS  += -g
+endif
+
+# Profiling Flags
+ifeq ($(PROFILE),yes)
+  CFLAGS += -pg
+  LDFLAGS  += -pg
+endif
+
+# Optimization Flags
+ifeq ($(OPTIMIZE),yes)
+  CFLAGS += -O3
+endif
+
+# MPI
+ifeq ($(MPI),yes)
+  CC = mpicc
+  CFLAGS += -DMPI
+endif
+
+#===============================================================================
+# Targets to Build
+#===============================================================================
+
+$(program): $(obj) XSbench_header.h Makefile
+	$(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
+
+%.o: %.c XSbench_header.h Makefile
+	$(CC) $(CFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(program) $(obj)
+
+edit:
+	vim -p $(source) XSbench_header.h
+
+run:
+	./$(program)
diff --git a/openacc/Materials.c b/openacc/Materials.c
new file mode 100644
index 00000000..69f1c7f7
--- /dev/null
+++ b/openacc/Materials.c
@@ -0,0 +1,118 @@
+// Material data is hard coded into the functions in this file.
+// Note that there are 12 materials present in H-M (large or small)
+
+#include "XSbench_header.h"
+
+// num_nucs represents the number of nuclides that each material contains
+int * load_num_nucs(long n_isotopes)
+{
+	int * num_nucs = (int*)malloc(12*sizeof(int));
+	
+	// Material 0 is a special case (fuel). The H-M small reactor uses
+	// 34 nuclides, while H-M larges uses 300.
+	if( n_isotopes == 68 )
+		num_nucs[0]  = 34; // HM Small is 34, H-M Large is 321
+	else
+		num_nucs[0]  = 321; // HM Small is 34, H-M Large is 321
+
+	num_nucs[1]  = 5;
+	num_nucs[2]  = 4;
+	num_nucs[3]  = 4;
+	num_nucs[4]  = 27;
+	num_nucs[5]  = 21;
+	num_nucs[6]  = 21;
+	num_nucs[7]  = 21;
+	num_nucs[8]  = 21;
+	num_nucs[9]  = 21;
+	num_nucs[10] = 9;
+	num_nucs[11] = 9;
+
+	return num_nucs;
+}
+
+// Assigns an array of nuclide ID's to each material
+int * load_mats( int * num_nucs, long n_isotopes, int * max_num_nucs )
+{
+	*max_num_nucs = 0;
+	int num_mats = 12;
+	for( int m = 0; m < num_mats; m++ )
+	{
+		if( num_nucs[m] > *max_num_nucs )
+			*max_num_nucs = num_nucs[m];
+	}
+	int * mats = (int *) malloc( num_mats * (*max_num_nucs) * sizeof(int) );
+
+	// Small H-M has 34 fuel nuclides
+	int mats0_Sml[] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53, 
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	// Large H-M has 300 fuel nuclides
+	int mats0_Lrg[321] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53,
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	for( int i = 0; i < 321-34; i++ )
+		mats0_Lrg[34+i] = 68 + i; // H-M large adds nuclides to fuel only
+	
+	// These are the non-fuel materials	
+	int mats1[] =  { 63, 64, 65, 66, 67 }; // cladding
+	int mats2[] =  { 24, 41, 4, 5 }; // cold borated water
+	int mats3[] =  { 24, 41, 4, 5 }; // hot borated water
+	int mats4[] =  { 19, 20, 21, 22, 35, 36, 37, 38, 39, 25, 27, 28, 29,
+	                 30, 31, 32, 26, 49, 50, 51, 11, 12, 13, 14, 6, 16,
+	                 17 }; // RPV
+	int mats5[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // lower radial reflector
+	int mats6[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top reflector / plate
+	int mats7[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom plate
+	int mats8[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom nozzle
+	int mats9[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top nozzle
+	int mats10[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // top of FA's
+	int mats11[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // bottom FA's
+	
+	// H-M large v small dependency
+	if( n_isotopes == 68 )
+		memcpy( mats,  mats0_Sml,  num_nucs[0]  * sizeof(int) );	
+	else
+		memcpy( mats,  mats0_Lrg,  num_nucs[0]  * sizeof(int) );
+	
+	// Copy other materials
+	memcpy( mats + *max_num_nucs * 1,  mats1,  num_nucs[1]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 2,  mats2,  num_nucs[2]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 3,  mats3,  num_nucs[3]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 4,  mats4,  num_nucs[4]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 5,  mats5,  num_nucs[5]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 6,  mats6,  num_nucs[6]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 7,  mats7,  num_nucs[7]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 8,  mats8,  num_nucs[8]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 9,  mats9,  num_nucs[9]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 10, mats10, num_nucs[10] * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 11, mats11, num_nucs[11] * sizeof(int) );	
+
+
+	return mats;
+}
+
+// Randomizes the concentrations of all nuclides in a variety of materials
+double * load_concs( int * num_nucs, int max_num_nucs )
+{
+	uint64_t seed = STARTING_SEED * STARTING_SEED;
+	double * concs = (double *) malloc( 12 * max_num_nucs * sizeof( double ) );
+	
+	for( int i = 0; i < 12; i++ )
+		for( int j = 0; j < num_nucs[i]; j++ )
+			concs[i * max_num_nucs + j] = LCG_random_double(&seed);
+
+	// test
+	/*
+	for( int i = 0; i < 12; i++ )
+		for( int j = 0; j < num_nucs[i]; j++ )
+			printf("concs[%d][%d] = %lf\n", i, j, concs[i][j] );
+	*/
+
+	return concs;
+}
+
diff --git a/openacc/Simulation.c b/openacc/Simulation.c
new file mode 100644
index 00000000..2f0c459b
--- /dev/null
+++ b/openacc/Simulation.c
@@ -0,0 +1,402 @@
+#include "XSbench_header.h"
+
+////////////////////////////////////////////////////////////////////////////////////
+// BASELINE FUNCTIONS
+////////////////////////////////////////////////////////////////////////////////////
+// All "baseline" code is at the top of this file. The baseline code is a simple
+// implementation of the algorithm, with only minor CPU optimizations in place.
+// Following these functions are a number of optimized variants,
+// which each deploy a different combination of optimizations strategies. By
+// default, XSBench will only run the baseline implementation. Optimized variants
+// are not yet implemented for this OpenMP targeting offload port.
+////////////////////////////////////////////////////////////////////////////////////
+
+unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype)
+{
+	if( mype == 0)	
+		printf("Beginning event based simulation...\n");
+	
+	////////////////////////////////////////////////////////////////////////////////
+	// SUMMARY: Simulation Data Structure Manifest for "SD" Object
+	// Here we list all heap arrays (and lengths) in SD that would need to be
+	// offloaded manually if using an accelerator with a seperate memory space
+	////////////////////////////////////////////////////////////////////////////////
+	// int * num_nucs;                     // Length = length_num_nucs;
+	// double * concs;                     // Length = length_concs
+	// int * mats;                         // Length = length_mats
+	// double * unionized_energy_array;    // Length = length_unionized_energy_array
+	// int * index_grid;                   // Length = length_index_grid
+	// NuclideGridPoint * nuclide_grid;    // Length = length_nuclide_grid
+	// 
+	// Note: "unionized_energy_array" and "index_grid" can be of zero length
+	//        depending on lookup method.
+	//
+	// Note: "Lengths" are given as the number of objects in the array, not the
+	//       number of bytes.
+	////////////////////////////////////////////////////////////////////////////////
+
+
+	////////////////////////////////////////////////////////////////////////////////
+	// Begin Actual Simulation Loop 
+	////////////////////////////////////////////////////////////////////////////////
+	unsigned long long * verification = (unsigned long long *) malloc(in.lookups * sizeof(unsigned long long));
+
+        #pragma acc parallel loop\
+        copyin( SD)\
+        copyin( SD.max_num_nucs)\
+	copyin( SD.num_nucs[0:SD.length_num_nucs])\
+	copyin( SD.concs[0:SD.length_concs])\
+	copyin( SD.mats[0:SD.length_mats])\
+	copyin( SD.unionized_energy_array[0:SD.length_unionized_energy_array])\
+	copyin( SD.index_grid[0:SD.length_index_grid])\
+	copyin( SD.nuclide_grid[0:SD.length_nuclide_grid])\
+        copyout( verification[0:in.lookups])
+	for( int i = 0; i < in.lookups; i++ )
+	{
+		// Set the initial seed value
+		uint64_t seed = STARTING_SEED;
+
+		// Forward seed to lookup index (we need 2 samples per lookup)
+		seed = fast_forward_LCG(seed, 2*i);
+
+		// Randomly pick an energy and material for the particle
+		double p_energy = LCG_random_double(&seed);
+		int mat         = pick_mat(&seed); 
+
+		// debugging
+		//printf("E = %lf mat = %d\n", p_energy, mat);
+
+		double macro_xs_vector[5] = {0};
+		
+		// Perform macroscopic Cross Section Lookup
+		calculate_macro_xs(
+				p_energy,        // Sampled neutron energy (in lethargy)
+				mat,             // Sampled material type index neutron is in
+				in.n_isotopes,   // Total number of isotopes in simulation
+				in.n_gridpoints, // Number of gridpoints per isotope in simulation
+				SD.num_nucs,     // 1-D array with number of nuclides per material
+				SD.concs,        // Flattened 2-D array with concentration of each nuclide in each material
+				SD.unionized_energy_array, // 1-D Unionized energy array
+				SD.index_grid,   // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
+				SD.nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
+				SD.mats,         // Flattened 2-D array with nuclide indices defining composition of each type of material
+				macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
+				in.grid_type,    // Lookup type (nuclide, hash, or unionized)
+				in.hash_bins,    // Number of hash bins used (if using hash lookup type)
+				SD.max_num_nucs  // Maximum number of nuclides present in any material
+				);
+
+		// For verification, and to prevent the compiler from optimizing
+		// all work out, we interrogate the returned macro_xs_vector array
+		// to find its maximum value index, then increment the verification
+		// value by that index. In this implementation, we prevent thread
+		// contention by using an OMP reduction on the verification value.
+		// For accelerators, a different approach might be required
+		// (e.g., atomics, reduction of thread-specific values in large
+		// array via CUDA thrust, etc).
+		double max = -1.0;
+		int max_idx = 0;
+		for(int j = 0; j < 5; j++ )
+		{
+			if( macro_xs_vector[j] > max )
+			{
+				max = macro_xs_vector[j];
+				max_idx = j;
+			}
+		}
+		verification[i] = max_idx+1;
+	}
+
+  // Reduce validation hash on the host
+  unsigned long long validation_hash = 0;
+	for( int i = 0; i < in.lookups; i++ )
+    validation_hash += verification[i];
+
+	return validation_hash;
+}
+
+// Calculates the microscopic cross section for a given nuclide & energy
+void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
+                           long n_gridpoints,
+                           double *  egrid, int *  index_data,
+                           NuclideGridPoint *  nuclide_grids,
+                           long idx, double *  xs_vector, int grid_type, int hash_bins ){
+	// Variables
+	double f;
+	NuclideGridPoint * low, * high;
+
+	// If using only the nuclide grid, we must perform a binary search
+	// to find the energy location in this particular nuclide's grid.
+	if( grid_type == NUCLIDE )
+	{
+		// Perform binary search on the Nuclide Grid to find the index
+		idx = grid_search_nuclide( n_gridpoints, p_energy, &nuclide_grids[nuc*n_gridpoints], 0, n_gridpoints-1);
+
+		// pull ptr from nuclide grid and check to ensure that
+		// we're not reading off the end of the nuclide's grid
+		if( idx == n_gridpoints - 1 )
+			low = &nuclide_grids[nuc*n_gridpoints + idx - 1];
+		else
+			low = &nuclide_grids[nuc*n_gridpoints + idx];
+	}
+	else if( grid_type == UNIONIZED) // Unionized Energy Grid - we already know the index, no binary search needed.
+	{
+		// pull ptr from energy grid and check to ensure that
+		// we're not reading off the end of the nuclide's grid
+		if( index_data[idx * n_isotopes + nuc] == n_gridpoints - 1 )
+			low = &nuclide_grids[nuc*n_gridpoints + index_data[idx * n_isotopes + nuc] - 1];
+		else
+			low = &nuclide_grids[nuc*n_gridpoints + index_data[idx * n_isotopes + nuc]];
+	}
+	else // Hash grid
+	{
+		// load lower bounding index
+		int u_low = index_data[idx * n_isotopes + nuc];
+
+		// Determine higher bounding index
+		int u_high;
+		if( idx == hash_bins - 1 )
+			u_high = n_gridpoints - 1;
+		else
+			u_high = index_data[(idx+1)*n_isotopes + nuc] + 1;
+
+		// Check edge cases to make sure energy is actually between these
+		// Then, if things look good, search for gridpoint in the nuclide grid
+		// within the lower and higher limits we've calculated.
+		double e_low  = nuclide_grids[nuc*n_gridpoints + u_low].energy;
+		double e_high = nuclide_grids[nuc*n_gridpoints + u_high].energy;
+		int lower;
+		if( p_energy <= e_low )
+			lower = 0;
+		else if( p_energy >= e_high )
+			lower = n_gridpoints - 1;
+		else
+			lower = grid_search_nuclide( n_gridpoints, p_energy, &nuclide_grids[nuc*n_gridpoints], u_low, u_high);
+
+		if( lower == n_gridpoints - 1 )
+			low = &nuclide_grids[nuc*n_gridpoints + lower - 1];
+		else
+			low = &nuclide_grids[nuc*n_gridpoints + lower];
+	}
+	
+	high = low + 1;
+	
+	// calculate the re-useable interpolation factor
+	f = (high->energy - p_energy) / (high->energy - low->energy);
+
+	// Total XS
+	xs_vector[0] = high->total_xs - f * (high->total_xs - low->total_xs);
+	
+	// Elastic XS
+	xs_vector[1] = high->elastic_xs - f * (high->elastic_xs - low->elastic_xs);
+	
+	// Absorbtion XS
+	xs_vector[2] = high->absorbtion_xs - f * (high->absorbtion_xs - low->absorbtion_xs);
+	
+	// Fission XS
+	xs_vector[3] = high->fission_xs - f * (high->fission_xs - low->fission_xs);
+	
+	// Nu Fission XS
+	xs_vector[4] = high->nu_fission_xs - f * (high->nu_fission_xs - low->nu_fission_xs);
+	
+	//test
+	/*
+	if( omp_get_thread_num() == 0 )
+	{
+		printf("Lookup: Energy = %lf, nuc = %d\n", p_energy, nuc);
+		printf("e_h = %lf e_l = %lf\n", high->energy , low->energy);
+		printf("xs_h = %lf xs_l = %lf\n", high->elastic_xs, low->elastic_xs);
+		printf("total_xs = %lf\n\n", xs_vector[1]);
+	}
+	*/
+	
+}
+
+// Calculates macroscopic cross section based on a given material & energy 
+void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
+                         long n_gridpoints, int *  num_nucs,
+                         double *  concs,
+                         double *  egrid, int *  index_data,
+                         NuclideGridPoint *  nuclide_grids,
+                         int *  mats,
+                         double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs ){
+	int p_nuc; // the nuclide we are looking up
+	long idx = -1;	
+	double conc; // the concentration of the nuclide in the material
+
+	// cleans out macro_xs_vector
+	for( int k = 0; k < 5; k++ )
+		macro_xs_vector[k] = 0;
+
+	// If we are using the unionized energy grid (UEG), we only
+	// need to perform 1 binary search per macroscopic lookup.
+	// If we are using the nuclide grid search, it will have to be
+	// done inside of the "calculate_micro_xs" function for each different
+	// nuclide in the material.
+	if( grid_type == UNIONIZED )
+		idx = grid_search( n_isotopes * n_gridpoints, p_energy, egrid);	
+	else if( grid_type == HASH )
+	{
+		double du = 1.0 / hash_bins;
+		idx = p_energy / du;
+	}
+	
+	// Once we find the pointer array on the UEG, we can pull the data
+	// from the respective nuclide grids, as well as the nuclide
+	// concentration data for the material
+	// Each nuclide from the material needs to have its micro-XS array
+	// looked up & interpolatied (via calculate_micro_xs). Then, the
+	// micro XS is multiplied by the concentration of that nuclide
+	// in the material, and added to the total macro XS array.
+	// (Independent -- though if parallelizing, must use atomic operations
+	//  or otherwise control access to the xs_vector and macro_xs_vector to
+	//  avoid simulataneous writing to the same data structure)
+	for( int j = 0; j < num_nucs[mat]; j++ )
+	{
+		double xs_vector[5];
+		p_nuc = mats[mat*max_num_nucs + j];
+		conc = concs[mat*max_num_nucs + j];
+		calculate_micro_xs( p_energy, p_nuc, n_isotopes,
+		                    n_gridpoints, egrid, index_data,
+		                    nuclide_grids, idx, xs_vector, grid_type, hash_bins );
+		for( int k = 0; k < 5; k++ )
+			macro_xs_vector[k] += xs_vector[k] * conc;
+	}
+	
+	//test
+	/*
+	for( int k = 0; k < 5; k++ )
+		printf("Energy: %lf, Material: %d, XSVector[%d]: %lf\n",
+		       p_energy, mat, k, macro_xs_vector[k]);
+			   */
+}
+
+
+// binary search for energy on unionized energy grid
+// returns lower index
+long grid_search( long n, double quarry, double *  A)
+{
+	long lowerLimit = 0;
+	long upperLimit = n-1;
+	long examinationPoint;
+	long length = upperLimit - lowerLimit;
+
+	while( length > 1 )
+	{
+		examinationPoint = lowerLimit + ( length / 2 );
+		
+		if( A[examinationPoint] > quarry )
+			upperLimit = examinationPoint;
+		else
+			lowerLimit = examinationPoint;
+		
+		length = upperLimit - lowerLimit;
+	}
+	
+	return lowerLimit;
+}
+
+// binary search for energy on nuclide energy grid
+long grid_search_nuclide( long n, double quarry, NuclideGridPoint * A, long low, long high)
+{
+	long lowerLimit = low;
+	long upperLimit = high;
+	long examinationPoint;
+	long length = upperLimit - lowerLimit;
+
+	while( length > 1 )
+	{
+		examinationPoint = lowerLimit + ( length / 2 );
+		
+		if( A[examinationPoint].energy > quarry )
+			upperLimit = examinationPoint;
+		else
+			lowerLimit = examinationPoint;
+		
+		length = upperLimit - lowerLimit;
+	}
+	
+	return lowerLimit;
+}
+
+// picks a material based on a probabilistic distribution
+int pick_mat( uint64_t * seed )
+{
+	// I have a nice spreadsheet supporting these numbers. They are
+	// the fractions (by volume) of material in the core. Not a 
+	// *perfect* approximation of where XS lookups are going to occur,
+	// but this will do a good job of biasing the system nonetheless.
+
+	// Also could be argued that doing fractions by weight would be 
+	// a better approximation, but volume does a good enough job for now.
+
+	double dist[12];
+	dist[0]  = 0.140;	// fuel
+	dist[1]  = 0.052;	// cladding
+	dist[2]  = 0.275;	// cold, borated water
+	dist[3]  = 0.134;	// hot, borated water
+	dist[4]  = 0.154;	// RPV
+	dist[5]  = 0.064;	// Lower, radial reflector
+	dist[6]  = 0.066;	// Upper reflector / top plate
+	dist[7]  = 0.055;	// bottom plate
+	dist[8]  = 0.008;	// bottom nozzle
+	dist[9]  = 0.015;	// top nozzle
+	dist[10] = 0.025;	// top of fuel assemblies
+	dist[11] = 0.013;	// bottom of fuel assemblies
+	
+	double roll = LCG_random_double(seed);
+
+	// makes a pick based on the distro
+	for( int i = 0; i < 12; i++ )
+	{
+		double running = 0;
+		for( int j = i; j > 0; j-- )
+			running += dist[j];
+		if( roll < running )
+			return i;
+	}
+
+	return 0;
+}
+
+double LCG_random_double(uint64_t * seed)
+{
+	// LCG parameters
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	const uint64_t a = 2806196910506780709ULL;
+	const uint64_t c = 1ULL;
+	*seed = (a * (*seed) + c) % m;
+	return (double) (*seed) / (double) m;
+	//return ldexp(*seed, -63);
+
+}	
+
+uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
+{
+	// LCG parameters
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	uint64_t a = 2806196910506780709ULL;
+	uint64_t c = 1ULL;
+
+	n = n % m;
+
+	uint64_t a_new = 1;
+	uint64_t c_new = 0;
+
+	while(n > 0) 
+	{
+		if(n & 1)
+		{
+			a_new *= a;
+			c_new = c_new * a + c;
+		}
+		c *= (a + 1);
+		a *= a;
+
+		n >>= 1;
+	}
+
+	return (a_new * seed + c_new) % m;
+
+}
+
diff --git a/openacc/XSbench_header.h b/openacc/XSbench_header.h
new file mode 100644
index 00000000..1140c9c0
--- /dev/null
+++ b/openacc/XSbench_header.h
@@ -0,0 +1,131 @@
+#ifndef __XSBENCH_HEADER_H__
+#define __XSBENCH_HEADER_H__
+
+#include<stdio.h>
+#include<stdlib.h>
+#include<time.h>
+#include<string.h>
+#include<strings.h>
+#include<math.h>
+#include<omp.h>
+#ifdef _OPENACC
+#include<openacc.h>
+#endif
+#include<unistd.h>
+#include<sys/time.h>
+#include<assert.h>
+#include<stdint.h>
+
+// Papi Header
+#ifdef PAPI
+#include "papi.h"
+#endif
+
+// Grid types
+#define UNIONIZED 0
+#define NUCLIDE 1
+#define HASH 2
+
+// Simulation types
+#define HISTORY_BASED 1
+#define EVENT_BASED 2
+
+// Binary Mode Type
+#define NONE 0
+#define READ 1
+#define WRITE 2
+
+// Starting Seed
+#define STARTING_SEED 1070
+
+// Structures
+typedef struct{
+	double energy;
+	double total_xs;
+	double elastic_xs;
+	double absorbtion_xs;
+	double fission_xs;
+	double nu_fission_xs;
+} NuclideGridPoint;
+
+typedef struct{
+	int nthreads;
+	long n_isotopes;
+	long n_gridpoints;
+	int lookups;
+	char * HM;
+	int grid_type; // 0: Unionized Grid (default)    1: Nuclide Grid
+	int hash_bins;
+	int particles;
+	int simulation_method;
+	int binary_mode;
+	int kernel_id;
+} Inputs;
+
+typedef struct{
+	int * num_nucs;                     // Length = length_num_nucs;
+	double * concs;                     // Length = length_concs
+	int * mats;                         // Length = length_mats
+	double * unionized_energy_array;    // Length = length_unionized_energy_array
+	int * index_grid;                   // Length = length_index_grid
+	NuclideGridPoint * nuclide_grid;    // Length = length_nuclide_grid
+	int length_num_nucs;
+	int length_concs;
+	int length_mats;
+	int length_unionized_energy_array;
+	long length_index_grid;
+	int length_nuclide_grid;
+	int max_num_nucs;
+	double * p_energy_samples;
+	int length_p_energy_samples;
+	int * mat_samples;
+	int length_mat_samples;
+} SimulationData;
+
+// io.c
+void logo(int version);
+void center_print(const char *s, int width);
+void border_print(void);
+void fancy_int(long a);
+Inputs read_CLI( int argc, char * argv[] );
+void print_CLI_error(void);
+void print_inputs(Inputs in, int nprocs, int version);
+int print_results( Inputs in, int mype, double runtime, int nprocs, unsigned long long vhash );
+void binary_write( Inputs in, SimulationData SD );
+SimulationData binary_read( Inputs in );
+
+// Simulation.c
+unsigned long long run_event_based_simulation(Inputs in, SimulationData SD, int mype);
+unsigned long long run_history_based_simulation(Inputs in, SimulationData SD, int mype);
+void calculate_micro_xs(   double p_energy, int nuc, long n_isotopes,
+                           long n_gridpoints,
+                           double *  egrid, int *  index_data,
+                           NuclideGridPoint *  nuclide_grids,
+                           long idx, double *  xs_vector, int grid_type, int hash_bins );
+void calculate_macro_xs( double p_energy, int mat, long n_isotopes,
+                         long n_gridpoints, int *  num_nucs,
+                         double *  concs,
+                         double *  egrid, int *  index_data,
+                         NuclideGridPoint *  nuclide_grids,
+                         int *  mats,
+                         double *  macro_xs_vector, int grid_type, int hash_bins, int max_num_nucs );
+long grid_search( long n, double quarry, double *  A);
+long grid_search_nuclide( long n, double quarry, NuclideGridPoint * A, long low, long high);
+int pick_mat( uint64_t * seed );
+double LCG_random_double(uint64_t * seed);
+uint64_t fast_forward_LCG(uint64_t seed, uint64_t n);
+unsigned long long run_event_based_simulation_optimization_1(Inputs in, SimulationData SD, int mype);
+
+// GridInit.c
+SimulationData grid_init_do_not_profile( Inputs in, int mype );
+
+// XSutils.c
+int NGP_compare( const void * a, const void * b );
+int double_compare(const void * a, const void * b);
+size_t estimate_mem_usage( Inputs in );
+
+// Materials.c
+int * load_num_nucs(long n_isotopes);
+int * load_mats( int * num_nucs, long n_isotopes, int * max_num_nucs );
+double * load_concs( int * num_nucs, int max_num_nucs );
+#endif
diff --git a/openacc/XSutils.c b/openacc/XSutils.c
new file mode 100644
index 00000000..d433ecdc
--- /dev/null
+++ b/openacc/XSutils.c
@@ -0,0 +1,48 @@
+#include "XSbench_header.h"
+
+int double_compare(const void * a, const void * b)
+{
+	double A = *((double *) a);
+	double B = *((double *) b);
+
+	if( A > B )
+		return 1;
+	else if( A < B )
+		return -1;
+	else
+		return 0;
+}
+
+int NGP_compare(const void * a, const void * b)
+{
+	NuclideGridPoint A = *((NuclideGridPoint *) a);
+	NuclideGridPoint B = *((NuclideGridPoint *) b);
+
+	if( A.energy > B.energy )
+		return 1;
+	else if( A.energy < B.energy )
+		return -1;
+	else
+		return 0;
+}
+
+
+size_t estimate_mem_usage( Inputs in )
+{
+	size_t single_nuclide_grid = in.n_gridpoints * sizeof( NuclideGridPoint );
+	size_t all_nuclide_grids   = in.n_isotopes * single_nuclide_grid;
+	size_t size_UEG            = in.n_isotopes*in.n_gridpoints*sizeof(double) + in.n_isotopes*in.n_gridpoints*in.n_isotopes*sizeof(int);
+	size_t size_hash_grid      = in.hash_bins * in.n_isotopes * sizeof(int);
+	size_t memtotal;
+
+	if( in.grid_type == UNIONIZED )
+		memtotal          = all_nuclide_grids + size_UEG;
+	else if( in.grid_type == NUCLIDE )
+		memtotal          = all_nuclide_grids;
+	else
+		memtotal          = all_nuclide_grids + size_hash_grid;
+
+	memtotal          = ceil(memtotal / (1024.0*1024.0));
+	return memtotal;
+}
+
diff --git a/openacc/io.c b/openacc/io.c
new file mode 100644
index 00000000..aa584a92
--- /dev/null
+++ b/openacc/io.c
@@ -0,0 +1,494 @@
+#include "XSbench_header.h"
+
+#ifdef MPI
+#include<mpi.h>
+#endif
+
+// Prints program logo
+void logo(int version)
+{
+	border_print();
+	printf(
+	"                   __   __ ___________                 _                        \n"
+	"                   \\ \\ / //  ___| ___ \\               | |                       \n"
+	"                    \\ V / \\ `--.| |_/ / ___ _ __   ___| |__                     \n"
+	"                    /   \\  `--. \\ ___ \\/ _ \\ '_ \\ / __| '_ \\                    \n"
+	"                   / /^\\ \\/\\__/ / |_/ /  __/ | | | (__| | | |                   \n"
+	"                   \\/   \\/\\____/\\____/ \\___|_| |_|\\___|_| |_|                   \n\n"
+    );
+	border_print();
+	center_print("Developed at Argonne National Laboratory", 79);
+	char v[100];
+	sprintf(v, "Version: %d", version);
+	center_print(v, 79);
+	border_print();
+}
+
+// Prints Section titles in center of 80 char terminal
+void center_print(const char *s, int width)
+{
+	int length = strlen(s);
+	int i;
+	for (i=0; i<=(width-length)/2; i++) {
+		fputs(" ", stdout);
+	}
+	fputs(s, stdout);
+	fputs("\n", stdout);
+}
+
+int print_results( Inputs in, int mype, double runtime, int nprocs,
+	unsigned long long vhash )
+{
+	// Calculate Lookups per sec
+	int lookups = 0;
+	if( in.simulation_method == HISTORY_BASED )
+		lookups = in.lookups * in.particles;
+	else if( in.simulation_method == EVENT_BASED )
+		lookups = in.lookups;
+	int lookups_per_sec = (int) ((double) lookups / runtime);
+	
+	// If running in MPI, reduce timing statistics and calculate average
+	#ifdef MPI
+	int total_lookups = 0;
+	MPI_Barrier(MPI_COMM_WORLD);
+	MPI_Reduce(&lookups_per_sec, &total_lookups, 1, MPI_INT,
+	           MPI_SUM, 0, MPI_COMM_WORLD);
+	#endif
+
+	int is_invalid_result = 1;
+	
+	// Print output
+	if( mype == 0 )
+	{
+		border_print();
+		center_print("RESULTS", 79);
+		border_print();
+
+		// Print the results
+    printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
+		#ifdef MPI
+		printf("MPI ranks:   %d\n", nprocs);
+		#endif
+		#ifdef MPI
+		printf("Total Lookups/s:            ");
+		fancy_int(total_lookups);
+		printf("Avg Lookups/s per MPI rank: ");
+		fancy_int(total_lookups / nprocs);
+		#else
+		printf("Runtime:     %.3lf seconds\n", runtime);
+		printf("Lookups:     "); fancy_int(lookups);
+		printf("Lookups/s:   ");
+		fancy_int(lookups_per_sec);
+		#endif
+	}
+
+	unsigned long long large = 0;
+	unsigned long long small = 0; 
+	if( in.simulation_method == EVENT_BASED )
+	{
+		small = 945990;
+		large = 952131;
+	}
+	else if( in.simulation_method == HISTORY_BASED )
+	{
+		small = 941535;
+		large = 954318; 
+	}
+	if( strcmp(in.HM, "large") == 0 )
+	{
+		if( vhash == large )
+			is_invalid_result = 0;
+	}
+	else if( strcmp(in.HM, "small") == 0 )
+	{
+		if( vhash == small )
+			is_invalid_result = 0;
+	}
+
+	if(mype == 0 )
+	{
+		if( is_invalid_result )
+			printf("Verification checksum: %llu (WARNING - INAVALID CHECKSUM!)\n", vhash);
+		else
+			printf("Verification checksum: %llu (Valid)\n", vhash);
+		border_print();
+	}
+
+	return is_invalid_result;
+}
+
+void print_inputs(Inputs in, int nprocs, int version )
+{
+	// Calculate Estimate of Memory Usage
+	int mem_tot = estimate_mem_usage( in );
+	logo(version);
+	center_print("INPUT SUMMARY", 79);
+	border_print();
+	printf("Programming Model:            OpenACC\n");
+	if( in.simulation_method == EVENT_BASED )
+		printf("Simulation Method:            Event Based\n");
+	else
+		printf("Simulation Method:            History Based\n");
+	if( in.grid_type == NUCLIDE )
+		printf("Grid Type:                    Nuclide Grid\n");
+	else if( in.grid_type == UNIONIZED )
+		printf("Grid Type:                    Unionized Grid\n");
+	else
+		printf("Grid Type:                    Hash\n");
+
+	printf("Materials:                    %d\n", 12);
+	printf("H-M Benchmark Size:           %s\n", in.HM);
+	printf("Total Nuclides:               %ld\n", in.n_isotopes);
+	printf("Gridpoints (per Nuclide):     ");
+	fancy_int(in.n_gridpoints);
+	if( in.grid_type == HASH )
+	{
+		printf("Hash Bins:                    ");
+		fancy_int(in.hash_bins);
+	}
+	if( in.grid_type == UNIONIZED )
+	{
+		printf("Unionized Energy Gridpoints:  ");
+		fancy_int(in.n_isotopes*in.n_gridpoints);
+	}
+	if( in.simulation_method == HISTORY_BASED )
+	{
+		printf("Particle Histories:           "); fancy_int(in.particles);
+		printf("XS Lookups per Particle:      "); fancy_int(in.lookups);
+	}
+	printf("Total XS Lookups:             "); fancy_int(in.lookups);
+	#ifdef MPI
+	printf("MPI Ranks:                    %d\n", nprocs);
+	printf("Mem Usage per MPI Rank (MB):  "); fancy_int(mem_tot);
+	#else
+	printf("Est. Memory Usage (MB):       "); fancy_int(mem_tot);
+	#endif
+	printf("Binary File Mode:             ");
+	if( in.binary_mode == NONE )
+		printf("Off\n");
+	else if( in.binary_mode == READ)
+		printf("Read\n");
+	else
+		printf("Write\n");
+	border_print();
+	center_print("INITIALIZATION - DO NOT PROFILE", 79);
+	border_print();
+}
+
+void border_print(void)
+{
+	printf(
+	"==================================================================="
+	"=============\n");
+}
+
+// Prints comma separated integers - for ease of reading
+void fancy_int( long a )
+{
+    if( a < 1000 )
+        printf("%ld\n",a);
+
+    else if( a >= 1000 && a < 1000000 )
+        printf("%ld,%03ld\n", a / 1000, a % 1000);
+
+    else if( a >= 1000000 && a < 1000000000 )
+        printf("%ld,%03ld,%03ld\n",a / 1000000,(a % 1000000) / 1000,a % 1000 );
+
+    else if( a >= 1000000000 )
+        printf("%ld,%03ld,%03ld,%03ld\n",
+               a / 1000000000,
+               (a % 1000000000) / 1000000,
+               (a % 1000000) / 1000,
+               a % 1000 );
+    else
+        printf("%ld\n",a);
+}
+
+void print_CLI_error(void)
+{
+	printf("Usage: ./XSBench <options>\n");
+	printf("Options include:\n");
+	printf("  -m <simulation method>   Simulation method (history, event)\n");
+	printf("  -s <size>                Size of H-M Benchmark to run (small, large, XL, XXL)\n");
+	printf("  -g <gridpoints>          Number of gridpoints per nuclide (overrides -s defaults)\n");
+	printf("  -G <grid type>           Grid search type (unionized, nuclide, hash). Defaults to unionized.\n");
+	printf("  -p <particles>           Number of particle histories\n");
+	printf("  -l <lookups>             History Based: Number of Cross-section (XS) lookups per particle. Event Based: Total number of XS lookups.\n");
+	printf("  -h <hash bins>           Number of hash bins (only relevant when used with \"-G hash\")\n");
+	printf("  -b <binary mode>         Read or write all data structures to file. If reading, this will skip initialization phase. (read, write)\n");
+	printf("  -k <kernel ID>           Specifies which kernel to run. 0 is baseline, 1, 2, etc are optimized variants. (0 is default.)\n");
+	printf("Default is equivalent to: -m history -s large -l 34 -p 500000 -G unionized\n");
+	printf("See readme for full description of default run values\n");
+	exit(4);
+}
+
+Inputs read_CLI( int argc, char * argv[] )
+{
+	Inputs input;
+
+	// defaults to the history based simulation method
+	input.simulation_method = HISTORY_BASED;
+	
+	// defaults to max threads on the system	
+	input.nthreads = omp_get_num_procs();
+	
+	// defaults to 355 (corresponding to H-M Large benchmark)
+	input.n_isotopes = 355;
+	
+	// defaults to 11303 (corresponding to H-M Large benchmark)
+	input.n_gridpoints = 11303;
+
+	// defaults to 500,000
+	input.particles = 500000;
+	
+	// defaults to 34
+	input.lookups = 34;
+	
+	// default to unionized grid
+	input.grid_type = UNIONIZED;
+
+	// default to unionized grid
+	input.hash_bins = 10000;
+
+	// default to no binary read/write
+	input.binary_mode = NONE;
+	
+	// defaults to baseline kernel
+	input.kernel_id = 0;
+	
+	// defaults to H-M Large benchmark
+	input.HM = (char *) malloc( 6 * sizeof(char) );
+	input.HM[0] = 'l' ; 
+	input.HM[1] = 'a' ; 
+	input.HM[2] = 'r' ; 
+	input.HM[3] = 'g' ; 
+	input.HM[4] = 'e' ; 
+	input.HM[5] = '\0';
+	
+	// Check if user sets these
+	int user_g = 0;
+
+	int default_lookups = 1;
+	int default_particles = 1;
+	
+	// Collect Raw Input
+	for( int i = 1; i < argc; i++ )
+	{
+		char * arg = argv[i];
+
+		// n_gridpoints (-g)
+		if( strcmp(arg, "-g") == 0 )
+		{	
+			if( ++i < argc )
+			{
+				user_g = 1;
+				input.n_gridpoints = atol(argv[i]);
+			}
+			else
+				print_CLI_error();
+		}
+		// Simulation Method (-m)
+		else if( strcmp(arg, "-m") == 0 )
+		{
+			char * sim_type;
+			if( ++i < argc )
+				sim_type = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(sim_type, "history") == 0 )
+				input.simulation_method = HISTORY_BASED;
+			else if( strcmp(sim_type, "event") == 0 )
+			{
+				input.simulation_method = EVENT_BASED;
+				// Also resets default # of lookups
+				if( default_lookups && default_particles )
+				{
+					input.lookups =  input.lookups * input.particles;
+					input.particles = 0;
+				}
+			}
+			else
+				print_CLI_error();
+		}
+		// lookups (-l)
+		else if( strcmp(arg, "-l") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.lookups = atoi(argv[i]);
+				default_lookups = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// hash bins (-h)
+		else if( strcmp(arg, "-h") == 0 )
+		{
+			if( ++i < argc )
+				input.hash_bins = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// particles (-p)
+		else if( strcmp(arg, "-p") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.particles = atoi(argv[i]);
+				default_particles = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// HM (-s)
+		else if( strcmp(arg, "-s") == 0 )
+		{	
+			if( ++i < argc )
+				input.HM = argv[i];
+			else
+				print_CLI_error();
+		}
+		// grid type (-G)
+		else if( strcmp(arg, "-G") == 0 )
+		{
+			char * grid_type;
+			if( ++i < argc )
+				grid_type = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(grid_type, "unionized") == 0 )
+				input.grid_type = UNIONIZED;
+			else if( strcmp(grid_type, "nuclide") == 0 )
+				input.grid_type = NUCLIDE;
+			else if( strcmp(grid_type, "hash") == 0 )
+				input.grid_type = HASH;
+			else
+				print_CLI_error();
+		}
+		// binary mode (-b)
+		else if( strcmp(arg, "-b") == 0 )
+		{
+			char * binary_mode;
+			if( ++i < argc )
+				binary_mode = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(binary_mode, "read") == 0 )
+				input.binary_mode = READ;
+			else if( strcmp(binary_mode, "write") == 0 )
+				input.binary_mode = WRITE;
+			else
+				print_CLI_error();
+		}
+		// kernel optimization selection (-k)
+		else if( strcmp(arg, "-k") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.kernel_id = atoi(argv[i]);
+			}
+			else
+				print_CLI_error();
+		}
+		else
+			print_CLI_error();
+	}
+
+	// Validate Input
+	
+	// Validate nthreads
+	if( input.nthreads < 1 )
+		print_CLI_error();
+	
+	// Validate n_isotopes
+	if( input.n_isotopes < 1 )
+		print_CLI_error();
+	
+	// Validate n_gridpoints
+	if( input.n_gridpoints < 1 )
+		print_CLI_error();
+
+	// Validate lookups
+	if( input.lookups < 1 )
+		print_CLI_error();
+
+	// Validate Hash Bins 
+	if( input.hash_bins < 1 )
+		print_CLI_error();
+	
+	// Validate HM size
+	if( strcasecmp(input.HM, "small") != 0 &&
+		strcasecmp(input.HM, "large") != 0 &&
+		strcasecmp(input.HM, "XL") != 0 &&
+		strcasecmp(input.HM, "XXL") != 0 )
+		print_CLI_error();
+	
+	// Set HM size specific parameters
+	// (defaults to large)
+	if( strcasecmp(input.HM, "small") == 0 )
+		input.n_isotopes = 68;
+	else if( strcasecmp(input.HM, "XL") == 0 && user_g == 0 )
+		input.n_gridpoints = 238847; // sized to make 120 GB XS data
+	else if( strcasecmp(input.HM, "XXL") == 0 && user_g == 0 )
+		input.n_gridpoints = 238847 * 2.1; // 252 GB XS data
+
+	// Return input struct
+	return input;
+}
+
+void binary_write( Inputs in, SimulationData SD )
+{
+	char * fname = "XS_data.dat";
+	printf("Writing all data structures to binary file %s...\n", fname);
+	FILE * fp = fopen(fname, "w");
+
+	// Write SimulationData Object. Include pointers, even though we won't be using them.
+	fwrite(&SD, sizeof(SimulationData), 1, fp);
+
+	// Write heap arrays in SimulationData Object
+	fwrite(SD.num_nucs,       sizeof(int), SD.length_num_nucs, fp);
+	fwrite(SD.concs,          sizeof(double), SD.length_concs, fp);
+	fwrite(SD.mats,           sizeof(int), SD.length_mats, fp);
+	fwrite(SD.nuclide_grid,   sizeof(NuclideGridPoint), SD.length_nuclide_grid, fp); 
+	fwrite(SD.index_grid, sizeof(int), SD.length_index_grid, fp);
+	fwrite(SD.unionized_energy_array, sizeof(double), SD.length_unionized_energy_array, fp);
+
+	fclose(fp);
+}
+
+SimulationData binary_read( Inputs in )
+{
+	SimulationData SD;
+	
+	char * fname = "XS_data.dat";
+	printf("Reading all data structures from binary file %s...\n", fname);
+
+	FILE * fp = fopen(fname, "r");
+	assert(fp != NULL);
+
+	// Read SimulationData Object. Include pointers, even though we won't be using them.
+	fread(&SD, sizeof(SimulationData), 1, fp);
+
+	// Allocate space for arrays on heap
+	SD.num_nucs = (int *) malloc(SD.length_num_nucs * sizeof(int));
+	SD.concs = (double *) malloc(SD.length_concs * sizeof(double));
+	SD.mats = (int *) malloc(SD.length_mats * sizeof(int));
+	SD.nuclide_grid = (NuclideGridPoint *) malloc(SD.length_nuclide_grid * sizeof(NuclideGridPoint));
+	SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
+	SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
+
+	// Read heap arrays into SimulationData Object
+	fread(SD.num_nucs,       sizeof(int), SD.length_num_nucs, fp);
+	fread(SD.concs,          sizeof(double), SD.length_concs, fp);
+	fread(SD.mats,           sizeof(int), SD.length_mats, fp);
+	fread(SD.nuclide_grid,   sizeof(NuclideGridPoint), SD.length_nuclide_grid, fp); 
+	fread(SD.index_grid, sizeof(int), SD.length_index_grid, fp);
+	fread(SD.unionized_energy_array, sizeof(double), SD.length_unionized_energy_array, fp);
+
+	fclose(fp);
+
+	return SD;
+}

From a414eeac386ce474d297404342ca221b536b185c Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@udel.edu>
Date: Thu, 22 Jun 2023 16:59:24 -0400
Subject: [PATCH 2/3] Enable OACC by default

---
 openacc/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openacc/Makefile b/openacc/Makefile
index 51a0bbd4..35690285 100644
--- a/openacc/Makefile
+++ b/openacc/Makefile
@@ -7,7 +7,7 @@ OPTIMIZE    = yes
 DEBUG       = yes
 PROFILE     = no
 MPI         = no
-OPENACC     = no
+OPENACC     = yes
 
 #===============================================================================
 # Program name & source code list
@@ -47,7 +47,7 @@ ifeq ($(COMPILER),pgi)
   endif
   # OpenACC (assumes NVIDA GPU)
   ifeq ($(OPENACC),yes)
-    CFLAGS += -acc -Minfo=accel -gpu=cc80
+    CFLAGS += -acc -Minfo=accel -gpu=cc70
   endif
 endif
 

From 17be517c65b63c1f5cba5750daafa4fdeeafc58d Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Tue, 17 Oct 2023 14:38:30 -0400
Subject: [PATCH 3/3] feat: update README to include openacc

---
 README.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index da71ae6a..25c1fedd 100644
--- a/README.md
+++ b/README.md
@@ -44,12 +44,15 @@ This version of XSBench is written in CUDA for use with NVIDIA GPU architectures
 4. **XSBench/opencl**
 This version of XSBench is written in OpenCL, and can be used for CPU, GPU, FPGA, or other architectures that support OpenCL. It was written with GPUs in mind, so if running on other architectures you may need to heavily re-optimize the code. You will also likely need to edit the makefile to supply the path to your OpenCL compiler.
 
-4. **XSBench/sycl**
+5. **XSBench/sycl**
 This version of XSBench is written in SYCL, and can be used for CPU, GPU, FPGA, or other architectures that support OpenCL and SYCL. It was written with GPUs in mind, so if running on other architectures you may need to heavily re-optimize the code. You will also likely need to edit the makefile to supply the path to your SYCL compiler.
 
-5. **XSBench/hip**
+6. **XSBench/hip**
 This version of XSBench is written in HIP for use with GPU architectures. This version is derived from CUDA using an automatic conversion tool with only a few small manual changes.
 
+7. XSBench/openacc
+This verison of XSBench is written in OpenACC for use with GPU architectures. This version is derived from the OpenMP 4.5 implementation. You will likely need to modify the Makefile to supply the appropriate compiler and compiler flags.
+
 ## Compilation
 
 To compile XSBench with default settings, navigate to your selected source directory and use the following command: