diff --git a/Development/stream.c.5.10 b/Development/stream.c.5.10
new file mode 100644
index 0000000..b9fd19e
--- /dev/null
+++ b/Development/stream.c.5.10
@@ -0,0 +1,496 @@
+/*-----------------------------------------------------------------------*/
+/* Program: Stream                                                       */
+/* Revision: $Id: stream.c,v 5.10 2009/01/28 13:22:09 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2005: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*         "tuned STREAM benchmark results"                              */
+/*         "based on a variant of the STREAM benchmark code"             */
+/*         Other comparable, clear and reasonable labelling is           */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <stdlib.h>
+# include <getopt.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <math.h>
+# include <sys/time.h>
+# include <sys/ipc.h>
+# include <sys/shm.h>
+
+/* INSTRUCTIONS:
+ *
+ *	1) Stream requires a good bit of memory to run.  Adjust the
+ *          value of 'N' (below) to give a 'timing calibration' of 
+ *          at least 20 clock-ticks.  This will provide rate estimates
+ *          that should be good to about 5% precision.
+ */
+
+/*
+ *	3) Compile the code with full optimization.  Many compilers
+ *	   generate unreasonably bad code before the optimizer tightens
+ *	   things up.  If the results are unreasonably good, on the
+ *	   other hand, the optimizer might be too smart for me!
+ *
+ *         Try compiling with:
+ *               cc -O stream_omp.c -o stream_omp
+ *
+ *         This is known to work on Cray, SGI, IBM, and Sun machines.
+ *
+ *
+ *	4) Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include:
+ *		a) computer hardware model number and software revision
+ *		b) the compiler flags
+ *		c) all of the output from the test case.
+ * Thanks!
+ *
+ */
+
+#include "stream.h"
+
+#define MAXNTIMES 100
+#define MAXSEGS 8
+
+/* main -- parse the command line, allocate and initialize arrays a, b, c,
+ * estimate timer granularity, run the timing loop included from tuned.inc or
+ * standard.inc, then print the per-kernel summary and validate the arrays.
+ * Exits 1 on bad arguments or failed malloc, 2/3 on shmget/shmat failure. */
+int
+main(int argc, char **argv)
+    {
+    long		N, OFFSET, SIZE, idx;	/* idx is long: N may exceed INT_MAX */
+    int			quantum;
+    int			largepage,shmflag,shmid[MAXSEGS]; 
+    int			BytesPerWord;
+    register int	j, k;
+    double		scalar, t, times[4][MAXNTIMES];
+    double		*a, *b, *c;
+    double		avgtime[4] = {0};
+    double		maxtime[4] = {0};
+    double		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+    static char		*label[4] = {"Copy:      ", "Scale:     ",
+				     "Add:       ", "Triad:     "};
+    double		bytes[4] = {2 * sizeof(double), 2 * sizeof(double),
+				    3 * sizeof(double), 3 * sizeof(double)};
+
+    /* --- default options --- */
+    N = 2000000;
+    OFFSET = 0;
+    NTIMES = 10;
+    largepage = 0;
+
+    /* --- NEW --- parse command line arguments using getopt --- */
+    while (1) {
+        int option_index = 0;
+        static struct option long_options[] = {
+            {"largepage", 0, 0, 'l'},	/* takes no argument, same as "-l" */
+            {"length", 1, 0, 'n'},
+            {"offset", 1, 0, 'o'},
+            {"repetitions", 1, 0, 'r'},
+            {"tuned", 1, 0, 't'},
+            {"help", 0, 0, 'h'},
+            {0, 0, 0, 0}
+        };
+
+        j = getopt_long (argc, argv, "ln:o:r:t:h",
+                 long_options, &option_index);
+        if (j == -1)	/* finished parsing all command-line options */
+            break;
+
+        switch (j) {
+	    case 0:			/* this should not happen */
+		printf ("option %s", long_options[option_index].name);
+		if (optarg)
+		    printf (" with arg %s", optarg);
+		printf ("\n");
+		break;
+
+	    case 'l':		/* requesting data allocation on large pages */
+		printf ("User requested data allocation on large pages\n");
+		largepage=1;
+		break;
+
+	    case 'n':		/* define vector length in 10^6 elements */
+		printf ("User requested Array Size of %s * 10^6 elements\n", optarg);
+		N = atoi(optarg);
+		if ( N >= 2147 ) {
+		    printf("Warning: Array Size exceeds 2GB - watch for anomalies\n");
+		}
+		N = N * 1000 * 1000;
+		break;
+
+	    case 'o':		/* define offset in elements */
+		printf ("User requested Array Offset of %s elements\n", optarg);
+		OFFSET = atoi(optarg);
+		break;
+
+	    case 'r':		/* specify number of repetitions */
+		printf ("option r with value '%s'\n", optarg);
+		NTIMES = atoi(optarg);
+		if (NTIMES > MAXNTIMES) {
+		    NTIMES = MAXNTIMES;
+		    printf("Note: requested repetitions exceeds maximum allowed\n");
+		    printf("      repeat count reset to %d\n",MAXNTIMES);
+		}
+		if (NTIMES < 2) {	/* 1st pass is discarded, so need >= 2 */
+		    NTIMES = 2;
+		    printf("Note: repeat count raised to the minimum of 2\n");
+		}
+		break;
+
+	    case 't':		/* selection tuned version of code -- not currently used */
+		printf ("option t with value '%s'\n", optarg);
+		break;
+
+	    case 'h':
+		printf ("Usage: %s [options]\n",argv[0]);
+		printf ("Options:\n");
+		printf ("   [-l]                  <-- request data put on large pages (in a shared segment)\n");
+		printf ("   [-n, --length] n      <-- n is array length in 10^6 elements\n");
+		printf ("   [-o, --offset] n      <-- n is offset/padding in elements\n");
+		printf ("   [-r, --repetitions] n <-- n is number of repetitions for timing (1st is not counted)\n");
+		printf ("   [-t, --tuned] nnn     <-- nnn is version number of tuned kernels to execute\n");
+		printf ("   [-h, --help]          <-- gives this help message and exits\n");
+		exit(0);
+
+	    default:
+		printf ("?? getopt returned character code 0%o ??\n", j);
+        }
+    }
+
+    if (optind < argc) {
+        printf ("non-option ARGV-elements: ");
+        while (optind < argc)
+            printf ("%s ", argv[optind++]);
+        printf ("\n");
+    }
+
+    if (N <= 0 || OFFSET < 0) {	/* atoi() yields 0 for non-numeric input */
+        printf ("Error: array length must be positive and offset non-negative\n");
+        exit(1);
+    }
+
+    /* --- SETUP --- determine precision and check timing --- */
+
+    printf(HLINE);
+    printf("STREAM version $Revision: 5.10 $\n");
+    printf(HLINE);
+    BytesPerWord = sizeof(double);
+    printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord);
+
+    printf(HLINE);
+    printf("Array size = %ld, Offset = %ld\n" , N, OFFSET);
+    printf("Total memory required = %.1f MB.\n",
+	(3.0 * BytesPerWord) * ( (double) (N+OFFSET) / 1048576.0));
+    printf(HLINE);
+    printf("Allocating three arrays....\n");
+
+    if (largepage) {
+	SIZE = (3*N + 3*OFFSET) * sizeof(double);
+	printf("Data SIZE needed %ld (Bytes)\n",SIZE);
+	SIZE = ceil((double)SIZE/(2048.*1024.)) * 2048*1024;
+	printf("Data SIZE requested %ld (Bytes)\n",SIZE);
+	printf("attempting to create (shmget) a shared segment of size %ld\n",SIZE);
+	/* 0600: without owner rw permission a non-root creator cannot attach */
+	shmid[0] = shmget(IPC_PRIVATE,SIZE,IPC_CREAT|SHM_HUGETLB|0600);
+	if (shmid[0] == -1) {
+		perror("ERROR: failed shmget:");
+		printf("(usually caused by non-root user trying to get large pages)\n");
+		exit(2);
+	}
+	printf("shmget returned a shmid of %d\n",shmid[0]);
+	a = shmat(shmid[0],0,SHM_RND);
+	printf("shmat returned a pointer to %p\n",(void *)a);
+	if (a == (double *)(-1)) {
+		perror("ERROR: failed shmat:");
+		printf("Deleting shared segment\n");
+		shmctl(shmid[0],IPC_RMID,NULL);
+		exit(3);
+	}
+	/* Mark for removal now; the segment is destroyed after the last detach. */
+	shmctl(shmid[0],IPC_RMID,NULL);
+	b = a + N + OFFSET;
+	c = a + 2 * (N + OFFSET);
+    }
+    else{
+	a = malloc( (N+0*OFFSET) * sizeof(double));
+	b = malloc( (N+1*OFFSET) * sizeof(double));
+	c = malloc( (N+2*OFFSET) * sizeof(double));
+	if ( (a==0) || (b==0) || (c==0) ) {
+	    printf("Error: one or more mallocs failed!\n");
+	    printf(" a = %p\n b = %p\n c = %p\n", (void *)a, (void *)b, (void *)c);
+	    exit(1);	/* continuing would dereference a NULL array */
+	}
+	/* Move the starting points of b and c to implement the OFFSET */
+	b += OFFSET;
+	c += 2*OFFSET;
+    }
+    printf("Array Starting Locations: \n");
+    printf(" a = %p\n b = %p\n c = %p\n", (void *)a, (void *)b, (void *)c);
+
+    printf(HLINE);
+    printf("Each test is run %d times, but only\n", NTIMES);
+    printf("the *best* time for each is used.\n");
+
+#ifdef _OPENMP
+    printf(HLINE);
+    printf("OpenMP conditional compilation is active\n");
+#pragma omp parallel 
+    {
+#pragma omp master
+	{
+	    k = omp_get_num_threads();
+	    printf ("Number of Threads requested = %i\n",k);
+        }
+    }
+#endif
+
+    printf(HLINE);
+#pragma omp parallel
+    printf ("Printing one line per active thread....\n");
+
+    /* Initialize the three arrays to their known starting values. */
+#pragma omp parallel for
+    for (idx=0; idx<N; idx++) {
+	a[idx] = 1.0;
+	b[idx] = 2.0;
+	c[idx] = 0.0;
+	}
+
+    printf(HLINE);
+
+    if  ( (quantum = checktick()) >= 1) 
+	printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be less than one microsecond.\n");
+	quantum = 1;
+    }
+
+    t = mysecond();
+#pragma omp parallel for
+    for (idx = 0; idx < N; idx++)
+	a[idx] = 2.0E0 * a[idx];
+    t = 1.0E6 * (mysecond() - t);
+
+    printf("Each test below will take on the order of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+#ifdef TUNED
+#include "tuned.inc"
+#else
+#include "standard.inc"
+#endif
+
+    /*	--- SUMMARY --- */
+
+    for (k=1; k<NTIMES; k++) {	/* note -- skip first iteration */
+	for (j=0; j<4; j++) {
+	    avgtime[j] = avgtime[j] + times[j][k];
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	}
+    }
+    
+    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+	avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
+	       1.0E-06 * (double) N * bytes[j]/mintime[j],
+	       avgtime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    printf(HLINE);
+
+    /* --- Check Results --- */
+    checkSTREAMresults(N, a, b, c);
+    printf(HLINE);
+
+    return 0;
+}
+
+
+/* =============== Utility Routines ================= */
+
+/* ---------------------------------------------------------------- 
+   checktick tries to determine the granularity of the system timer
+   Thanks to John Henning for the original code 
+------------------------------------------------------------------- */
+
+# define	M	20
+int
+checktick()
+    {
+    double	stamp[M], start, now;	/* clock readings, in seconds */
+    int		n, gap, smallest;
+
+/*  Gather M clock readings, each at least 1 usec later than its loop start. */
+    for (n = 0; n < M; n++) {
+	start = mysecond();
+	do now = mysecond();
+	while ( (now - start) < 1.0E-6 );
+	stamp[n] = now;
+	}
+
+/*
+ * The smallest gap between consecutive readings, truncated to whole
+ * microseconds with negative gaps clamped to zero, is returned as the
+ * estimate of the clock granularity.
+ */
+    smallest = 1000000;
+    for (n = 1; n < M; n++) {
+	gap = (int)( 1.0E6 * (stamp[n] - stamp[n-1]) );
+	smallest = MIN(smallest, MAX(gap,0));
+	}
+
+   return(smallest);
+    }
+
+
+/* ----------------------------------------------- 
+   A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems. 
+-------------------------------------------------- */
+
+#include <sys/time.h>
+
+/* Return the gettimeofday() wall-clock reading in seconds as a double. */
+double mysecond()
+{
+        struct timeval tp;
+
+        /* The timezone argument is obsolete; POSIX expects NULL here. */
+        (void) gettimeofday(&tp, NULL);
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+/* ----------------------------------------------- 
+   Check the results to make sure all the loops
+   have actually been run.   
+   This revised version (in 5.9 and above) sums the
+   absolute errors across the arrays, rather than 
+   summing the values in the arrays and comparing
+   with the expected sum.  This version is much 
+   less sensitive to accumulation of roundoff error.
+-------------------------------------------------- */
+void checkSTREAMresults (long N, double *a, double *b, double *c)
+{
+	/* N is the element count of each of a[], b[], c[].  Scalars replay the */
+	/* updates the arrays should have received; a line is printed for each */
+	/* array whose mean absolute error exceeds epsilon, else one success line. */
+	double aj,bj,cj,scalar;
+	double asum = 0.0, bsum = 0.0, csum = 0.0;
+	double epsilon = 1.e-8;		/* max allowed mean absolute error */
+	long	j;			/* long: N may exceed INT_MAX */
+	int	k,fail=0;
+
+    /* reproduce initialization */
+	aj = 1.0;
+	bj = 2.0;
+	cj = 0.0;
+    /* a[] is modified during timing check */
+	aj = 2.0E0 * aj;
+    /* now execute timing loop */
+	scalar = 3.0;
+	for (k=0; k<NTIMES; k++)
+        {
+            cj = aj;
+            bj = scalar*cj;
+            cj = aj+bj;
+            aj = bj+scalar*cj;
+        }
+    /* now aj, bj, and cj have values that should match each element */
+    /* of arrays a[], b[], and c[] -- unless I modified the code to */
+    /* fiddle with some entries to confuse optimizers -- watch for this */
+
+#ifdef VERBOSE
+	printf ("Comparison of specific values at midpoint of arrays: \n");
+	printf ("        Expected  : %f %f %f \n",aj,bj,cj);
+	printf ("        Observed  : %f %f %f \n",a[N/2],b[N/2],c[N/2]);
+#endif
+
+#ifndef abs
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+#endif
+	for (j=0; j<N; j++) {
+		asum += abs(a[j] - aj);
+		bsum += abs(b[j] - bj);
+		csum += abs(c[j] - cj);
+	}
+	/* turn each sum of absolute errors into a per-element average */
+	asum = asum / (double) (N);
+	bsum = bsum / (double) (N);
+	csum = csum / (double) (N);
+#ifdef VERBOSE
+	printf ("Average Absolute Error : \n");
+	printf ("    arrays: a, b, c  : %f %f %f \n",asum,bsum,csum);
+#endif
+
+	if (asum > epsilon) {
+		printf ("Failed Validation on array a[]\n");
+		printf ("        Max Allowable Error  : %f \n",epsilon);
+		printf ("        Observed Error       : %f \n",asum);
+		fail = 1;
+	}
+	if (bsum > epsilon) {
+		printf ("Failed Validation on array b[]\n");
+		printf ("        Max Allowable Error  : %f \n",epsilon);
+		printf ("        Observed Error       : %f \n",bsum);
+		fail = 1;
+	}
+	if (csum > epsilon) {
+		printf ("Failed Validation on array c[]\n");
+		printf ("        Max Allowable Error  : %f \n",epsilon);
+		printf ("        Observed Error       : %f \n",csum);
+		fail = 1;
+	}
+	if (fail == 0) {
+		printf ("Solution Validates\n");
+	}
+}
+
diff --git a/TO_DO b/TO_DO
new file mode 100644
index 0000000..363b9da
--- /dev/null
+++ b/TO_DO
@@ -0,0 +1,20 @@
+2016-05-18:
+
+* Incorporate the timer code for Microsoft Windows into the mainstream version of stream.c
+
+* Incorporate alternate array allocation into the mainstream version of stream.c
+** posix_memalign()
+** malloc()
+** mmap()
+** shmget()/shmat()
+
+* Figure out if the "restrict" keyword can actually be made portable and useful
+  for C versions....
+
+* Update the Fortran version of STREAM to catch up with the functionality
+  and features of the C version 5.10
+** Updated result checking code with minimal roundoff error.
+** Large integer array indices (N > (1<<32)).
+
+* Update the Fortran MPI version of STREAM to catch up with the functionality
+  and features of the C version 5.10
diff --git a/Versions/Experimental/Parallel_jobs b/Versions/Experimental/Parallel_jobs
new file mode 100644
index 0000000..02fce72
--- /dev/null
+++ b/Versions/Experimental/Parallel_jobs
@@ -0,0 +1,31 @@
+#!/bin/csh
+# For each job count in (1 2 4 6 8): start that many background jobs, wait,
+# then merge the per-job files P<n>.<i> into P<n>.out and remove them.
+# Purpose: run multiple copies of the stream_wall benchmark to measure
+# how much they interfere with each other....
+# John D. McCalpin, mccalpin@cs.virginia.edu
+# Mon May  2 18:51:19 EDT 1994
+# set verbose
+
+#switch ($#argv)
+#case 1:
+#  breaksw
+#default:
+#  echo "Usage: $0 <ncpus>"
+#  exit 1
+#endsw
+# NOTE(review): each job below is "echo stream_d", which writes that text rather than running a benchmark -- confirm this is intended.
+foreach k (1 2 4 6 8)
+    set NCPU=$k
+    set i=$k
+    echo "Starting $i jobs"
+    while (`expr $i - 1` >= 0)
+	echo stream_d >P${NCPU}.${i} &
+	set i=`expr $i - 1`
+    end
+    wait
+    cat P${NCPU}.[1-${NCPU}] >P${NCPU}.out
+    rm  P${NCPU}.[1-${NCPU}]
+    echo "All jobs done.... Output is in P${NCPU}.out"
+end
+exit 0
diff --git a/Versions/Experimental/do_offsets b/Versions/Experimental/do_offsets
new file mode 100644
index 0000000..34532de
--- /dev/null
+++ b/Versions/Experimental/do_offsets
@@ -0,0 +1,7 @@
+#!/bin/csh
+# For each offset: rewrite "offset=0" in stream_d.f into q.f, build q, and append its output to LOG.
+foreach OFFSET (0 7 8 9 15 16 17 31 32 33 63 64 65)
+    echo $OFFSET
+    sed "s/offset=0/offset=${OFFSET}/" <stream_d.f >q.f
+    make q && ./q >>LOG
+end
diff --git a/Versions/Old/1996-08-18/stream_d.c b/Versions/Old/1996-08-18/stream_d.c
new file mode 100644
index 0000000..867af5b
--- /dev/null
+++ b/Versions/Old/1996-08-18/stream_d.c
@@ -0,0 +1,143 @@
+/*
+* Program: Stream
+* Programmer: John D. McCalpin
+* Revision: 2.1, August 30, 1995
+*
+* This program measures memory transfer rates in MB/s for simple 
+* computational kernels coded in C.  These numbers reveal the
+* quality of code generation for simple uncacheable kernels as well
+* as showing the cost of floating-point operations relative to memory
+* accesses.
+*
+* INSTRUCTIONS:
+*	1) (fortran-specific, omitted.)
+*	2) Stream requires a good bit of memory to run.
+*	   Adjust the Parameter 'N' in the second line of the main
+*	   program to give a 'timing calibration' of at least 20 clicks.
+*	   This will provide rate estimates that should be good to 
+*	   about 5% precision.
+*	3) Compile the code with full optimization.  Many compilers
+*	   generate unreasonably bad code before the optimizer tightens
+*	   things up.  If the results are unreasonably good, on the
+*	   other hand, the optimizer might be too smart for me!
+*	4) Mail the results to mccalpin@cs.virginia.edu
+*	   Be sure to include:
+*		a) computer hardware model number and software revision
+*		b) the compiler flags
+*		c) all of the output from the test case.
+* Thanks!
+*
+* this version was ported from fortran to c by mark hahn, hahn+@pitt.edu.
+*/
+
+#define N 1000000
+#define NTIMES 10
+
+#ifdef __hpux
+#define _HPUX_SOURCE 1
+#else
+#define _INCLUDE_POSIX_SOURCE 1
+#endif
+#include <limits.h>
+#include <sys/time.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+
+struct timeval tvStart;	/* timing origin: written by utimeStart(), read by utime() */
+/* Latch the current gettimeofday() reading into tvStart as the timing origin. */
+void utimeStart() {
+    struct timezone zone;
+    gettimeofday(&tvStart,&zone);
+}
+
+/* Microseconds elapsed since the last utimeStart() call, as a float. */
+float utime() {
+    struct timeval now;
+    struct timezone zone;
+
+    gettimeofday(&now,&zone);
+    return 1e6 * (now.tv_sec - tvStart.tv_sec) + now.tv_usec - tvStart.tv_usec;
+}
+
+typedef double real;
+static real a[N],b[N],c[N];
+
+/* Benchmark driver: initialize a, b, c, time four kernels NTIMES times each
+ * with utimeStart()/utime(), then print one row per kernel giving the rate
+ * computed from the minimum time plus the RMS, minimum and maximum times.
+ * Always returns 0. */
+int main() {
+    int i,rep,kern;
+    float times[4][NTIMES];
+    static float rmstime[4] = {0};
+    static float mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+    static float maxtime[4] = {0};
+    static char *label[4] = {"Assignment:", "Scaling   :",
+			     "Summing   :", "SAXPYing  :"};
+    static float bytes[4] = { 2 * sizeof(real) * N, 2 * sizeof(real) * N,
+			      3 * sizeof(real) * N, 3 * sizeof(real) * N};
+
+    /* --- SETUP --- determine precision and check timing --- */
+    utimeStart();
+    for (i=0; i<N; i++) {
+	a[i] = 1.0;
+	b[i] = 2.0;
+	c[i] = 0.0;
+    }
+    printf("Timing calibration ; time = %f usec.\n",utime());
+    printf("Increase the size of the arrays if this is < 300000\n"
+	   "and your clock precision is =< 1/100 second.\n");
+    printf("---------------------------------------------------\n");
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+    for (rep=0; rep<NTIMES; rep++) {
+	utimeStart();			/* kernel 0: c = a */
+	for (i=0; i<N; i++)
+	    c[i] = a[i];
+	times[0][rep] = utime();
+	
+	utimeStart();			/* kernel 1: c = 3*a */
+	for (i=0; i<N; i++)
+	    c[i] = 3.0e0*a[i];
+	times[1][rep] = utime();
+	
+	utimeStart();			/* kernel 2: c = a+b */
+	for (i=0; i<N; i++)
+	    c[i] = a[i]+b[i];
+	times[2][rep] = utime();
+	
+	utimeStart();			/* kernel 3: c = a+3*b */
+	for (i=0; i<N; i++)
+	    c[i] = a[i]+3.0e0*b[i];
+	times[3][rep] = utime();
+    }
+    
+    /*	--- SUMMARY --- accumulate squared, smallest and largest times --- */
+    for (rep=0; rep<NTIMES; rep++) {
+	for (kern=0; kern<4; kern++) {
+	    rmstime[kern] += (times[kern][rep] * times[kern][rep]);
+	    mintime[kern] = MIN(mintime[kern], times[kern][rep]);
+	    maxtime[kern] = MAX(maxtime[kern], times[kern][rep]);
+	}
+    }
+    
+    printf("Function Rate   (MB/s)   RMS time     Min time     Max time\n");
+    for (kern=0; kern<4; kern++) {
+	rmstime[kern] = sqrt(rmstime[kern]/(float)NTIMES);
+
+	printf("%s%11.3f  %11.3f  %11.3f  %11.3f\n",
+	       label[kern],
+	       bytes[kern]/mintime[kern],
+	       rmstime[kern],
+	       mintime[kern],
+	       maxtime[kern]);
+    }
+    return 0;
+}
diff --git a/Versions/Old/1996-08-18/stream_d.f b/Versions/Old/1996-08-18/stream_d.f
new file mode 100644
index 0000000..c61bd6b
--- /dev/null
+++ b/Versions/Old/1996-08-18/stream_d.f
@@ -0,0 +1,245 @@
+* Program: Stream
+* Programmer: John D. McCalpin
+* Revision: 3.0, Aug 2 1994
+*
+* This program measures memory transfer rates in MB/s for simple
+* computational kernels coded in Fortran.  These numbers reveal the
+* quality of code generation for simple uncacheable kernels as well
+* as showing the cost of floating-point operations relative to memory
+* accesses.
+*
+* INSTRUCTIONS:
+*       1) Stream requires a cpu timing function called second().
+*          A sample is shown below.  This is unfortunately rather
+*          system dependent.  It helps to know the granularity of the
+*          timing.  The code below assumes that the granularity is
+*          1/100 seconds.
+*       2) Stream requires a good bit of memory to run.
+*          Adjust the Parameter 'N' in the second line of the main
+*          program to give a 'timing calibration' of at least 20 clicks.
+*          This will provide rate estimates that should be good to
+*          about 5% precision.
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me!
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*
+* Thanks!
+*
+      PROGRAM stream
+C     .. Parameters ..
+      INTEGER n,ntimes
+      PARAMETER (n=2 000 000,ntimes=10)
+C     n = elements per array, ntimes = timed passes over the kernels
+C     .. Local Scalars ..
+      DOUBLE PRECISION t,t0
+      INTEGER j,k,nbpw
+C     nbpw = bytes per DOUBLE PRECISION word, as returned by realsize()
+C     .. Local Arrays ..
+      DOUBLE PRECISION a(n),b(n),c(n),maxtime(4),mintime(4),rmstime(4),
+     $                 times(4,ntimes),sum(3)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     bytes(k) = words moved per array element by kernel k
+C     .. External Functions ..
+      REAL second
+      INTEGER realsize
+      EXTERNAL second,realsize
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC dble,max,min,sqrt
+C     ..
+C     .. Data statements ..
+      DATA rmstime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Assignment:','Scaling   :','Summing   :',
+     $     'SAXPYing  :'/
+      DATA bytes/2,2,3,3/
+C     ..
+
+*       --- SETUP --- determine precision and check timing ---
+
+      nbpw = realsize()
+*       Time the array initialization as a calibration of the clock
+      t = second(t0)
+      t = second(t0)
+      DO 10 j = 1,n
+          a(j) = 1.0D0
+          b(j) = 2.0D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = second(t0) - t
+      PRINT *,'Timing calibration ; time = ',t*100,' hundredths',
+     $  ' of a second'
+      PRINT *,'Increase the size of the arrays if this is <30 ',
+     $  ' and your clock precision is =<1/100 second'
+      PRINT *,'---------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      DO 60 k = 1,ntimes
+
+          t = second(t0)
+          DO 20 j = 1,n
+              c(j) = a(j)
+   20     CONTINUE
+          t = second(t0) - t
+          times(1,k) = t
+
+          t = second(t0)
+          DO 30 j = 1,n
+              b(j) = 3.0D0*c(j)
+   30     CONTINUE
+          t = second(t0) - t
+          times(2,k) = t
+
+          t = second(t0)
+          DO 40 j = 1,n
+              c(j) = a(j) + b(j)
+   40     CONTINUE
+          t = second(t0) - t
+          times(3,k) = t
+
+          t = second(t0)
+          DO 50 j = 1,n
+              a(j) = b(j) + 3.0D0*c(j)
+   50     CONTINUE
+          t = second(t0) - t
+          times(4,k) = t
+   60 CONTINUE
+
+*       --- SUMMARY --- RMS, minimum and maximum time of each kernel ---
+      DO 80 k = 1,ntimes
+          DO 70 j = 1,4
+              rmstime(j) = rmstime(j) + times(j,k)**2
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   70     CONTINUE
+   80 CONTINUE
+      WRITE (*,FMT=9000)
+      DO 90 j = 1,4
+          rmstime(j) = sqrt(rmstime(j)/dble(ntimes))
+          WRITE (*,FMT=9010) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6,
+     $      rmstime(j),mintime(j),maxtime(j)
+   90 CONTINUE
+      sum(1) = 0.0
+      sum(2) = 0.0
+      sum(3) = 0.0
+      DO 100 j=1,n
+         sum(1) = sum(1) + a(j)
+         sum(2) = sum(2) + b(j)
+         sum(3) = sum(3) + c(j)
+  100 CONTINUE
+      PRINT *,'Sum of a is : ',sum(1)
+      PRINT *,'Sum of b is : ',sum(2)
+      PRINT *,'Sum of c is : ',sum(3)
+
+ 9000 FORMAT ('Function',5x,'Rate (MB/s)  RMS time   Min time  Max time'
+     $       )
+ 9010 FORMAT (a,4 (f10.4,2x))
+      END
+
+*-------------------------------------
+* Sample timing routine: returns etime(dummy) as a REAL; t0 is not used.
+*       This code works on Sun and Silicon Graphics machines.
+       REAL function second(t0)
+       double precision t0
+       real dummy(2)
+       second = etime(dummy)
+       end
+* Sample timing routine
+*       This code works on IBM RS/6000 machines
+*      REAL FUNCTION second(t0)
+C     .. Scalar Arguments ..
+*      DOUBLE PRECISION t0
+C     ..
+C     .. External Functions ..
+*      INTEGER mclock
+*      EXTERNAL mclock
+C     ..
+*      second = mclock()*0.01D0
+*      END
+
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLEPRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLEPRECISION 
+* number occupies.
+* Returns 4 or 8, or whatever the user enters if the test is inconclusive.
+      INTEGER FUNCTION realsize()
+
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ..
+C     .. External Subroutines ..
+      EXTERNAL dummy
+C     NOTE(review): dummy presumably keeps test out of a register - confirm
+C     .. Intrinsic Functions ..
+      INTRINSIC abs,acos,log10,sqrt
+C     ..
+
+C       Test #1 - compare single(1.0d0+delta) to 1.0d0
+
+   10 DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0**(-j)
+   20 CONTINUE
+C       Stop at the first j for which 1 + 10**(-j) compares equal to 1
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL dummy(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 50
+          END IF
+   30 CONTINUE
+      GOTO 60
+C       ndigits <= 8 is taken to mean 4-byte words, otherwise 8-byte
+   50 WRITE (*,FMT='(a)') '--------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLEPRECISION word'
+      WRITE (*,FMT='(a)') '--------------------------------------'
+      RETURN
+C       Fallback: ask the user; the second answer is accepted unchecked
+   60 PRINT *,'Hmmmm.  I am unable to determine the size of a REAL'
+      PRINT *,'Please enter the number of Bytes per DOUBLEPRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense!'
+          PRINT *,'Try again!'
+          PRINT *,'Please enter the number of Bytes per ',
+     $      'REAL number : '
+          READ (*,FMT=*) realsize
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per REAL number'
+      WRITE (*,FMT='(a)') '--------------------------------------'
+      END
+
+      SUBROUTINE dummy(q,r)
+C     .. Scalar Arguments ..
+      DOUBLE PRECISION q,r
+C     q is the input value; r receives cos(q)
+C     .. Intrinsic Functions ..
+      INTRINSIC cos
+C     ..
+      r = cos(q)
+      RETURN
+      END
diff --git a/Versions/Old/1996-08-18/stream_wall.f b/Versions/Old/1996-08-18/stream_wall.f
new file mode 100644
index 0000000..4d6434f
--- /dev/null
+++ b/Versions/Old/1996-08-18/stream_wall.f
@@ -0,0 +1,223 @@
+* Program: Stream
+* Programmer: John D. McCalpin
+* Revision: 3.0, August 2, 1994
+*
+* This program measures memory transfer rates in MB/s for simple
+* computational kernels coded in Fortran.  These numbers reveal the
+* quality of code generation for simple uncacheable kernels as well
+* as showing the cost of floating-point operations relative to memory
+* accesses.
+*
+* INSTRUCTIONS:
+*       1) Stream requires a cpu timing function called myclock().
+*          A sample is in myclock.c.  This is unfortunately rather
+*          system dependent.  It helps to know the granularity of the
+*          timing.  The code below assumes that the granularity is
+*          1/100 seconds.
+*       2) Stream requires a good bit of memory to run.
+*          Adjust the Parameter 'N' in the second line of the main
+*          program to give a 'timing calibration' of at least 20 clicks.
+*          This will provide rate estimates that should be good to
+*          about 5% precision.
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me!
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*
+* Thanks!
+*
+      PROGRAM stream
+C     Times four vector kernels (assign, scale, sum, SAXPY) over arrays
+C     of n elements, ntimes repetitions each, and prints a MB/s rate
+C     from the fastest repetition together with RMS/min/max times.
+C     .. Parameters ..
+      INTEGER n,ntimes
+      PARAMETER (n=2 000 000,ntimes=10)
+C     .. Local Scalars ..
+      DOUBLE PRECISION t,t0
+      INTEGER j,k,nbpw
+C     .. Local Arrays ..
+      DOUBLE PRECISION a(n),b(n),c(n),maxtime(4),mintime(4),rmstime(4),
+     $                 times(4,ntimes),sum(3)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     .. External Functions ..
+      DOUBLE PRECISION myclock
+      INTEGER realsize
+      EXTERNAL myclock,realsize
+C     .. Intrinsic Functions ..
+      INTRINSIC dble,max,min,sqrt
+C     .. Data statements ..
+C     bytes(j) = words touched per element by kernel j (x nbpw bytes).
+      DATA rmstime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Assignment:','Scaling   :','Summing   :',
+     $     'SAXPYing  :'/
+      DATA bytes/2,2,3,3/
+
+*       --- SETUP --- determine precision and check timing ---
+
+      nbpw = realsize()
+*       (myclock is read twice; presumably the first call is a warm-up)
+      t = myclock(t0)
+      t = myclock(t0)
+      DO 10 j = 1,n
+          a(j) = 1.0D0
+          b(j) = 2.0D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = myclock(t0) - t
+      PRINT *,'Timing calibration ; time = ',t*100,' hundredths',
+     $  ' of a second'
+      PRINT *,'Increase the size of the arrays if this is <30 ',
+     $  ' and your clock precision is =<1/100 second'
+      PRINT *,'---------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      DO 60 k = 1,ntimes
+
+          t = myclock(t0)
+          DO 20 j = 1,n
+              c(j) = a(j)
+   20     CONTINUE
+          t = myclock(t0) - t
+          times(1,k) = t
+
+          t = myclock(t0)
+          DO 30 j = 1,n
+              b(j) = 3.0D0*c(j)
+   30     CONTINUE
+          t = myclock(t0) - t
+          times(2,k) = t
+
+          t = myclock(t0)
+          DO 40 j = 1,n
+              c(j) = a(j) + b(j)
+   40     CONTINUE
+          t = myclock(t0) - t
+          times(3,k) = t
+
+          t = myclock(t0)
+          DO 50 j = 1,n
+              a(j) = b(j) + 3.0D0*c(j)
+   50     CONTINUE
+          t = myclock(t0) - t
+          times(4,k) = t
+   60 CONTINUE
+
+*       --- SUMMARY ---
+      DO 80 k = 1,ntimes
+          DO 70 j = 1,4
+              rmstime(j) = rmstime(j) + times(j,k)**2
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   70     CONTINUE
+   80 CONTINUE
+      WRITE (*,FMT=9000)
+      DO 90 j = 1,4
+          rmstime(j) = sqrt(rmstime(j)/dble(ntimes))
+C         dble(n) keeps n*bytes*nbpw from overflowing default INTEGER.
+          WRITE (*,FMT=9010) label(j),
+     $      dble(n)*bytes(j)*nbpw/mintime(j)/1.0D6,
+     $      rmstime(j),mintime(j),maxtime(j)
+   90 CONTINUE
+      sum(1) = 0.0
+      sum(2) = 0.0
+      sum(3) = 0.0
+      DO 100 j=1,n
+         sum(1) = sum(1) + a(j)
+         sum(2) = sum(2) + b(j)
+         sum(3) = sum(3) + c(j)
+  100 CONTINUE
+      PRINT *,'Sum of a is : ',sum(1)
+      PRINT *,'Sum of b is : ',sum(2)
+      PRINT *,'Sum of c is : ',sum(3)
+
+ 9000 FORMAT ('Function',5x,'Rate (MB/s)  RMS time   Min time  Max time'
+     $       )
+ 9010 FORMAT (a,4 (f10.4,2x))
+      END
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLEPRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLEPRECISION 
+* number occupies.
+*
+      INTEGER FUNCTION realsize()
+C     Guess the storage size, in bytes, of a DOUBLE PRECISION value.
+C     Finds the first j in 1..30 for which 1 + 10**(-j) compares equal
+C     to 1; j <= 8 is reported as 4 bytes, larger j as 8 bytes.  If
+C     no such j exists the user is prompted until 4 or 8 is entered.
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     .. External Subroutines ..
+      EXTERNAL dummy
+C     ..
+
+C       Test #1 - compare (1.0d0+delta) to 1.0d0
+
+      DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0**(-j)
+   20 CONTINUE
+
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+C         NOTE(review): result is never read; the call presumably
+C         only forces test through memory before the compare - confirm.
+          CALL dummy(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 50
+          END IF
+   30 CONTINUE
+      GOTO 60
+
+   50 WRITE (*,FMT='(a)') '--------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLEPRECISION word'
+      WRITE (*,FMT='(a)') '--------------------------------------'
+      RETURN
+
+   60 PRINT *,'Hmmmm.  I am unable to determine the size of a REAL'
+      PRINT *,'Please enter the number of Bytes per DOUBLEPRECISION',
+     $  ' number : '
+   70 READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense!'
+          PRINT *,'Try again!'
+          PRINT *,'Please enter the number of Bytes per ',
+     $      'REAL number : '
+          GO TO 70
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per REAL number'
+      WRITE (*,FMT='(a)') '--------------------------------------'
+      END
+
+      SUBROUTINE dummy(q,r)
+C     Store the cosine of q in r.
+C       q - input value, only read
+C       r - output, overwritten with cos(q)
+C     NOTE(review): the caller in this file never reads r; the call
+C     presumably only hides its argument from the optimizer - confirm.
+      DOUBLE PRECISION q,r
+      INTRINSIC cos
+      r = cos(q)
+      END
diff --git a/Versions/Old/2003-04-08/second_cpu.c b/Versions/Old/2003-04-08/second_cpu.c
new file mode 100644
index 0000000..d0338a9
--- /dev/null
+++ b/Versions/Old/2003-04-08/second_cpu.c
@@ -0,0 +1,15 @@
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/times.h>
+#include <time.h>
+
+/* Process CPU time (user + system) reported by times(), in seconds. */
+double mysecond()
+{
+    struct tms realbuf;
+
+    times(&realbuf);
+    /* Divide in double: the previous float arithmetic rounded the tick
+       count to about 7 significant digits. */
+    return (double) (realbuf.tms_stime + realbuf.tms_utime) / (double) CLK_TCK;
+}
diff --git a/Versions/Old/2003-04-08/second_cpu.f b/Versions/Old/2003-04-08/second_cpu.f
new file mode 100644
index 0000000..cef7c36
--- /dev/null
+++ b/Versions/Old/2003-04-08/second_cpu.f
@@ -0,0 +1,18 @@
+*-------------------------------------
+* Sample timing routine
+*       This code works on Sun and Silicon Graphics machines.
+*       DOUBLE PRECISION function mysecond()
+*       real arg(2)
+*       mysecond = etime(arg)
+*       end
+* Sample timing routine
+*       This code works on IBM RS/6000 machines
+      DOUBLE PRECISION FUNCTION mysecond()
+C     Returns mclock()*0.01 (presumably 1/100 s ticks -> seconds).
+      DOUBLE PRECISION tick
+      PARAMETER (tick=0.01D0)
+      INTEGER mclock
+      EXTERNAL mclock
+      mysecond = mclock()*tick
+      END
+
diff --git a/Versions/Old/2003-04-08/second_wall.c b/Versions/Old/2003-04-08/second_wall.c
new file mode 100644
index 0000000..a9b799a
--- /dev/null
+++ b/Versions/Old/2003-04-08/second_wall.c
@@ -0,0 +1,27 @@
+/* A Fortran-callable gettimeofday routine to give access
+   to the wall clock timer.
+
+   This subroutine may need to be modified slightly to get
+   it to link with Fortran on your computer.
+   The most common difference is adding/removing a trailing
+   underscore character to the function name.
+*/
+
+#include <sys/time.h>
+/* int gettimeofday(struct timeval *tp, struct timezone *tzp); */
+
+double mysecond_()
+{
+/* Wall-clock time in seconds, as a double, for Fortran callers.
+ *
+ * The value is tv_sec plus tv_usec scaled by 1e-6, exactly as filled
+ * in by gettimeofday().  The timezone argument is a null pointer:
+ * POSIX leaves the behaviour unspecified when it is non-null, and the
+ * struct that used to be passed was never read.  The call's status is
+ * deliberately not inspected (best-effort timer, as before).
+ */
+	struct timeval tp;
+
+	(void) gettimeofday(&tp, (void *) 0);
+	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
diff --git a/Versions/Old/Experimental/Parallel_jobs b/Versions/Old/Experimental/Parallel_jobs
new file mode 100644
index 0000000..02fce72
--- /dev/null
+++ b/Versions/Old/Experimental/Parallel_jobs
@@ -0,0 +1,31 @@
+#!/bin/csh
+#
+# This program runs multiple copies of the stream_wall benchmark
+# to measure how much they interfere with each other....
+#
+# John D. McCalpin, mccalpin@cs.virginia.edu
+# Mon May  2 18:51:19 EDT 1994
+# set verbose
+
+#switch ($#argv)
+#case 1:
+#  breaksw
+#default:
+#  echo "Usage: $0 <ncpus>"
+#  exit 1
+#endsw
+# For each job count k: start k background jobs, wait, merge their files.
+# NOTE(review): each job only echoes the word stream_d; confirm intended.
+foreach k (1 2 4 6 8)
+    set NCPU=$k i=$k
+    echo "Starting $i jobs"
+    while ($i > 0)
+	echo stream_d >P${NCPU}.${i} &
+	@ i--
+    end
+    wait
+    cat P${NCPU}.[1-${NCPU}] >P${NCPU}.out
+    rm  P${NCPU}.[1-${NCPU}]
+    echo "All jobs done.... Output is in P${NCPU}.out"
+end
+exit 0
diff --git a/Versions/Old/Experimental/do_offsets b/Versions/Old/Experimental/do_offsets
new file mode 100644
index 0000000..34532de
--- /dev/null
+++ b/Versions/Old/Experimental/do_offsets
@@ -0,0 +1,7 @@
+#!/bin/csh
+foreach OFFSET (0 7 8 9 15 16 17 31 32 33 63 64 65)
+    echo $OFFSET
+    sed "s/offset=0/offset=${OFFSET}/" <stream_d.f >q.f
+    # Run only a freshly built binary; a failed make must not log a stale q.
+    make q && ./q >>LOG
+end
diff --git a/Versions/Old/second_cpu.c b/Versions/Old/second_cpu.c
new file mode 100644
index 0000000..d0338a9
--- /dev/null
+++ b/Versions/Old/second_cpu.c
@@ -0,0 +1,15 @@
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/times.h>
+#include <time.h>
+
+/* Process CPU time (user + system) reported by times(), in seconds. */
+double mysecond()
+{
+    struct tms realbuf;
+
+    times(&realbuf);
+    /* Divide in double: the previous float arithmetic rounded the tick
+       count to about 7 significant digits. */
+    return (double) (realbuf.tms_stime + realbuf.tms_utime) / (double) CLK_TCK;
+}
diff --git a/Versions/Old/second_cpu.f b/Versions/Old/second_cpu.f
new file mode 100644
index 0000000..cef7c36
--- /dev/null
+++ b/Versions/Old/second_cpu.f
@@ -0,0 +1,18 @@
+*-------------------------------------
+* Sample timing routine
+*       This code works on Sun and Silicon Graphics machines.
+*       DOUBLE PRECISION function mysecond()
+*       real arg(2)
+*       mysecond = etime(arg)
+*       end
+* Sample timing routine
+*       This code works on IBM RS/6000 machines
+      DOUBLE PRECISION FUNCTION mysecond()
+C     Returns mclock()*0.01 (presumably 1/100 s ticks -> seconds).
+      DOUBLE PRECISION tick
+      PARAMETER (tick=0.01D0)
+      INTEGER mclock
+      EXTERNAL mclock
+      mysecond = mclock()*tick
+      END
+
diff --git a/Versions/Old/second_wall.c b/Versions/Old/second_wall.c
new file mode 100644
index 0000000..a9b799a
--- /dev/null
+++ b/Versions/Old/second_wall.c
@@ -0,0 +1,27 @@
+/* A Fortran-callable gettimeofday routine to give access
+   to the wall clock timer.
+
+   This subroutine may need to be modified slightly to get
+   it to link with Fortran on your computer.
+   The most common difference is adding/removing a trailing
+   underscore character to the function name.
+*/
+
+#include <sys/time.h>
+/* int gettimeofday(struct timeval *tp, struct timezone *tzp); */
+
+double mysecond_()
+{
+/* Wall-clock time in seconds, as a double, for Fortran callers.
+ *
+ * The value is tv_sec plus tv_usec scaled by 1e-6, exactly as filled
+ * in by gettimeofday().  The timezone argument is a null pointer:
+ * POSIX leaves the behaviour unspecified when it is non-null, and the
+ * struct that used to be passed was never read.  The call's status is
+ * deliberately not inspected (best-effort timer, as before).
+ */
+	struct timeval tp;
+
+	(void) gettimeofday(&tp, (void *) 0);
+	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
diff --git a/Versions/Old/stream.c.5.10 b/Versions/Old/stream.c.5.10
new file mode 100644
index 0000000..b9fd19e
--- /dev/null
+++ b/Versions/Old/stream.c.5.10
@@ -0,0 +1,496 @@
+/*-----------------------------------------------------------------------*/
+/* Program: Stream                                                       */
+/* Revision: $Id: stream.c,v 5.10 2009/01/28 13:22:09 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2005: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*         "tuned STREAM benchmark results"                              */
+/*         "based on a variant of the STREAM benchmark code"             */
+/*         Other comparable, clear and reasonable labelling is           */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <stdlib.h>
+# include <getopt.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <math.h>
+# include <sys/time.h>
+# include <sys/ipc.h>
+# include <sys/shm.h>
+
+/* INSTRUCTIONS:
+ *
+ *	1) Stream requires a good bit of memory to run.  Adjust the
+ *          value of 'N' (below) to give a 'timing calibration' of 
+ *          at least 20 clock-ticks.  This will provide rate estimates
+ *          that should be good to about 5% precision.
+ */
+
+/*
+ *	3) Compile the code with full optimization.  Many compilers
+ *	   generate unreasonably bad code before the optimizer tightens
+ *	   things up.  If the results are unreasonably good, on the
+ *	   other hand, the optimizer might be too smart for me!
+ *
+ *         Try compiling with:
+ *               cc -O stream_omp.c -o stream_omp
+ *
+ *         This is known to work on Cray, SGI, IBM, and Sun machines.
+ *
+ *
+ *	4) Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include:
+ *		a) computer hardware model number and software revision
+ *		b) the compiler flags
+ *		c) all of the output from the test case.
+ * Thanks!
+ *
+ */
+
+#include "stream.h"
+
+#define MAXNTIMES 100
+#define MAXSEGS 8
+
+/* STREAM driver: parse options, allocate a/b/c (heap, or one SHM_HUGETLB
+ * segment with -l), estimate the timer granularity, run the timing loop
+ * included from standard.inc or tuned.inc, print rate and avg/min/max
+ * time per kernel, then validate the arrays.  Returns 0; exits 1-3 when
+ * memory cannot be obtained and 0 after --help. */
+int
+main(int argc, char **argv)
+    {
+    long		N, OFFSET, SIZE;
+    int			quantum;
+    int			largepage,shmflag,shmid[MAXSEGS];
+    int			BytesPerWord;
+    register int	j, k;
+    double		scalar, t, times[4][MAXNTIMES];
+    double		*a, *b, *c;
+    double		avgtime[4] = {0};
+    double		maxtime[4] = {0};
+    double		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+    static char		*label[4] = {"Copy:      ", "Scale:     ",
+				     "Add:       ", "Triad:     "};
+    double		bytes[4] = {2 * sizeof(double),
+				    2 * sizeof(double),
+				    3 * sizeof(double),
+				    3 * sizeof(double)};
+
+    /* --- default options --- */
+    N = 2000000;
+    OFFSET = 0;
+    NTIMES = 10;
+    largepage = 0;
+
+    /* --- NEW --- parse command line arguments using getopt --- */
+    while (1) {
+        int option_index = 0;
+        static struct option long_options[] = {
+            {"largepage", 0, 0, 'l'},	/* flag only, like "l" below */
+            {"length", 1, 0, 'n'},
+            {"offset", 1, 0, 'o'},
+            {"repetitions", 1, 0, 'r'},
+            {"tuned", 1, 0, 't'},
+            {"help", 0, 0, 'h'},
+            {0, 0, 0, 0}
+        };
+
+        j = getopt_long (argc, argv, "ln:o:r:t:h",
+                 long_options, &option_index);
+        if (j == -1)	/* finished parsing all command-line options */
+            break;
+
+        switch (j) {
+	    case 0:			/* this should not happen */
+		printf ("option %s", long_options[option_index].name);
+		if (optarg)
+		    printf (" with arg %s", optarg);
+		printf ("\n");
+		break;
+
+	    case 'l':		/* requesting data allocation on large pages */
+		printf ("User requested data allocation on large pages\n");
+		largepage=1;
+		break;
+
+	    case 'n':		/* define vector length in 10^6 elements */
+		printf ("User requested Array Size of %s * 10^6 elements\n", optarg);
+		N = atoi(optarg);
+		if ( N >= 2147 ) {
+		    printf("Warning: Array Size exceeds 2GB - watch for anomalies\n");
+		}
+		N = N * 1000 * 1000;
+		break;
+
+	    case 'o':		/* define offset in elements */
+		printf ("User requested Array Offset of %s elements\n", optarg);
+		OFFSET = atoi(optarg);
+		break;
+
+	    case 'r':		/* specify number of repetitions */
+		printf ("option r with value '%s'\n", optarg);
+		NTIMES = atoi(optarg);
+		if (NTIMES > MAXNTIMES) {
+		    NTIMES = MAXNTIMES;
+		    printf("Note: requested repetitions exceeds maximum allowed\n");
+		    printf("      repeat count reset to %d\n",MAXNTIMES);
+		}
+		if (NTIMES < 2) {	/* the summary divides by NTIMES-1 */
+		    NTIMES = 2;
+		    printf("Note: repeat count raised to 2 (1st is not counted)\n");
+		}
+            break;
+
+	   case 't':		/* selection tuned version of code -- not currently used */
+		printf ("option t with value '%s'\n", optarg);
+            break;
+
+	   case 'h':
+		printf ("Usage: %s [options]\n",argv[0]);
+		printf ("Options:\n");
+		printf ("   [-l]                  <-- request data put on large pages (in a shared segment)\n");
+		printf ("   [-n, --length] n      <-- n is array length in 10^6 elements\n");
+		printf ("   [-o, --offset] n      <-- n is offset/padding in elements\n");
+		printf ("   [-r, --repetitions] n <-- n is number of repetitions for timing (1st is not counted)\n");
+		printf ("   [-t, --tuned] nnn     <-- nnn is version number of tuned kernels to execute\n");
+		printf ("   [-h, --help]          <-- gives this help message and exits\n");
+		exit(0);
+
+	   default:
+           printf ("?? getopt returned character code 0%o ??\n", j);
+        }
+    }
+
+   if (optind < argc) {
+        printf ("non-option ARGV-elements: ");
+        while (optind < argc)
+            printf ("%s ", argv[optind++]);
+        printf ("\n");
+    }
+
+    /* --- SETUP --- determine precision and check timing --- */
+
+    printf(HLINE);
+    printf("STREAM version $Revision: 5.10 $\n");
+    printf(HLINE);
+    BytesPerWord = sizeof(double);
+    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
+	BytesPerWord);
+
+    printf(HLINE);
+    printf("Array size = %ld, Offset = %ld\n" , N, OFFSET);
+    printf("Total memory required = %.1f MB.\n",
+	(3.0 * BytesPerWord) * ( (double) (N+OFFSET) / 1048576.0));
+    printf(HLINE);
+    printf("Allocating three arrays....\n");
+
+    if (largepage) {
+	SIZE = (3*N + 3*OFFSET) * sizeof(double);
+	printf("Data SIZE needed %ld (Bytes)\n",SIZE);
+	SIZE = ceil((double)SIZE/(2048.*1024.)) * 2048*1024;
+	printf("Data SIZE requested %ld (Bytes)\n",SIZE);
+	printf("attempting to create (shmget) a shared segment of size %ld\n",SIZE);
+	shmid[0] = shmget(IPC_PRIVATE,SIZE,IPC_CREAT|SHM_HUGETLB);
+	if (shmid[0] == -1) {
+		perror("ERROR: failed shmget:");
+		printf("(usually caused by non-root user trying to get large pages)\n");
+		exit(2);
+	}
+	printf("shmget returned a shmid of %d\n",shmid[0]);
+	a = shmat(shmid[0],0,SHM_RND);
+	printf("shmat returned a pointer to %p\n",(void *) a);
+	if (a == (double *)(-1)) {
+		perror("ERROR: failed shmat:");
+		printf("Deleting shared segment\n");
+		shmctl(shmid[0],IPC_RMID,NULL);
+		exit(3);
+	}
+	shmctl(shmid[0],IPC_RMID,NULL);	/* segment goes away on last detach */
+	b = a + N + OFFSET;
+	c = a + 2 * (N + OFFSET);
+    }
+    else{
+	a = malloc( (N+0*OFFSET) * sizeof(double));
+	b = malloc( (N+1*OFFSET) * sizeof(double));
+	c = malloc( (N+2*OFFSET) * sizeof(double));
+	if ( (a==0) || (b==0) || (c==0) ) {
+	    printf("Error: one or more mallocs failed!\n");
+	    printf(" a = %p\n",(void *) a);
+	    printf(" b = %p\n",(void *) b);
+	    printf(" c = %p\n",(void *) c);
+	    exit(1);	/* the arrays are written right below; cannot go on */
+	}
+	/* Move the starting points of b and c to implement the OFFSET */
+	b += OFFSET;
+	c += 2*OFFSET;
+    }
+    printf("Array Starting Locations: \n");
+    printf(" a = %p\n",(void *) a);
+    printf(" b = %p\n",(void *) b);
+    printf(" c = %p\n",(void *) c);
+
+    printf(HLINE);
+    printf("Each test is run %d times, but only\n", NTIMES);
+    printf("the *best* time for each is used.\n");
+
+#ifdef _OPENMP
+    printf(HLINE);
+    printf("OpenMP conditional compilation is active\n");
+#pragma omp parallel
+    {
+#pragma omp master
+	{
+	    k = omp_get_num_threads();
+	    printf ("Number of Threads requested = %i\n",k);
+        }
+    }
+#endif
+
+    printf(HLINE);
+#pragma omp parallel
+    {
+    printf ("Printing one line per active thread....\n");
+    }
+
+    /* Initialize the three arrays. */
+#pragma omp parallel for
+    for (j=0; j<N; j++) {
+	a[j] = 1.0;
+	b[j] = 2.0;
+	c[j] = 0.0;
+	}
+    printf(HLINE);
+    if  ( (quantum = checktick()) >= 1)
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+	quantum = 1;
+    }
+    t = mysecond();
+#pragma omp parallel for
+    for (j = 0; j < N; j++)
+	a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+    printf(HLINE);
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+#ifdef TUNED
+#include "tuned.inc"
+#else
+#include "standard.inc"
+#endif
+
+    /*	--- SUMMARY --- */
+    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+	{
+	for (j=0; j<4; j++)
+	    {
+	    avgtime[j] = avgtime[j] + times[j][k];
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+	avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+
+	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
+	       1.0E-06 * (double) N * bytes[j]/mintime[j],
+	       avgtime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    printf(HLINE);
+
+    /* --- Check Results --- */
+    checkSTREAMresults(N, a, b, c);
+    printf(HLINE);
+
+    return 0;
+}
+
+
+/* =============== Utility Routines ================= */
+
+/* ---------------------------------------------------------------- 
+   checktick tries to determine the granularity of the system timer
+   Thanks to John Henning for the original code 
+------------------------------------------------------------------- */
+
+# define	M	20
+/* Estimate the timer granularity in microseconds: take M clock samples,
+ * each recorded once the clock has advanced at least 1e-6 s past a fresh
+ * reading, and return the smallest non-negative gap between consecutive
+ * samples (never more than 1000000). */
+int
+checktick()
+{
+    double	samples[M];
+    double	start, now;
+    int		idx, gap;
+    int		smallest = 1000000;
+
+    /* Spin until the clock has visibly advanced, then record it. */
+    for (idx = 0; idx < M; idx++) {
+	start = mysecond();
+	do {
+	    now = mysecond();
+	} while ((now - start) < 1.0E-6);
+	samples[idx] = now;
+    }
+
+    for (idx = 1; idx < M; idx++) {
+	gap = (int)( 1.0E6 * (samples[idx]-samples[idx-1]));
+	smallest = MIN(smallest, MAX(gap,0));
+    }
+    return(smallest);
+}
+
+
+/* ----------------------------------------------- 
+   A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems. 
+-------------------------------------------------- */
+
+#include <sys/time.h>
+
+/* Wall-clock time in seconds (tv_sec + tv_usec * 1e-6) from gettimeofday().
+ * The timezone argument is NULL: POSIX leaves a non-null one unspecified. */
+double mysecond()
+{
+        struct timeval tp;
+
+        (void) gettimeofday(&tp, NULL);
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+/* ----------------------------------------------- 
+   Check the results to make sure all the loops
+   have actually been run.   
+   This revised version (in 5.9 and above) sums the
+   absolute errors across the arrays, rather than 
+   summing the values in the arrays and comparing
+   with the expected sum.  This version is much 
+   less sensitive to accumulation of roundoff error.
+-------------------------------------------------- */
+void checkSTREAMresults (long N, double *a, double *b, double *c)
+{
+    /* N is the length of a[], b[] and c[].  Replays the kernels on scalars
+       to get the expected element values, then reports each array whose
+       mean absolute error exceeds epsilon, or "Solution Validates". */
+	double aj,bj,cj,scalar;
+	double asum,bsum,csum;
+	double epsilon;
+	long	j;		/* same width as N so large arrays are fully scanned */
+	int	k,fail=0;
+
+    /* reproduce initialization */
+	aj = 1.0;
+	bj = 2.0;
+	cj = 0.0;
+    /* a[] is modified during timing check */
+	aj = 2.0E0 * aj;
+    /* now execute timing loop */
+	scalar = 3.0;
+	for (k=0; k<NTIMES; k++)
+        {
+            cj = aj;
+            bj = scalar*cj;
+            cj = aj+bj;
+            aj = bj+scalar*cj;
+        }
+    /* now aj, bj, and cj have values that should match each element */
+    /* of arrays a[], b[], and c[] -- unless I modified the code to */
+    /* fiddle with some entries to confuse optimizers -- watch for this */
+
+#ifdef VERBOSE
+	printf ("Comparison of specific values at midpoint of arrays: \n");
+	printf ("        Expected  : %f %f %f \n",aj,bj,cj);
+	printf ("        Observed  : %f %f %f \n",a[N/2],b[N/2],c[N/2]);
+#endif
+
+	asum = 0.0;
+	bsum = 0.0;
+	csum = 0.0;
+	for (j=0; j<N; j++) {
+		asum += fabs(a[j] - aj);
+		bsum += fabs(b[j] - bj);
+		csum += fabs(c[j] - cj);
+	}
+	asum = asum / (double) (N);
+	bsum = bsum / (double) (N);
+	csum = csum / (double) (N);
+#ifdef VERBOSE
+	printf ("Average Absolute Error : \n");
+	printf ("    arrays: a, b, c  : %f %f %f \n",asum,bsum,csum);
+#endif
+	epsilon = 1.e-8;
+
+	if (asum > epsilon) {
+		printf ("Failed Validation on array a[]\n");
+		printf ("        Max Allowable Error  : %f \n",epsilon);
+		printf ("        Observed Error       : %f \n",asum);
+		fail = 1;
+	}
+	if (bsum > epsilon) {
+		printf ("Failed Validation on array b[]\n");
+		printf ("        Max Allowable Error  : %f \n",epsilon);
+		printf ("        Observed Error       : %f \n",bsum);
+		fail = 1;
+	}
+	if (csum > epsilon) {
+		printf ("Failed Validation on array c[]\n");
+		printf ("        Max Allowable Error  : %f \n",epsilon);
+		printf ("        Observed Error       : %f \n",csum);
+		fail = 1;
+	}
+	if (fail == 0) {
+		printf ("Solution Validates\n");
+	}
+}
+
diff --git a/Versions/Old/stream_d.c b/Versions/Old/stream_d.c
new file mode 100644
index 0000000..1f2bbe1
--- /dev/null
+++ b/Versions/Old/stream_d.c
@@ -0,0 +1,218 @@
+# include <stdio.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/*
+ * Program: Stream
+ * Programmer: Joe R. Zagar
+ * Revision: 4.0-BETA, October 24, 1995
+ * Original code developed by John D. McCalpin
+ *
+ * This program measures memory transfer rates in MB/s for simple 
+ * computational kernels coded in C.  These numbers reveal the quality
+ * of code generation for simple uncacheable kernels as well as showing
+ * the cost of floating-point operations relative to memory accesses.
+ *
+ * INSTRUCTIONS:
+ *
+ *	1) Stream requires a good bit of memory to run.  Adjust the
+ *          value of 'N' (below) to give a 'timing calibration' of 
+ *          at least 20 clock-ticks.  This will provide rate estimates
+ *          that should be good to about 5% precision.
+ */
+
+# define N	2000000
+# define NTIMES	10
+# define OFFSET	0
+
+/*
+ *	3) Compile the code with full optimization.  Many compilers
+ *	   generate unreasonably bad code before the optimizer tightens
+ *	   things up.  If the results are unreasonably good, on the
+ *	   other hand, the optimizer might be too smart for me!
+ *
+ *         Try compiling with:
+ *               cc -O stream_d.c second_wall.c -o stream_d -lm
+ *
+ *         This is known to work on Cray, SGI, IBM, and Sun machines.
+ *
+ *
+ *	4) Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include:
+ *		a) computer hardware model number and software revision
+ *		b) the compiler flags
+ *		c) all of the output from the test case.
+ * Thanks!
+ *
+ */
+
+# define HLINE "-------------------------------------------------------------\n"
+
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))
+# endif
+
+static double	a[N+OFFSET],
+		b[N+OFFSET],
+		c[N+OFFSET];
+
+static double	rmstime[4] = {0}, maxtime[4] = {0},
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+
+static char	*label[4] = {"Copy:      ", "Scale:     ",
+    "Add:       ", "Triad:     "};
+
+static double	bytes[4] = {
+    2 * sizeof(double) * N,
+    2 * sizeof(double) * N,
+    3 * sizeof(double) * N,
+    3 * sizeof(double) * N
+    };
+
+extern double mysecond();
+
+/* Benchmark driver: runs the Copy, Scale, Add and Triad loops NTIMES   */
+/* times over a[], b[], c[] and prints rate and timing statistics.     */
+int main()
+    {
+    int			quantum, checktick();
+    int			BytesPerWord;
+    register int	j, k;
+    double		scalar, t, times[4][NTIMES];
+
+    /* --- SETUP --- determine precision and check timing --- */
+    printf(HLINE);
+    BytesPerWord = sizeof(double);
+    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
+	BytesPerWord);
+
+    printf(HLINE);
+    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
+    /* Evaluated in double so that a large N cannot overflow int. */
+    printf("Total memory required = %.1f MB.\n",
+	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
+    printf("Each test is run %d times, but only\n", NTIMES);
+    printf("the *best* time for each is used.\n");
+
+    /* Give every array element a known starting value. */
+    for (j=0; j<N; j++) {
+	a[j] = 1.0;
+	b[j] = 2.0;
+	c[j] = 0.0;
+	}
+
+    printf(HLINE);
+    if  ( (quantum = checktick()) >= 1) 
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+	quantum = 1;	/* keep the t/quantum estimate below finite */
+	}
+
+    /* Time one pass over a[] to estimate the cost of a single test. */
+    t = mysecond();
+    for (j = 0; j < N; j++)
+	a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);
+
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+    scalar = 3.0;
+    for (k=0; k<NTIMES; k++)
+	{
+	times[0][k] = mysecond();		/* Copy */
+	for (j=0; j<N; j++)
+	    c[j] = a[j];
+	times[0][k] = mysecond() - times[0][k];
+	
+	times[1][k] = mysecond();		/* Scale */
+	for (j=0; j<N; j++)
+	    b[j] = scalar*c[j];
+	times[1][k] = mysecond() - times[1][k];
+	
+	times[2][k] = mysecond();		/* Add */
+	for (j=0; j<N; j++)
+	    c[j] = a[j]+b[j];
+	times[2][k] = mysecond() - times[2][k];
+	
+	times[3][k] = mysecond();		/* Triad */
+	for (j=0; j<N; j++)
+	    a[j] = b[j]+scalar*c[j];
+	times[3][k] = mysecond() - times[3][k];
+	}
+    
+    /*	--- SUMMARY --- */
+    /* Accumulate sum of squares, minimum and maximum over all runs. */
+    for (k=0; k<NTIMES; k++)
+	{
+	for (j=0; j<4; j++)
+	    {
+	    rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+    
+    printf("Function      Rate (MB/s)   RMS time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+	rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);
+
+	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
+	       1.0E-06 * bytes[j]/mintime[j],
+	       rmstime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    return 0;
+}
+
+# define	M	20
+
+/* Estimate the timer granularity: collect M readings of mysecond() that */
+/* are each at least 1 microsecond later than a fresh starting reading, */
+/* and return the smallest gap between consecutive readings, in whole   */
+/* microseconds (negative gaps count as 0; result is at most 1000000).  */
+int
+checktick()
+    {
+    double	sample[M], start, now;
+    int		idx, gap, smallest;
+
+    for (idx = 0; idx < M; idx++) {
+	start = mysecond();
+	do {
+	    now = mysecond();
+	    } while ((now - start) < 1.0E-6);
+	sample[idx] = now;
+	}
+
+    smallest = 1000000;
+    for (idx = 1; idx < M; idx++) {
+	gap = (int)( 1.0E6 * (sample[idx]-sample[idx-1]));
+	if (gap < 0)
+	    gap = 0;
+	if (gap < smallest)
+	    smallest = gap;
+	}
+
+    return(smallest);
+    }
+
diff --git a/Versions/Old/stream_d.f b/Versions/Old/stream_d.f
new file mode 100644
index 0000000..cff1125
--- /dev/null
+++ b/Versions/Old/stream_d.f
@@ -0,0 +1,451 @@
+*=======================================================================
+* Program: STREAM
+* Programmer: John D. McCalpin
+* Revision: 5.3, January 10, 2003
+*-----------------------------------------------------------------------
+* Copyright 1991-2003: John D. McCalpin
+*-----------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "STREAM benchmark results",
+*         published results must be in conformance to the STREAM
+*         Run Rules, (briefly reviewed below) published at
+*         http://www.cs.virginia.edu/stream/ref.html
+*         and incorporated herein by reference.
+*         As the copyright holder, John McCalpin retains the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned STREAM benchmark results" 
+*         "based on a variant of the STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+*-----------------------------------------------------------------------
+* This program measures sustained memory transfer rates in MB/s for
+* simple computational kernels coded in FORTRAN.
+*
+* The intent is to demonstrate the extent to which ordinary user
+* code can exploit the main memory bandwidth of the system under
+* test.
+*=======================================================================
+* The STREAM web page is at:
+*          http://www.streambench.org
+*
+* Most of the content is currently hosted at:
+*          http://www.cs.virginia.edu/stream/
+*
+* BRIEF INSTRUCTIONS: 
+*       0) See http://www.cs.virginia.edu/stream/ref.html for details
+*       1) STREAM requires a timing function called mysecond().
+*          Several examples are provided in this directory.
+*          "CPU" timers are only allowed for uniprocessor runs.
+*          "Wall-clock" timers are required for all multiprocessor runs.
+*       2) The STREAM array sizes must be set to size the test.
+*          The value "N" must be chosen so that each of the three
+*          arrays is at least 4x larger than the sum of all the last-
+*          level caches used in the run, or 1 million elements, which-
+*          ever is larger.
+*          ------------------------------------------------------------
+*          Note that you are free to use any array length and offset
+*          that makes each array 4x larger than the last-level cache.
+*          The intent is to determine the *best* sustainable bandwidth
+*          available with this simple coding.  Of course, lower values
+*          are usually fairly easy to obtain on cached machines, but 
+*          by keeping the test to the *best* results, the answers are
+*          easier to interpret.
+*          You may put the arrays in common or not, at your discretion.
+*          There is a commented-out COMMON statement below.
+*          Fortran90 "allocatable" arrays are fine, too.
+*          ------------------------------------------------------------
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me
+*          Please let me know if this happens.
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*          Please let me know if you do not want your name posted along
+*          with the submitted results.
+*       5) See the web page for more comments about the run rules and
+*          about interpretation of the results.
+*
+* Thanks,
+*   Dr. Bandwidth
+*=========================================================================
+* Main program: times the Copy, Scale, Add and Triad loops and reports.
+      PROGRAM stream
+*     IMPLICIT NONE
+C     .. Parameters ..
+      INTEGER n,offset,ndim,ntimes
+      PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
+C     n = array length, ndim = n+offset, ntimes = repetitions
+C     .. Local Scalars ..
+      DOUBLE PRECISION scalar,t
+      INTEGER j,k,nbpw,quantum
+C     nbpw = bytes per word (realsize), quantum = clock tick (usec)
+C     .. Local Arrays ..
+      DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
+     $                 times(4,ntimes)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     times(i,k) = seconds taken by kernel i on repetition k
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      INTEGER checktick,realsize
+      EXTERNAL mysecond,checktick,realsize
+!$    INTEGER omp_get_num_threads
+!$    EXTERNAL omp_get_num_threads
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC dble,int,max,min,nint,sqrt
+C     ..
+C     .. Arrays in Common ..
+      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
+C     ..
+C     .. Common blocks ..
+*     COMMON a,b,c
+C     ..
+C     .. Data statements ..
+      DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Copy:      ','Scale:     ','Add:       ',
+     $     'Triad:     '/
+      DATA bytes/2,2,3,3/
+C     bytes(i) = arrays read plus written per element by kernel i
+
+*       --- SETUP --- determine precision and check timing ---
+
+      nbpw = realsize()
+
+      WRITE (*,FMT=9010) 'Array size = ',n
+      WRITE (*,FMT=9010) 'Offset     = ',offset
+      WRITE (*,FMT=9020) 'The total memory requirement is ',
+     $  int(3.0d0*dble(nbpw)*dble(n)/1048576.0d0),' MB'
+      WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
+      WRITE (*,FMT=9030) '--'
+      WRITE (*,FMT=9030) 'The *best* time for each test is used'
+      WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
+
+!$OMP PARALLEL
+!$    PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS()
+!$OMP END PARALLEL
+C     Initialize the arrays, then halve a (t is measured, not printed).
+!$OMP PARALLEL DO
+      DO 10 j = 1,n
+          a(j) = 2.0d0
+          b(j) = 0.5D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = mysecond()
+!$OMP PARALLEL DO
+      DO 20 j = 1,n
+          a(j) = 0.5d0*a(j)
+   20 CONTINUE
+      t = mysecond() - t
+      PRINT *,'----------------------------------------------------'
+      quantum = checktick()
+      WRITE (*,FMT=9000)
+     $  'Your clock granularity/precision appears to be ',quantum,
+     $  ' microseconds'
+      PRINT *,'----------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      scalar = 0.5d0*a(1)
+      DO 70 k = 1,ntimes
+C         Copy (a(1) and c(n) absorb timer values; see checksums)
+          t = mysecond()
+          a(1) = a(1) + t
+!$OMP PARALLEL DO
+          DO 30 j = 1,n
+              c(j) = a(j)
+   30     CONTINUE
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(1,k) = t
+C         Scale
+          t = mysecond()
+          c(1) = c(1) + t
+!$OMP PARALLEL DO
+          DO 40 j = 1,n
+              b(j) = scalar*c(j)
+   40     CONTINUE
+          t = mysecond() - t
+          b(n) = b(n) + t
+          times(2,k) = t
+C         Add
+          t = mysecond()
+          a(1) = a(1) + t
+!$OMP PARALLEL DO
+          DO 50 j = 1,n
+              c(j) = a(j) + b(j)
+   50     CONTINUE
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(3,k) = t
+C         Triad
+          t = mysecond()
+          b(1) = b(1) + t
+!$OMP PARALLEL DO
+          DO 60 j = 1,n
+              a(j) = b(j) + scalar*c(j)
+   60     CONTINUE
+          t = mysecond() - t
+          a(n) = a(n) + t
+          times(4,k) = t
+   70 CONTINUE
+C     Rates are formed in DOUBLE PRECISION so a large n cannot overflow.
+*       --- SUMMARY --- (repetitions 2..ntimes-1 only)
+      DO 90 k = 2,ntimes-1
+          DO 80 j = 1,4
+              avgtime(j) = avgtime(j) + times(j,k)
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   80     CONTINUE
+   90 CONTINUE
+      WRITE (*,FMT=9040)
+      DO 100 j = 1,4
+          avgtime(j) = avgtime(j)/dble(ntimes-2)
+          WRITE (*,FMT=9050) label(j),
+     $      dble(n)*bytes(j)*nbpw/mintime(j)/1.0D6,
+     $      avgtime(j),mintime(j),maxtime(j)
+  100 CONTINUE
+      PRINT *,'----------------------------------------------------'
+      CALL checksums (a,b,c,n,ntimes)
+      PRINT *,'----------------------------------------------------'
+
+ 9000 FORMAT (1x,a,i6,a)
+ 9010 FORMAT (1x,a,i10)
+ 9020 FORMAT (1x,a,i4,a)
+ 9030 FORMAT (1x,a,i3,a,a)
+ 9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
+     $       )
+ 9050 FORMAT (a,4 (f10.4,2x))
+      END
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLE PRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLE PRECISION
+* number occupies.
+* Returns 4 or 8.
+      INTEGER FUNCTION realsize()
+*     IMPLICIT NONE
+
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ..
+C     .. External Subroutines ..
+      EXTERNAL confuse
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC abs,acos,log10,sqrt
+C     ..
+
+C       Test #1 - find the first j with 1.0d0 + 10**(-j) equal to 1.0d0
+
+   10 DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0** (-j)
+   20 CONTINUE
+
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL confuse(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 40
+          END IF
+   30 CONTINUE
+      GO TO 50
+
+   40 WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLE PRECISION word'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      RETURN
+
+C     Detection failed: ask the user, and keep asking until the answer
+C     is 4 or 8 so that an invalid size can never be returned.
+   50 PRINT *,'Hmmmm.  I am unable to determine the size.'
+   60 PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense.'
+          PRINT *,'Try again.'
+          GO TO 60
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per DOUBLE PRECISION number'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      END
+
+      SUBROUTINE confuse(q,r)
+*     IMPLICIT NONE
+C     Stores cos(q) in r; q itself is not modified.  Called from
+C     realsize -- presumably so the compiler cannot fold away the
+C     comparison made there (not verifiable from this routine).
+      DOUBLE PRECISION q
+      DOUBLE PRECISION r
+      INTRINSIC cos
+      r = COS(q)
+      RETURN
+      END
+
+* A semi-portable way to determine the clock granularity
+* Adapted from a code by John Henning of Digital Equipment Corporation
+* Returns the estimated granularity in microseconds (at least 1).
+      INTEGER FUNCTION checktick()
+*     IMPLICIT NONE
+
+C     .. Parameters ..
+      INTEGER n
+      PARAMETER (n=20)
+C     n = number of distinct clock readings collected
+C     .. Local Scalars ..
+      DOUBLE PRECISION t1,t2
+      INTEGER i,j,jmin
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION timesfound(n)
+C     ..
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      EXTERNAL mysecond
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC max,min,nint
+C     Seed t1 so the first comparison below reads a defined value.
+      i = 0
+      t1 = mysecond()
+   10 t2 = mysecond()
+      IF (t2.EQ.t1) GO TO 10
+
+      t1 = t2
+      i = i + 1
+      timesfound(i) = t1
+      IF (i.LT.n) GO TO 10
+C     Smallest gap between consecutive readings, in microseconds.
+      jmin = 1000000
+      DO 20 i = 2,n
+          j = nint((timesfound(i)-timesfound(i-1))*1d6)
+          jmin = min(jmin,max(j,0))
+   20 CONTINUE
+
+      IF (jmin.GT.0) THEN
+          checktick = jmin
+      ELSE
+          PRINT *,'Your clock granularity appears to be less ',
+     $      'than one microsecond'
+          checktick = 1
+      END IF
+      RETURN
+
+*      PRINT 14, timesfound(1)*1d6
+*      DO 20 i=2,n
+*         PRINT 14, timesfound(i)*1d6,
+*     &       nint((timesfound(i)-timesfound(i-1))*1d6)
+*   14    FORMAT (1X, F18.4, 1X, i8)
+*   20 CONTINUE
+
+      END
+
+
+
+
+      SUBROUTINE checksums(a,b,c,n,ntimes)
+*     IMPLICIT NONE
+C     Validates a, b, c against a scalar replay of the timed loops.
+C     .. Arguments ..
+      DOUBLE PRECISION a(*),b(*),c(*)
+      INTEGER n,ntimes
+C     a,b,c = arrays to check (elements 2..n-1), ntimes = repetitions
+C     .. Local Scalars ..
+      DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon
+      INTEGER j,k
+C     j is declared so that IMPLICIT NONE above can be enabled.
+
+C     Repeat the main loop, but with scalars only.
+C     This is done to check the sum & make sure all
+C     iterations have been executed correctly.
+
+      aa = 2.0D0
+      bb = 0.5D0
+      cc = 0.0D0
+      aa = 0.5D0*aa
+      scalar = 0.5d0*aa
+      DO k = 1,ntimes
+          cc = aa
+          bb = scalar*cc
+          cc = aa + bb
+          aa = bb + scalar*cc
+      END DO
+      aa = aa*DBLE(n-2)
+      bb = bb*DBLE(n-2)
+      cc = cc*DBLE(n-2)
+
+C     Now sum up the arrays, excluding the first and last
+C     elements, which are modified using the timing results
+C     to confuse aggressive optimizers.
+
+      suma = 0.0d0
+      sumb = 0.0d0
+      sumc = 0.0d0
+!$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
+      DO 110 j = 2,n-1
+          suma = suma + a(j)
+          sumb = sumb + b(j)
+          sumc = sumc + c(j)
+  110 CONTINUE
+
+      epsilon = 1.D-6
+C     Relative-error test; only the first failing array is reported.
+      IF (ABS(suma-aa)/suma .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array a()'
+          PRINT *,'Target   Sum of a is = ',aa
+          PRINT *,'Computed Sum of a is = ',suma
+      ELSEIF (ABS(sumb-bb)/sumb .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array b()'
+          PRINT *,'Target   Sum of b is = ',bb
+          PRINT *,'Computed Sum of b is = ',sumb
+      ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array c()'
+          PRINT *,'Target   Sum of c is = ',cc
+          PRINT *,'Computed Sum of c is = ',sumc
+      ELSE
+          PRINT *,'Solution Validates!'
+      ENDIF
+
+      END
+
diff --git a/Versions/Old/stream_omp.c b/Versions/Old/stream_omp.c
new file mode 100644
index 0000000..c1f34c1
--- /dev/null
+++ b/Versions/Old/stream_omp.c
@@ -0,0 +1,402 @@
+/*-----------------------------------------------------------------------*/
+/* Program: Stream                                                       */
+/* Revision: $Id: stream_omp.c,v 5.4 2009/02/19 13:57:12 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2003: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*         "tuned STREAM benchmark results"                              */
+/*         "based on a variant of the STREAM benchmark code"             */
+/*         Other comparable, clear and reasonable labelling is           */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/* INSTRUCTIONS:
+ *
+ *	1) Stream requires a good bit of memory to run.  Adjust the
+ *          value of 'N' (below) to give a 'timing calibration' of 
+ *          at least 20 clock-ticks.  This will provide rate estimates
+ *          that should be good to about 5% precision.
+ */
+
+# define N	2000000
+# define NTIMES	10
+# define OFFSET	0
+
+/*
+ *	3) Compile the code with full optimization.  Many compilers
+ *	   generate unreasonably bad code before the optimizer tightens
+ *	   things up.  If the results are unreasonably good, on the
+ *	   other hand, the optimizer might be too smart for me!
+ *
+ *         Try compiling with:
+ *               cc -O stream_omp.c -o stream_omp
+ *
+ *         This is known to work on Cray, SGI, IBM, and Sun machines.
+ *
+ *
+ *	4) Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include:
+ *		a) computer hardware model number and software revision
+ *		b) the compiler flags
+ *		c) all of the output from the test case.
+ * Thanks!
+ *
+ */
+
+# define HLINE "-------------------------------------------------------------\n"
+
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))
+# endif
+
+static double	a[N+OFFSET],
+		b[N+OFFSET],
+		c[N+OFFSET];
+
+static double	avgtime[4] = {0}, maxtime[4] = {0},
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+
+static char	*label[4] = {"Copy:      ", "Scale:     ",
+    "Add:       ", "Triad:     "};
+
+static double	bytes[4] = {
+    2 * sizeof(double) * N,
+    2 * sizeof(double) * N,
+    3 * sizeof(double) * N,
+    3 * sizeof(double) * N
+    };
+
+extern double mysecond();
+extern void checkSTREAMresults();
+#ifdef TUNED
+extern void tuned_STREAM_Copy();
+extern void tuned_STREAM_Scale(double scalar);
+extern void tuned_STREAM_Add();
+extern void tuned_STREAM_Triad(double scalar);
+#endif
+/* Benchmark driver (OpenMP build): runs Copy, Scale, Add and Triad     */
+/* NTIMES times, prints rate and timing statistics, then validates.     */
+int main()
+    {
+    int			quantum, checktick();
+    int			BytesPerWord;
+    register int	j, k;
+    double		scalar, t, times[4][NTIMES];
+
+    /* --- SETUP --- determine precision and check timing --- */
+    printf(HLINE);
+    BytesPerWord = sizeof(double);
+    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
+	BytesPerWord);
+
+    printf(HLINE);
+    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
+    printf("Total memory required = %.1f MB.\n",
+	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
+    printf("Each test is run %d times, but only\n", NTIMES);
+    printf("the *best* time for each is used.\n");
+
+#ifdef _OPENMP
+    printf(HLINE);
+#pragma omp parallel private(k)
+    {
+    extern int omp_get_num_threads(void); /* file does not include omp.h */
+    k = omp_get_num_threads();
+    printf ("Number of Threads requested = %i\n",k);
+    }
+#endif
+
+    /* Give every array element a known starting value. */
+#pragma omp parallel for
+    for (j=0; j<N; j++) {
+	a[j] = 1.0;
+	b[j] = 2.0;
+	c[j] = 0.0;
+	}
+
+    printf(HLINE);
+    if  ( (quantum = checktick()) >= 1) 
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+	quantum = 1;	/* keep the t/quantum estimate below finite */
+	}
+
+    t = mysecond();
+#pragma omp parallel for
+    for (j = 0; j < N; j++)
+	a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);
+
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+    scalar = 3.0;
+    for (k=0; k<NTIMES; k++)
+	{
+	times[0][k] = mysecond();		/* Copy */
+#ifdef TUNED
+        tuned_STREAM_Copy();
+#else
+#pragma omp parallel for
+	for (j=0; j<N; j++)
+	    c[j] = a[j];
+#endif
+	times[0][k] = mysecond() - times[0][k];
+	
+	times[1][k] = mysecond();		/* Scale */
+#ifdef TUNED
+        tuned_STREAM_Scale(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<N; j++)
+	    b[j] = scalar*c[j];
+#endif
+	times[1][k] = mysecond() - times[1][k];
+	
+	times[2][k] = mysecond();		/* Add */
+#ifdef TUNED
+        tuned_STREAM_Add();
+#else
+#pragma omp parallel for
+	for (j=0; j<N; j++)
+	    c[j] = a[j]+b[j];
+#endif
+	times[2][k] = mysecond() - times[2][k];
+	
+	times[3][k] = mysecond();		/* Triad */
+#ifdef TUNED
+        tuned_STREAM_Triad(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<N; j++)
+	    a[j] = b[j]+scalar*c[j];
+#endif
+	times[3][k] = mysecond() - times[3][k];
+	}
+
+    /*	--- SUMMARY --- */
+    /* Accumulate total, minimum and maximum time for each kernel. */
+    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+	{
+	for (j=0; j<4; j++)
+	    {
+	    avgtime[j] = avgtime[j] + times[j][k];
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+    
+    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+	avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+
+	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
+	       1.0E-06 * bytes[j]/mintime[j],
+	       avgtime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    printf(HLINE);
+
+    /* --- Check Results --- */
+    checkSTREAMresults();
+    printf(HLINE);
+
+    return 0;
+}
+
+# define	M	20
+
+/* Estimate the timer granularity: collect M readings of mysecond() that */
+/* are each at least 1 microsecond later than a fresh starting reading, */
+/* and return the smallest gap between consecutive readings, in whole   */
+/* microseconds (negative gaps count as 0; result is at most 1000000).  */
+int
+checktick()
+    {
+    double	sample[M], start, now;
+    int		idx, gap, smallest;
+
+    for (idx = 0; idx < M; idx++) {
+	start = mysecond();
+	do {
+	    now = mysecond();
+	    } while ((now - start) < 1.0E-6);
+	sample[idx] = now;
+	}
+
+    smallest = 1000000;
+    for (idx = 1; idx < M; idx++) {
+	gap = (int)( 1.0E6 * (sample[idx]-sample[idx-1]));
+	if (gap < 0)
+	    gap = 0;
+	if (gap < smallest)
+	    smallest = gap;
+	}
+
+    return(smallest);
+    }
+
+
+
+/* Wall-clock timer built on gettimeofday(), available on most UNIX-like
+   systems.  Returns the current time of day in seconds as a double,
+   with the microsecond field folded in as the fractional part.  */
+#include <sys/time.h>
+
+double mysecond()
+{
+        struct timeval tp;
+
+        /* POSIX leaves a non-null timezone argument unspecified, so a
+           null pointer is passed; the status is deliberately ignored. */
+        (void) gettimeofday(&tp, NULL);
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+/* Validate the benchmark output: recompute with scalars the value every
+   array element should hold after main()'s setup and NTIMES kernel
+   passes, multiply by N, and compare with the actual sums of a[], b[]
+   and c[].  Prints the verdict; returns nothing.  */
+void checkSTREAMresults ()
+{
+	/* Scalar replay of what each element of a[], b[], c[] should hold. */
+	double expect_a = 1.0, expect_b = 2.0, expect_c = 0.0;
+	double total_a = 0.0, total_b = 0.0, total_c = 0.0;
+	double factor = 3.0, tolerance;
+	int	idx, rep;
+
+	/* main() doubles a[] once while estimating the test duration */
+	expect_a = 2.0E0 * expect_a;
+	/* replay the Copy, Scale, Add, Triad sequence NTIMES times */
+	for (rep = 0; rep < NTIMES; rep++) {
+		expect_c = expect_a;
+		expect_b = factor*expect_c;
+		expect_c = expect_a+expect_b;
+		expect_a = expect_b+factor*expect_c;
+	}
+	/* scale to the sum expected over all N elements */
+	expect_a *= (double) (N);
+	expect_b *= (double) (N);
+	expect_c *= (double) (N);
+
+	/* observed sums */
+	for (idx = 0; idx < N; idx++) {
+		total_a += a[idx];
+		total_b += b[idx];
+		total_c += c[idx];
+	}
+#ifdef VERBOSE
+	printf ("Results Comparison: \n");
+	printf ("        Expected  : %f %f %f \n",expect_a,expect_b,expect_c);
+	printf ("        Observed  : %f %f %f \n",total_a,total_b,total_c);
+#endif
+
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+	tolerance = 1.e-8;
+
+	/* Relative-error test; only the first failing array is reported. */
+	if (abs(expect_a-total_a)/total_a > tolerance) {
+		printf ("Failed Validation on array a[]\n");
+		printf ("        Expected  : %f \n",expect_a);
+		printf ("        Observed  : %f \n",total_a);
+		return;
+	}
+	if (abs(expect_b-total_b)/total_b > tolerance) {
+		printf ("Failed Validation on array b[]\n");
+		printf ("        Expected  : %f \n",expect_b);
+		printf ("        Observed  : %f \n",total_b);
+		return;
+	}
+	if (abs(expect_c-total_c)/total_c > tolerance) {
+		printf ("Failed Validation on array c[]\n");
+		printf ("        Expected  : %f \n",expect_c);
+		printf ("        Observed  : %f \n",total_c);
+		return;
+	}
+	printf ("Solution Validates\n");
+}
+/* Copy kernel: c[i] = a[i] for every i in [0, N); OpenMP-parallel loop. */
+void tuned_STREAM_Copy()
+{
+	int i;
+#pragma omp parallel for
+	for (i = 0; i < N; ++i)
+	    c[i] = a[i];
+}
+/* Scale kernel: b[i] = scalar * c[i] for every i in [0, N). */
+void tuned_STREAM_Scale(double scalar)
+{
+	int i;
+#pragma omp parallel for
+	for (i = 0; i < N; ++i)
+	    b[i] = scalar*c[i];
+}
+/* Add kernel: c[i] = a[i] + b[i] for every i in [0, N). */
+void tuned_STREAM_Add()
+{
+	int i;
+#pragma omp parallel for
+	for (i = 0; i < N; ++i)
+	    c[i] = a[i]+b[i];
+}
+/* Triad kernel: a[i] = b[i] + scalar * c[i] for every i in [0, N). */
+void tuned_STREAM_Triad(double scalar)
+{
+	int i;
+#pragma omp parallel for
+	for (i = 0; i < N; ++i)
+	    a[i] = b[i]+scalar*c[i];
+}
diff --git a/Versions/Old/stream_tuned.f b/Versions/Old/stream_tuned.f
new file mode 100644
index 0000000..5c51d4a
--- /dev/null
+++ b/Versions/Old/stream_tuned.f
@@ -0,0 +1,485 @@
+*=========================================================================
+* Program: STREAM_TUNED
+* Programmer: John D. McCalpin
+* Revision: 1.0, November 6, 2002
+*-----------------------------------------------------------------------
+* Copyright 1991-2003: John D. McCalpin
+*-----------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "STREAM benchmark results",
+*         published results must be in conformance to the STREAM
+*         Run Rules, (briefly reviewed below) published at
+*         http://www.cs.virginia.edu/stream/ref.html
+*         and incorporated herein by reference.
+*         As the copyright holder, John McCalpin retains the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned STREAM benchmark results"
+*         "based on a variant of the STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+*-----------------------------------------------------------------------
+* STREAM measures memory transfer rates in MB/s for simple
+* computational kernels coded in Fortran.  
+*
+* The intent is to demonstrate the extent to which ordinary user
+* code can exploit the main memory bandwidth of the system under
+* test.
+*
+* This version is a simple harness to allow code optimization
+* in the context of the data flow and result checking of the
+* basic STREAM version 5.0 code.   Each of the four kernel loops
+* has been moved to a separate subroutine to allow easy code 
+* optimization or replacement.
+*
+*=========================================================================
+* THIS IS JUST A STARTING POINT --- IT HAS NOT BEEN OPTIMIZED YET!!!
+*=========================================================================
+* The STREAM web page is at:
+*          http://www.streambench.org
+*
+* Most of the content is currently hosted at:
+*          http://www.cs.virginia.edu/stream/
+*
+* BRIEF INSTRUCTIONS: 
+*       0) See http://www.cs.virginia.edu/stream/ref.html for details
+*       1) STREAM requires a timing function called mysecond().
+*          Several examples are provided in this directory.
+*          "CPU" timers are only allowed for uniprocessor runs.
+*          "Wall-clock" timers are required for all multiprocessor runs.
+*       2) The STREAM array sizes must be set to size the test.
+*          The value "N" must be chosen so that each of the three
+*          arrays is at least 4x larger than the sum of all the last-
+*          level caches used in the run, or 1 million elements, which-
+*          ever is larger.
+*          ------------------------------------------------------------
+*          Note that you are free to use any array length and offset
+*          that makes each array 4x larger than the last-level cache.
+*          The intent is to determine the *best* sustainable bandwidth
+*          available with this simple coding.  Of course, lower values
+*          are usually fairly easy to obtain on cached machines, but 
+*          by keeping the test to the *best* results, the answers are
+*          easier to interpret.
+*          You may put the arrays in common or not, at your discretion.
+*          There is a commented-out COMMON statement below.
+*          Fortran90 "allocatable" arrays are fine, too.
+*          ------------------------------------------------------------
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me.
+*          Please let me know if this happens.
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*          Please let me know if you do not want your name posted along
+*          with the submitted results.
+*       5) See the web page for more comments about the run rules and
+*          about interpretation of the results.
+*
+* Thanks,
+*   Dr. Bandwidth
+*=========================================================================
+*
+      PROGRAM stream
+*     IMPLICIT NONE
+C     Driver: times the four kernel subroutines, prints a summary.
+      INTEGER n,offset,ndim,ntimes
+      PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
+C     n = elements per array, ndim = n+offset, ntimes = repetitions.
+C     .. Local Scalars ..
+      DOUBLE PRECISION scalar,t
+      INTEGER j,k,nbpw,quantum
+C     nbpw = result of realsize(); quantum = result of checktick().
+C     .. Local Arrays ..
+      DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
+     $                 times(4,ntimes)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     times(j,k) = elapsed time of kernel j (order as in label), pass k.
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      INTEGER checktick,realsize
+      EXTERNAL mysecond,checktick,realsize
+C     ..
+C     .. Intrinsic Functions ..
+C
+      INTRINSIC dble,max,min,nint,sqrt
+C     ..
+C     .. Work arrays (not in COMMON while the statement below is off) ..
+      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
+C     ..
+C     .. Common blocks ..
+*     COMMON a,b,c
+C     ..
+C     .. Data statements ..
+      DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Copy:      ','Scale:     ','Add:       ',
+     $     'Triad:     '/
+      DATA bytes/2,2,3,3/
+C     bytes(j) is multiplied by n*nbpw to get the traffic of kernel j.
+
+*       --- SETUP --- determine precision and check timing ---
+
+      nbpw = realsize()
+
+      WRITE (*,FMT=9010) 'Array size = ',n
+      WRITE (*,FMT=9010) 'Offset     = ',offset
+      WRITE (*,FMT=9020) 'The total memory requirement is ',
+     $  3*nbpw*n/ (1024*1024),' MB'
+      WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
+      WRITE (*,FMT=9030) '--'
+      WRITE (*,FMT=9030) 'The *best* time for each test is used'
+      WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
+C     Fill the arrays, then halve a() in a timed pass (t is not used).
+!$OMP PARALLEL DO
+      DO 10 j = 1,n
+          a(j) = 2.0d0
+          b(j) = 0.5D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = mysecond()
+!$OMP PARALLEL DO
+      DO 20 j = 1,n
+          a(j) = 0.5d0*a(j)
+   20 CONTINUE
+      t = mysecond() - t
+      PRINT *,'----------------------------------------------------'
+      quantum = checktick()
+      WRITE (*,FMT=9000)
+     $  'Your clock granularity/precision appears to be ',quantum,
+     $  ' microseconds'
+      PRINT *,'----------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      scalar = 0.5d0*a(1)
+      DO 70 k = 1,ntimes
+C         Copy (timer values are folded into elements 1 and n).
+          t = mysecond()
+          a(1) = a(1) + t
+          call stream_copy (c, a, n)
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(1,k) = t
+C         Scale
+          t = mysecond()
+          c(1) = c(1) + t
+          call stream_scale (b, c, scalar, n)
+          t = mysecond() - t
+          b(n) = b(n) + t
+          times(2,k) = t
+C         Add
+          t = mysecond()
+          a(1) = a(1) + t
+          call stream_add (c, a, b, n)
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(3,k) = t
+C         Triad
+          t = mysecond()
+          b(1) = b(1) + t
+          call stream_triad (a, b, c, scalar, n)
+          t = mysecond() - t
+          a(n) = a(n) + t
+          times(4,k) = t
+   70 CONTINUE
+
+*       --- SUMMARY --- passes 1 and ntimes are left out of the stats ---
+      DO 90 k = 2,ntimes-1
+          DO 80 j = 1,4
+              avgtime(j) = avgtime(j) + times(j,k)
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   80     CONTINUE
+   90 CONTINUE
+      WRITE (*,FMT=9040)
+      DO 100 j = 1,4
+          avgtime(j) = avgtime(j)/dble(ntimes-2)
+          WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6,
+     $      avgtime(j),mintime(j),maxtime(j)
+  100 CONTINUE
+      PRINT *,'----------------------------------------------------'
+      CALL checksums (a,b,c,n,ntimes)
+      PRINT *,'----------------------------------------------------'
+
+ 9000 FORMAT (1x,a,i6,a)
+ 9010 FORMAT (1x,a,i10)
+ 9020 FORMAT (1x,a,i4,a)
+ 9030 FORMAT (1x,a,i3,a,a)
+ 9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
+     $       )
+ 9050 FORMAT (a,4 (f10.4,2x))
+      END
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLE PRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLE PRECISION
+* number occupies.
+*
+      INTEGER FUNCTION realsize()
+*     IMPLICIT NONE
+C     Guess how many bytes a DOUBLE PRECISION number occupies (4 or 8).
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ..
+C     .. External Subroutines ..
+      EXTERNAL confuse
+C     ..
+C     ndigits is the first j in 1..30 for which 1 + 10**(-j) compares
+C     equal to 1; at most 8 digits is taken to mean 4 bytes, else 8.
+C     If no such j exists the user is asked for the size instead.
+
+C     Build the reference values 1 + 10**(-j).
+
+      DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0** (-j)
+   20 CONTINUE
+
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL confuse(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 40
+          END IF
+   30 CONTINUE
+      GO TO 50
+
+   40 WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLE PRECISION word'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      RETURN
+
+   50 PRINT *,'Hmmmm.  I am unable to determine the size.'
+C     Keep asking until the answer is 4 or 8, so that an invalid size
+C     is never returned to the caller.
+   60 PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense.'
+          PRINT *,'Try again.'
+          GO TO 60
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per DOUBLE PRECISION number'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      END
+
+      SUBROUTINE confuse(q,r)
+*     IMPLICIT NONE
+C     Store the cosine of q in r; q itself is not modified.
+C     .. Scalar Arguments ..
+      DOUBLE PRECISION r
+      DOUBLE PRECISION q
+C     .. Intrinsic Functions ..
+      INTRINSIC cos
+C     ..
+      r = cos(q)
+      END
+
+* A semi-portable way to determine the clock granularity
+* Adapted from a code by John Henning of Digital Equipment Corporation
+*
+      INTEGER FUNCTION checktick()
+*     IMPLICIT NONE
+C     Estimate the granularity of mysecond() in whole microseconds:
+C     take n successive distinct readings and return the smallest gap
+C     between neighbours (1 if that gap rounds to zero or is negative).
+C     .. Parameters ..
+      INTEGER n
+      PARAMETER (n=20)
+C     .. Local Scalars ..
+      DOUBLE PRECISION t1,t2
+      INTEGER i,j,jmin
+C     .. Local Arrays ..
+      DOUBLE PRECISION timesfound(n)
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      EXTERNAL mysecond
+C     .. Intrinsic Functions ..
+      INTRINSIC max,min,nint
+C     ..
+
+C     Spin on the timer until its value changes, and record each new
+C     value; repeat until n readings have been stored.
+      t1 = mysecond()
+      DO 15 i = 1,n
+   10     t2 = mysecond()
+          IF (t2.EQ.t1) GO TO 10
+          t1 = t2
+          timesfound(i) = t1
+   15 CONTINUE
+
+C     Smallest gap in microseconds.  Negative gaps count as zero and
+C     the starting value 1000000 is an upper bound on the result.
+      jmin = 1000000
+      DO 20 i = 2,n
+          j = nint((timesfound(i)-timesfound(i-1))*1d6)
+          jmin = min(jmin,max(j,0))
+   20 CONTINUE
+
+C     jmin is never negative here, so .LE.0 means "rounded to zero".
+      IF (jmin.LE.0) THEN
+          PRINT *,'Your clock granularity appears to be less ',
+     $      'than one microsecond'
+          checktick = 1
+      ELSE
+          checktick = jmin
+      END IF
+
+C     Debugging aid (disabled): print the raw readings and the gaps.
+*      PRINT 14, timesfound(1)*1d6
+*      DO 20 i=2,n
+*         PRINT 14, timesfound(i)*1d6,
+*     &       nint((timesfound(i)-timesfound(i-1))*1d6)
+*   14    FORMAT (1X, F18.4, 1X, i8)
+*   20 CONTINUE
+      END
+
+
+
+
+      SUBROUTINE checksums(a,b,c,n,ntimes)
+*     IMPLICIT NONE
+C     Check a(), b() and c() against values recomputed with scalars,
+C     and print either the first failing array or a success line.
+      DOUBLE PRECISION a(*),b(*),c(*)
+      INTEGER n,ntimes
+C     n = array length; ntimes = number of kernel passes to replay.
+C     .. Local Scalars ..
+      DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon
+      INTEGER j,k
+C     ..
+
+C     Repeat the main loop, but with scalars only.
+C     This is done to check the sum & make sure all
+C     iterations have been executed correctly.
+
+      aa = 2.0D0
+      bb = 0.5D0
+      cc = 0.0D0
+      aa = 0.5D0*aa
+      scalar = 0.5d0*aa
+      DO k = 1,ntimes
+          cc = aa
+          bb = scalar*cc
+          cc = aa + bb
+          aa = bb + scalar*cc
+      END DO
+      aa = aa*DBLE(n-2)
+      bb = bb*DBLE(n-2)
+      cc = cc*DBLE(n-2)
+
+C     Now sum up the arrays, excluding the first and last
+C     elements, which are modified using the timing results
+C     to confuse aggressive optimizers.
+
+      suma = 0.0d0
+      sumb = 0.0d0
+      sumc = 0.0d0
+!$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
+      DO 110 j = 2,n-1
+          suma = suma + a(j)
+          sumb = sumb + b(j)
+          sumc = sumc + c(j)
+  110 CONTINUE
+
+      epsilon = 1.D-6
+C     .NOT.(x.LE.eps) also fails on NaN; ABS covers a negative sum.
+      IF (.NOT. (ABS((suma-aa)/suma).LE.epsilon)) THEN
+          PRINT *,'Failed Validation on array a()'
+          PRINT *,'Target   Sum of a is = ',aa
+          PRINT *,'Computed Sum of a is = ',suma
+      ELSEIF (.NOT. (ABS((sumb-bb)/sumb).LE.epsilon)) THEN
+          PRINT *,'Failed Validation on array b()'
+          PRINT *,'Target   Sum of b is = ',bb
+          PRINT *,'Computed Sum of b is = ',sumb
+      ELSEIF (.NOT. (ABS((sumc-cc)/sumc).LE.epsilon)) THEN
+          PRINT *,'Failed Validation on array c()'
+          PRINT *,'Target   Sum of c is = ',cc
+          PRINT *,'Computed Sum of c is = ',sumc
+      ELSE
+          PRINT *,'Solution Validates!'
+      ENDIF
+
+      END
+
+
+*=========================================================================
+* This version is a simple harness to allow code optimization
+* in the context of the data flow and result checking of the
+* basic STREAM version 5.0 code.   Each of the four kernel loops
+* has been moved to a separate subroutine to allow easy code 
+* optimization or replacement.
+*=========================================================================
+* THESE ARE JUST STARTING POINTS --- THEY HAVE NOT BEEN OPTIMIZED YET!!!
+*=========================================================================
+
+      SUBROUTINE stream_copy (c, a, n)   ! c(j) = a(j) for j = 1..n
+      DOUBLE PRECISION c(*), a(*)        ! same type as the caller
+!$OMP PARALLEL DO
+      DO j = 1,n
+          c(j) = a(j)
+      END DO
+      END
+
+      SUBROUTINE stream_scale (b, c, scalar, n)   ! b(j) = scalar*c(j)
+      DOUBLE PRECISION b(*), c(*), scalar         ! as in the caller
+!$OMP PARALLEL DO
+      DO j = 1,n
+          b(j) = scalar*c(j)
+      END DO
+      END
+
+      SUBROUTINE stream_add (c, a, b, n)   ! c(j) = a(j) + b(j)
+      DOUBLE PRECISION c(*), a(*), b(*)    ! same type as the caller
+!$OMP PARALLEL DO
+      DO j = 1,n
+          c(j) = a(j) + b(j)
+      END DO
+      END
+
+      SUBROUTINE stream_triad (a, b, c, scalar, n)  ! a = b + scalar*c
+      DOUBLE PRECISION a(*), b(*), c(*), scalar     ! as in the caller
+!$OMP PARALLEL DO
+      DO j = 1,n
+          a(j) = b(j) + scalar*c(j)
+      END DO
+      END
diff --git a/Versions/Old/stream_wall.f b/Versions/Old/stream_wall.f
new file mode 100644
index 0000000..4d6434f
--- /dev/null
+++ b/Versions/Old/stream_wall.f
@@ -0,0 +1,223 @@
+* Program: Stream
+* Programmer: John D. McCalpin
+* Revision: 3.0, August 2, 1994
+*
+* This program measures memory transfer rates in MB/s for simple
+* computational kernels coded in Fortran.  These numbers reveal the
+* quality of code generation for simple uncacheable kernels as well
+* as showing the cost of floating-point operations relative to memory
+* accesses.
+*
+* INSTRUCTIONS:
+*       1) Stream requires a cpu timing function called myclock().
+*          A sample is in myclock.c.  This is unfortunately rather
+*          system dependent.  It helps to know the granularity of the
+*          timing.  The code below assumes that the granularity is
+*          1/100 seconds.
+*       2) Stream requires a good bit of memory to run.
+*          Adjust the Parameter 'N' in the second line of the main
+*          program to give a 'timing calibration' of at least 20 clicks.
+*          This will provide rate estimates that should be good to
+*          about 5% precision.
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me!
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*
+* Thanks!
+*
+      PROGRAM stream
+C     Times four vector loops ntimes times each and prints the rates.
+      INTEGER n,ntimes
+      PARAMETER (n=2 000 000,ntimes=10)
+C     n = elements per array, ntimes = number of timed repetitions.
+C     .. Local Scalars ..
+      DOUBLE PRECISION t,t0
+      INTEGER j,k,nbpw
+C     NOTE(review): t0 is passed to myclock but never assigned here.
+C     .. Local Arrays ..
+      DOUBLE PRECISION a(n),b(n),c(n),maxtime(4),mintime(4),rmstime(4),
+     $                 times(4,ntimes),sum(3)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     times(j,k) = elapsed time of loop j on pass k; sum = array totals.
+C     .. External Functions ..
+      DOUBLE PRECISION myclock
+      INTEGER realsize
+      EXTERNAL myclock,realsize
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC dble,max,min,sqrt
+C     ..
+C     .. Data statements ..
+      DATA rmstime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Assignment:','Scaling   :','Summing   :',
+     $     'SAXPYing  :'/
+      DATA bytes/2,2,3,3/
+C     bytes(j) is multiplied by n*nbpw to get the traffic of loop j.
+
+*       --- SETUP --- determine precision and check timing ---
+
+      nbpw = realsize()
+C     The first myclock result is overwritten; only the second is used.
+      t = myclock(t0)
+      t = myclock(t0)
+      DO 10 j = 1,n
+          a(j) = 1.0D0
+          b(j) = 2.0D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = myclock(t0) - t
+      PRINT *,'Timing calibration ; time = ',t*100,' hundredths',
+     $  ' of a second'
+      PRINT *,'Increase the size of the arrays if this is <30 ',
+     $  ' and your clock precision is =<1/100 second'
+      PRINT *,'---------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      DO 60 k = 1,ntimes
+C         Assignment: c = a
+          t = myclock(t0)
+          DO 20 j = 1,n
+              c(j) = a(j)
+   20     CONTINUE
+          t = myclock(t0) - t
+          times(1,k) = t
+C         Scaling: b = 3*c
+          t = myclock(t0)
+          DO 30 j = 1,n
+              b(j) = 3.0D0*c(j)
+   30     CONTINUE
+          t = myclock(t0) - t
+          times(2,k) = t
+C         Summing: c = a + b
+          t = myclock(t0)
+          DO 40 j = 1,n
+              c(j) = a(j) + b(j)
+   40     CONTINUE
+          t = myclock(t0) - t
+          times(3,k) = t
+C         SAXPYing: a = b + 3*c
+          t = myclock(t0)
+          DO 50 j = 1,n
+              a(j) = b(j) + 3.0D0*c(j)
+   50     CONTINUE
+          t = myclock(t0) - t
+          times(4,k) = t
+   60 CONTINUE
+
+*       --- SUMMARY --- over all ntimes passes; rate uses the min time ---
+      DO 80 k = 1,ntimes
+          DO 70 j = 1,4
+              rmstime(j) = rmstime(j) + times(j,k)**2
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   70     CONTINUE
+   80 CONTINUE
+      WRITE (*,FMT=9000)
+      DO 90 j = 1,4
+          rmstime(j) = sqrt(rmstime(j)/dble(ntimes))
+          WRITE (*,FMT=9010) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6,
+     $      rmstime(j),mintime(j),maxtime(j)
+   90 CONTINUE
+      sum(1) = 0.0
+      sum(2) = 0.0
+      sum(3) = 0.0
+      DO 100 j=1,n
+         sum(1) = sum(1) + a(j)
+         sum(2) = sum(2) + b(j)
+         sum(3) = sum(3) + c(j)
+  100 CONTINUE
+      PRINT *,'Sum of a is : ',sum(1)
+      PRINT *,'Sum of b is : ',sum(2)
+      PRINT *,'Sum of c is : ',sum(3)
+
+ 9000 FORMAT ('Function',5x,'Rate (MB/s)  RMS time   Min time  Max time'
+     $       )
+ 9010 FORMAT (a,4 (f10.4,2x))
+      END
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLEPRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLEPRECISION 
+* number occupies.
+*
+      INTEGER FUNCTION realsize()
+C     Guess how many bytes a DOUBLE PRECISION number occupies (4 or 8).
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ..
+C     .. External Subroutines ..
+      EXTERNAL dummy
+C     ..
+C     ndigits is the first j in 1..30 for which 1 + 10**(-j) compares
+C     equal to 1; at most 8 digits is taken to mean 4 bytes, else 8.
+C     If no such j exists the user is asked for the size instead.
+
+C     Build the reference values 1 + 10**(-j).
+
+      DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0**(-j)
+   20 CONTINUE
+
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL dummy(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 50
+          END IF
+   30 CONTINUE
+      GOTO 60
+
+   50 WRITE (*,FMT='(a)') '--------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLEPRECISION word'
+      WRITE (*,FMT='(a)') '--------------------------------------'
+      RETURN
+
+   60 PRINT *,'Hmmmm.  I am unable to determine the size of a REAL'
+C     Keep asking until the answer is 4 or 8, so that an invalid size
+C     is never returned to the caller.
+   70 PRINT *,'Please enter the number of Bytes per DOUBLEPRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense!'
+          PRINT *,'Try again!'
+          GO TO 70
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per REAL number'
+      WRITE (*,FMT='(a)') '--------------------------------------'
+      END
+
+      SUBROUTINE dummy(q,r)
+C     Store the cosine of q in r; q itself is not modified.
+C     .. Scalar Arguments ..
+      DOUBLE PRECISION r
+      DOUBLE PRECISION q
+C     .. Intrinsic Functions ..
+      INTRINSIC cos
+C     ..
+      r = cos(q)
+      END
diff --git a/Versions/README b/Versions/README
new file mode 100644
index 0000000..c731db0
--- /dev/null
+++ b/Versions/README
@@ -0,0 +1,124 @@
+===============================================
+STREAM is the de facto industry standard benchmark
+   for measuring sustained memory bandwidth.
+Documentation for STREAM is on the web at:
+   http://www.cs.virginia.edu/stream/ref.html
+===============================================
+===============================================
+This "Versions" directory holds several variants of
+STREAM that are sometimes useful.
+===============================================
+===============================================
+Potentially Useful Versions:
+    2014-Oct-21  stream_mpi.c
+    2014-Oct-21  stream_5-10_posix_memalign.c
+    2014-Feb-14  stream_mpi.f
+    2012-Aug-14  stream_windows.c
+
+All older versions have been moved to the "Old"
+directory and should not be used.
+    2009-Mar-14  stream_omp.c
+    2004-Dec-12  stream_tuned.f
+    2003-May-27  stream_d.f
+    2003-Apr-08  second_cpu.f
+    2003-Apr-08  second_cpu.c
+    2003-Jan-10  stream_d.c
+===============================================
+NEWS AND NOTES
+===============================================
+UPDATE: October 28 2014:
+
+"stream_mpi.c" released.
+Based on Version 5.10 of stream.c, stream_mpi.c
+brings the following new features:
+* MPI implementation that *distributes* the arrays
+  across all MPI ranks. (The older Fortran version
+  of STREAM in MPI *replicates* the arrays across
+  all MPI ranks.)
+* Data is allocated using "posix_memalign" 
+  rather than using static arrays.  Different
+  compiler flags may be needed for both portability
+  and optimization.
+  For example, on the TACC Stampede system using
+  the Intel compilers, my standard compile line
+  for a 4-node run is:
+	-----------------------------------------
+	mpicc -O3 -ffreestanding -openmp -mcmodel=medium -restrict -opt-streaming-stores always  \
+	    -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 -DVERBOSE   \
+	    stream_mpi.c -o stream_mpi
+
+	Notes on flags:
+	    -O3                                 works OK, as does -O2 in most cases
+	    -ffreestanding                      don't replace the Copy kernel with a library version
+	    -openmp                             use OpenMP
+	    -mcmodel=medium                     allow arrays to be bigger than 4 GiB
+	    -restrict                           allow use of the C99 "restrict" keyword
+	    -opt-streaming-stores always        force streaming stores
+
+	Notes on preprocessor variables:
+	    STREAM_ARRAY_SIZE                   elements per array (distributed across all ranks)
+	    NTIMES                              number of iterations to run
+	    VERBOSE                             print startup and shutdown timing information
+	-----------------------------------------
+* Error checking and timing done by all ranks and
+  gathered by rank 0 for processing and output.
+* Timing code uses barriers to ensure correct
+  operation even when multiple MPI ranks run on
+  shared memory systems.
+
+NOTE: MPI is not a preferred implementation for
+  STREAM, which is intended to measure memory
+  bandwidth in shared-memory systems.  In stream_mpi,
+  the MPI calls are only used to properly synchronize
+  the timers (using MPI_Barrier) and to gather
+  timing and error data, so the performance should 
+  scale linearly with the size of the cluster.
+  But it may be useful, and was an interesting 
+  exercise to develop and debug.
+
+===============================================
+UPDATE: January 17 2013:
+
+Version 5.10 of stream.c is finally available!
+
+There are no changes to what is being measured, but
+a number of long-awaited improvements have been made:
+
+* Updated validation code does not suffer from 
+  accumulated roundoff error for large arrays.
+* Defining the preprocessor variable "VERBOSE"
+  when compiling will (1) cause the code to print the
+  measured average relative absolute error (rather than
+  simply printing "Solution Validates"), and (2) print
+  the first 10 array entries with relative error exceeding
+  the error tolerance.
+* Array index variables have been upgraded from
+  "int" to "ssize_t" to allow arrays with more
+  than 2 billion elements on 64-bit systems.
+* Substantial improvements to the comments in 
+  the source on how to configure/compile/run the
+  benchmark.
+* The preprocessor variable controlling the array
+  size has been changed from "N" to "STREAM_ARRAY_SIZE".
+* A new preprocessor variable "STREAM_TYPE" can be
+  used to override the data type from the default
+  "double" to "float".
+  This mechanism could also be used to change to 
+  non-floating-point types, but several "printf"
+  statements would need to have their formats changed
+  to accommodate the modified data type.
+* Some small changes in output, including printing
+  array sizes in GiB as well as MiB.
+* Change to the default output format to print fewer
+  decimals for the bandwidth and more decimals for
+  the min/max/avg execution times.
+===============================================
+STREAM is a project of "Dr. Bandwidth":
+	John D. McCalpin, Ph.D.
+	john@mccalpin.com
+===============================================
+The STREAM web and ftp sites are currently hosted at
+the Department of Computer Science at the University of
+Virginia under the generous sponsorship of Professor Bill
+Wulf and Professor Alan Batson.
+===============================================
diff --git a/Versions/second_cpu.c b/Versions/second_cpu.c
new file mode 100644
index 0000000..d0338a9
--- /dev/null
+++ b/Versions/second_cpu.c
@@ -0,0 +1,15 @@
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/times.h>
+#include <time.h>
+
+/* CPU seconds (user + system) reported by times() for this process. */
+double mysecond()
+{
+    struct tms realbuf;
+
+    /* NOTE(review): CLK_TCK is obsolescent; POSIX has sysconf(_SC_CLK_TCK). */
+    times(&realbuf);
+    /* Divide in double: a float divisor keeps only 24 bits of the ticks. */
+    return (double) (realbuf.tms_stime + realbuf.tms_utime) / (double) CLK_TCK;
+}
diff --git a/Versions/second_cpu.f b/Versions/second_cpu.f
new file mode 100644
index 0000000..cef7c36
--- /dev/null
+++ b/Versions/second_cpu.f
@@ -0,0 +1,18 @@
+*-------------------------------------
+* Sample timing routine
+*       This code works on Sun and Silicon Graphics machines.
+*       DOUBLE PRECISION function mysecond()
+*       real arg(2)
+*       mysecond = etime(arg)
+*       end
+* Sample timing routine
+*       This code works on IBM RS/6000 machines
+      DOUBLE PRECISION FUNCTION mysecond()
+C     Return the integer result of mclock() scaled by 0.01.
+C     NOTE(review): presumably hundredths of a second -- confirm.
+      INTEGER mclock,ticks
+      EXTERNAL mclock
+      ticks = mclock()
+      mysecond = 0.01D0*DBLE(ticks)
+      END
+
diff --git a/Versions/stream_5-10_posix_memalign.c b/Versions/stream_5-10_posix_memalign.c
new file mode 100644
index 0000000..d2c722c
--- /dev/null
+++ b/Versions/stream_5-10_posix_memalign.c
@@ -0,0 +1,609 @@
+/*-----------------------------------------------------------------------*/
+/* Program: STREAM                                                       */
+/* Revision: $Id: stream.c,v 5.10.1 2014/06/17 08:16:08 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2013: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*           "tuned STREAM benchmark results"                            */
+/*           "based on a variant of the STREAM benchmark code"           */
+/*         Other comparable, clear, and reasonable labelling is          */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/*-----------------------------------------------------------------------
+ * INSTRUCTIONS:
+ *
+ *	1) STREAM requires different amounts of memory to run on different
+ *           systems, depending on both the system cache size(s) and the
+ *           granularity of the system timer.
+ *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
+ *           to meet *both* of the following criteria:
+ *       (a) Each array must be at least 4 times the size of the
+ *           available cache memory. I don't worry about the difference
+ *           between 10^6 and 2^20, so in practice the minimum array size
+ *           is about 3.8 times the cache size.
+ *           Example 1: One Xeon E3 with 8 MB L3 cache
+ *               STREAM_ARRAY_SIZE should be >= 4 million, giving
+ *               an array size of 30.5 MB and a total memory requirement
+ *               of 91.5 MB.  
+ *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
+ *               STREAM_ARRAY_SIZE should be >= 20 million, giving
+ *               an array size of 153 MB and a total memory requirement
+ *               of 458 MB.  
+ *       (b) The size should be large enough so that the 'timing calibration'
+ *           output by the program is at least 20 clock-ticks.  
+ *           Example: most versions of Windows have a 10 millisecond timer
+ *               granularity.  20 "ticks" at 10 ms/tick is 200 milliseconds.
+ *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
+ *               This means that each array must be at least 1 GB, or 128M elements.
+ *
+ *      Version 5.10 increases the default array size from 2 million
+ *          elements to 10 million elements in response to the increasing
+ *          size of L3 caches.  The new default size is large enough for caches
+ *          up to 20 MB. 
+ *      Version 5.10 changes the loop index variables from "register int"
+ *          to "ssize_t", which allows array indices >2^32 (4 billion)
+ *          on properly configured 64-bit systems.  Additional compiler options
+ *          (such as "-mcmodel=medium") may be required for large memory runs.
+ *
+ *      Array size can be set at compile time without modifying the source
+ *          code for the (many) compilers that support preprocessor definitions
+ *          on the compile line.  E.g.,
+ *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
+ *          will override the default size of 10M with a new size of 100M elements
+ *          per array.
+ */
+#ifndef STREAM_ARRAY_SIZE
+#   define STREAM_ARRAY_SIZE	10000000
+#endif
+
+/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
+ *         for any iteration after the first, therefore the minimum value
+ *         for NTIMES is 2.
+ *      There are no rules on maximum allowable values for NTIMES, but
+ *         when running with STREAM_TYPE=float, the results will overflow
+ *         if NTIMES exceeds 32.  Results will probably overflow at some
+ *         point with STREAM_TYPE=double, but I have not checked the exact value.
+ *         Values larger than the default are unlikely to noticeably
+ *         increase the reported performance.
+ *      NTIMES can also be set on the compile line without changing the source
+ *         code using, for example, "-DNTIMES=7".
+ */
+#ifdef NTIMES
+#if NTIMES<=1
+#   undef NTIMES	/* too small: drop it so the default below applies without redefining a live macro */
+#endif
+#endif
+#ifndef NTIMES
+#   define NTIMES	10
+#endif
+
+/*  Users are allowed to modify the "OFFSET" variable, which *may* change the
+ *         relative alignment of the arrays (though compilers may change the 
+ *         effective offset by making the arrays non-contiguous on some systems). 
+ *      Use of non-zero values for OFFSET can be especially helpful if the
+ *         STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
+ *      OFFSET can also be set on the compile line without changing the source
+ *         code using, for example, "-DOFFSET=56".
+ */
+#ifndef OFFSET
+#   define OFFSET	0
+#endif
+
+/*
+ *	3) Compile the code with optimization.  Many compilers generate
+ *       unreasonably bad code before the optimizer tightens things up.  
+ *     If the results are unreasonably good, on the other hand, the
+ *       optimizer might be too smart for me!
+ *
+ *     For a simple single-core version, try compiling with:
+ *            cc -O stream.c -o stream
+ *     This is known to work on many, many systems....
+ *
+ *     To use multiple cores, you need to tell the compiler to obey the OpenMP
+ *       directives in the code.  This varies by compiler, but a common example is
+ *            gcc -O -fopenmp stream.c -o stream_omp
+ *       The environment variable OMP_NUM_THREADS allows runtime control of the 
+ *         number of threads/cores used when the resulting "stream_omp" program
+ *         is executed.
+ *
+ *     To run with single-precision variables and arithmetic, simply add
+ *         -DSTREAM_TYPE=float
+ *     to the compile line.
+ *     Note that this changes the minimum array sizes required --- see (1) above.
+ *
+ *     The preprocessor directive "TUNED" does not do much -- it simply causes the 
+ *       code to call separate functions to execute each kernel.  Trivial versions
+ *       of these functions are provided, but they are *not* tuned -- they just 
+ *       provide predefined interfaces to be replaced with tuned code.
+ *
+ *
+ *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include info that will help me understand:
+ *		a) the computer hardware configuration (e.g., processor model, memory type)
+ *		b) the compiler name/version and compilation flags
+ *      c) any run-time information (such as OMP_NUM_THREADS)
+ *		d) all of the output from the test case.
+ *
+ * Thanks!
+ *
+ *-----------------------------------------------------------------------*/
+
+# define HLINE "-------------------------------------------------------------\n"
+
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))
+# endif
+
+#ifndef STREAM_TYPE
+#define STREAM_TYPE double
+#endif
+
+/* Heap-allocated replacements for the former static arrays.  The element
+ * type must be STREAM_TYPE: main() sizes the posix_memalign() buffers with
+ * sizeof(STREAM_TYPE), so a wider pointer type would overrun them. */
+STREAM_TYPE *a,*b,*c;
+
+static double	avgtime[4] = {0}, maxtime[4] = {0},	/* per-kernel timing statistics filled in by main() */
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};	/* starts large so MIN() picks up the first sample */
+/* Row labels for the result table; index order matches avgtime/mintime/maxtime/bytes. */
+static char	*label[4] = {"Copy:      ", "Scale:     ",
+    "Add:       ", "Triad:     "};
+/* Bytes counted per pass of each kernel: two arrays for Copy/Scale, three for Add/Triad. */
+static double	bytes[4] = {
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
+    };
+
+extern double mysecond();
+extern void checkSTREAMresults();
+#ifdef TUNED
+extern void tuned_STREAM_Copy();
+extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
+extern void tuned_STREAM_Add();
+extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
+#endif
+#ifdef _OPENMP
+extern int omp_get_num_threads();
+#endif
+int
+main()
+    {
+    int			quantum, checktick();
+    int			BytesPerWord;
+    int			k;
+    ssize_t		j;
+    STREAM_TYPE		scalar;
+    double		t, times[4][NTIMES];
+	size_t		arraybytes,arrayalignment;
+    /* Flow: allocate arrays, report configuration, calibrate timer, time the four kernels, summarize, validate. */
+    /* --- SETUP --- determine precision and check timing --- */
+
+    printf(HLINE);
+    printf("STREAM version $Revision: 5.10 $\n");
+    printf(HLINE);
+    BytesPerWord = sizeof(STREAM_TYPE);
+    printf("This system uses %d bytes per array element.\n",
+	BytesPerWord);
+    /* Allocate each array on a 64-byte boundary; exit(1) if any allocation fails. */
+    arraybytes = (STREAM_ARRAY_SIZE + OFFSET)*sizeof(STREAM_TYPE);
+    arrayalignment = 64;
+	k = posix_memalign((void **)&a, arrayalignment, arraybytes);
+	if (k != 0) {
+		printf("Allocation of array a failed, return code is %d\n",k);
+		exit(1);
+	}
+	k = posix_memalign((void **)&b, arrayalignment, arraybytes);
+	if (k != 0) {
+		printf("Allocation of array b failed, return code is %d\n",k);
+		exit(1);
+	}
+	k = posix_memalign((void **)&c, arrayalignment, arraybytes);
+	if (k != 0) {
+		printf("Allocation of array c failed, return code is %d\n",k);
+		exit(1);
+	}
+
+    printf(HLINE);
+#ifdef N
+    printf("*****  WARNING: ******\n");
+    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
+    printf("      This version of the code uses the preprocessor variable STREAM_ARRAY_SIZE to control the array size\n");
+    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
+    printf("*****  WARNING: ******\n");
+#endif
+
+    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
+    printf("Memory per array = %.1f MiB (= %.1f GiB).\n",
+	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
+	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
+    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
+	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
+	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
+    printf("Each kernel will be executed %d times.\n", NTIMES);
+    printf(" The *best* time for each kernel (excluding the first iteration)\n");
+    printf(" will be used to compute the reported bandwidth.\n");
+
+#ifdef _OPENMP
+    printf(HLINE);
+#pragma omp parallel
+    {
+#pragma omp master
+	{
+	    k = omp_get_num_threads();
+	    printf ("Number of Threads requested = %i\n",k);
+        }
+    }
+#endif
+
+#ifdef _OPENMP
+	k = 0;	/* every thread of the parallel region increments k exactly once */
+#pragma omp parallel
+#pragma omp atomic
+		k++;
+    printf ("Number of Threads counted = %i\n",k);
+#endif
+
+    /* Initialize the arrays to the known values that checkSTREAMresults() reproduces. */
+#pragma omp parallel for
+    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+	    a[j] = 1.0;
+	    b[j] = 2.0;
+	    c[j] = 0.0;
+	}
+
+    printf(HLINE);
+
+    if  ( (quantum = checktick()) >= 1)
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+	quantum = 1;	/* keeps the t/quantum division below well defined */
+    }
+
+    t = mysecond();
+#pragma omp parallel for
+    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
+		a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);	/* elapsed time of one pass, in microseconds */
+
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+    scalar = 3.0;
+    for (k=0; k<NTIMES; k++)
+	{
+	times[0][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Copy();
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j];
+#endif
+	times[0][k] = mysecond() - times[0][k];
+
+	times[1][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Scale(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    b[j] = scalar*c[j];
+#endif
+	times[1][k] = mysecond() - times[1][k];
+
+	times[2][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Add();
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j]+b[j];
+#endif
+	times[2][k] = mysecond() - times[2][k];
+
+	times[3][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Triad(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    a[j] = b[j]+scalar*c[j];
+#endif
+	times[3][k] = mysecond() - times[3][k];
+	}
+
+    /*	--- SUMMARY --- */
+
+    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+	{
+	for (j=0; j<4; j++)
+	    {
+	    avgtime[j] = avgtime[j] + times[j][k];
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+
+    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+		avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+
+		printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
+	       1.0E-06 * bytes[j]/mintime[j],
+	       avgtime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    printf(HLINE);
+
+    /* --- Check Results --- */
+    checkSTREAMresults();
+    printf(HLINE);
+    free(a); free(b); free(c);	/* release the posix_memalign() buffers */
+    return 0;
+}
+
+# define	M	20
+
+int
+checktick()
+{
+    double	stamps[M];
+    double	start, now;
+    int		idx, gap, smallest = 1000000;
+
+    /* Sample M timestamps, each taken once mysecond() has advanced by
+     * at least 1.0E-6 past the reading made at the top of the iteration. */
+    for (idx = 0; idx < M; idx++) {
+        start = mysecond();
+        do {
+            now = mysecond();
+        } while ((now - start) < 1.0E-6);
+        stamps[idx] = now;
+    }
+
+    /* The smallest non-negative gap between consecutive samples,
+     * truncated to whole microseconds, is the granularity estimate. */
+    for (idx = 1; idx < M; idx++) {
+        gap = (int)( 1.0E6 * (stamps[idx] - stamps[idx-1]));
+        if (gap < 0)
+            gap = 0;
+        if (gap < smallest)
+            smallest = gap;
+    }
+
+    return smallest;
+}
+
+
+
+/* A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems.  */
+
+#include <sys/time.h>
+
+double mysecond()
+{
+        /* Wall-clock time from gettimeofday(), returned as seconds plus microseconds.
+         * POSIX leaves a non-NULL timezone argument unspecified, so NULL is passed. */
+        struct timeval tp;
+
+        (void) gettimeofday(&tp, NULL);
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+#ifndef abs
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+#endif
+void checkSTREAMresults ()
+{
+	STREAM_TYPE aj,bj,cj,scalar;
+	STREAM_TYPE aSumErr,bSumErr,cSumErr;
+	STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
+	double epsilon;
+	ssize_t	j;
+	int	k,ierr,err;
+	/* Recompute the expected element values in scalar form, then compare them with a[], b[], c[]. */
+    /* reproduce initialization */
+	aj = 1.0;
+	bj = 2.0;
+	cj = 0.0;
+    /* a[] is modified during timing check */
+	aj = 2.0E0 * aj;
+    /* now execute timing loop */
+	scalar = 3.0;
+	for (k=0; k<NTIMES; k++)
+        {
+            cj = aj;
+            bj = scalar*cj;
+            cj = aj+bj;
+            aj = bj+scalar*cj;
+        }
+
+    /* accumulate deltas between observed and expected results */
+	aSumErr = 0.0;
+	bSumErr = 0.0;
+	cSumErr = 0.0;
+	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+		aSumErr += abs(a[j] - aj);
+		bSumErr += abs(b[j] - bj);
+		cSumErr += abs(c[j] - cj);
+		/* sums of absolute deviations; converted to averages below */
+	}
+	aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+
+	if (sizeof(STREAM_TYPE) == 4) {
+		epsilon = 1.e-6;
+	}
+	else if (sizeof(STREAM_TYPE) == 8) {
+		epsilon = 1.e-13;
+	}
+	else {
+		printf("WEIRD: sizeof(STREAM_TYPE) = %zu\n",sizeof(STREAM_TYPE));	/* %zu matches size_t */
+		epsilon = 1.e-6;
+	}
+
+	err = 0;
+	if (abs(aAvgErr/aj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(a[j]/aj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array a: index: %zd, expected: %e, observed: %e, relative error: %e\n",
+						j,aj,a[j],abs((aj-a[j])/aj));	/* relative to the expected value */
+				}
+#endif
+			}
+		}
+		printf("     For array a[], %d errors were found.\n",ierr);
+	}
+	if (abs(bAvgErr/bj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(b[j]/bj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array b: index: %zd, expected: %e, observed: %e, relative error: %e\n",
+						j,bj,b[j],abs((bj-b[j])/bj));	/* relative to the expected value */
+				}
+#endif
+			}
+		}
+		printf("     For array b[], %d errors were found.\n",ierr);
+	}
+	if (abs(cAvgErr/cj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(c[j]/cj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array c: index: %zd, expected: %e, observed: %e, relative error: %e\n",
+						j,cj,c[j],abs((cj-c[j])/cj));	/* relative to the expected value */
+				}
+#endif
+			}
+		}
+		printf("     For array c[], %d errors were found.\n",ierr);
+	}
+	if (err == 0) {
+		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
+	}
+#ifdef VERBOSE
+	printf ("Results Validation Verbose Results: \n");
+	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
+	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
+	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
+#endif
+}
+
+#ifdef TUNED
+/* stubs for "tuned" versions of the kernels */
+void tuned_STREAM_Copy()
+{
+	ssize_t idx;			/* placeholder Copy kernel: c = a */
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; ++idx)
+		c[idx] = a[idx];
+}
+
+void tuned_STREAM_Scale(STREAM_TYPE scalar)
+{
+	ssize_t idx;			/* placeholder Scale kernel: b = scalar*c */
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; ++idx)
+		b[idx] = scalar*c[idx];
+}
+
+void tuned_STREAM_Add()
+{
+	ssize_t idx;			/* placeholder Add kernel: c = a + b */
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; ++idx)
+		c[idx] = a[idx]+b[idx];
+}
+
+void tuned_STREAM_Triad(STREAM_TYPE scalar)
+{
+	ssize_t idx;			/* placeholder Triad kernel: a = b + scalar*c */
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; ++idx)
+		a[idx] = b[idx]+scalar*c[idx];
+}
+/* end of stubs for the "tuned" versions of the kernels */
+#endif
diff --git a/Versions/stream_d.c b/Versions/stream_d.c
new file mode 100644
index 0000000..1f2bbe1
--- /dev/null
+++ b/Versions/stream_d.c
@@ -0,0 +1,218 @@
+# include <stdio.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/*
+ * Program: Stream
+ * Programmer: Joe R. Zagar
+ * Revision: 4.0-BETA, October 24, 1995
+ * Original code developed by John D. McCalpin
+ *
+ * This program measures memory transfer rates in MB/s for simple 
+ * computational kernels coded in C.  These numbers reveal the quality
+ * of code generation for simple uncacheable kernels as well as showing
+ * the cost of floating-point operations relative to memory accesses.
+ *
+ * INSTRUCTIONS:
+ *
+ *	1) Stream requires a good bit of memory to run.  Adjust the
+ *          value of 'N' (below) to give a 'timing calibration' of 
+ *          at least 20 clock-ticks.  This will provide rate estimates
+ *          that should be good to about 5% precision.
+ */
+
+# define N	2000000
+# define NTIMES	10
+# define OFFSET	0
+
+/*
+ *	3) Compile the code with full optimization.  Many compilers
+ *	   generate unreasonably bad code before the optimizer tightens
+ *	   things up.  If the results are unreasonably good, on the
+ *	   other hand, the optimizer might be too smart for me!
+ *
+ *         Try compiling with:
+ *               cc -O stream_d.c second_wall.c -o stream_d -lm
+ *
+ *         This is known to work on Cray, SGI, IBM, and Sun machines.
+ *
+ *
+ *	4) Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include:
+ *		a) computer hardware model number and software revision
+ *		b) the compiler flags
+ *		c) all of the output from the test case.
+ * Thanks!
+ *
+ */
+
+# define HLINE "-------------------------------------------------------------\n"
+
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))
+# endif
+
+static double	a[N+OFFSET],	/* the three work arrays read and written by the kernels in main() */
+		b[N+OFFSET],
+		c[N+OFFSET];
+/* Per-kernel timing statistics; mintime starts large so MIN() picks up the first sample. */
+static double	rmstime[4] = {0}, maxtime[4] = {0},
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+/* Row labels for the result table; index order matches rmstime/mintime/maxtime/bytes. */
+static char	*label[4] = {"Copy:      ", "Scale:     ",
+    "Add:       ", "Triad:     "};
+/* Bytes counted per pass of each kernel: two arrays for Copy/Scale, three for Add/Triad. */
+static double	bytes[4] = {
+    2 * sizeof(double) * N,
+    2 * sizeof(double) * N,
+    3 * sizeof(double) * N,
+    3 * sizeof(double) * N
+    };
+
+extern double mysecond();
+
+int
+main()
+    {
+    int			quantum, checktick();
+    int			BytesPerWord;
+    register int	j, k;
+    double		scalar, t, times[4][NTIMES];
+    /* Flow: report configuration, calibrate the timer, time the four kernels NTIMES times, print a summary. */
+    /* --- SETUP --- determine precision and check timing --- */
+
+    printf(HLINE);
+    BytesPerWord = sizeof(double);
+    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
+	BytesPerWord);
+
+    printf(HLINE);
+    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
+    printf("Total memory required = %.1f MB.\n",
+	(3.0 * N * BytesPerWord) / 1048576.0);	/* 3.0 forces double arithmetic: no int overflow for large N */
+    printf("Each test is run %d times, but only\n", NTIMES);
+    printf("the *best* time for each is used.\n");
+
+    /* Initialize the three arrays to known values. */
+
+    for (j=0; j<N; j++) {
+	a[j] = 1.0;
+	b[j] = 2.0;
+	c[j] = 0.0;
+	}
+
+    printf(HLINE);
+
+    if  ( (quantum = checktick()) >= 1)
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+    quantum = MAX(quantum, 1);	/* checktick() may return 0; avoid dividing by zero below */
+    t = mysecond();
+    for (j = 0; j < N; j++)
+	a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);	/* elapsed time of one pass, in microseconds */
+
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+    scalar = 3.0;
+    for (k=0; k<NTIMES; k++)
+	{
+	times[0][k] = mysecond();
+	for (j=0; j<N; j++)
+	    c[j] = a[j];
+	times[0][k] = mysecond() - times[0][k];
+
+	times[1][k] = mysecond();
+	for (j=0; j<N; j++)
+	    b[j] = scalar*c[j];
+	times[1][k] = mysecond() - times[1][k];
+
+	times[2][k] = mysecond();
+	for (j=0; j<N; j++)
+	    c[j] = a[j]+b[j];
+	times[2][k] = mysecond() - times[2][k];
+
+	times[3][k] = mysecond();
+	for (j=0; j<N; j++)
+	    a[j] = b[j]+scalar*c[j];
+	times[3][k] = mysecond() - times[3][k];
+	}
+
+    /*	--- SUMMARY --- */
+
+    for (k=0; k<NTIMES; k++)
+	{
+	for (j=0; j<4; j++)
+	    {
+	    rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+
+    printf("Function      Rate (MB/s)   RMS time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+	rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);
+
+	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
+	       1.0E-06 * bytes[j]/mintime[j],
+	       rmstime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    return 0;
+}
+
+# define	M	20
+
+int
+checktick()
+    {
+    int		i, minDelta, Delta;
+    double	t1, t2, timesfound[M];
+
+/*  Collect a sequence of M distinct time values from mysecond(); each
+ *  sample is recorded only after the reading has advanced by >= 1.0E-6. */
+    for (i = 0; i < M; i++) {
+	t1 = mysecond();
+	while( ((t2=mysecond()) - t1) < 1.0E-6 )
+	    ;	/* spin until mysecond() has advanced far enough */
+	timesfound[i] = t1 = t2;
+	}
+
+/*
+ * Determine the minimum difference between these M values.
+ * This result (truncated to whole microseconds, clamped at zero,
+ * and never larger than 1000000) is the clock-granularity estimate.
+ */
+
+    minDelta = 1000000;
+    for (i = 1; i < M; i++) {
+	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
+	minDelta = MIN(minDelta, MAX(Delta,0));
+	}
+
+    return(minDelta);
+    }
+
diff --git a/Versions/stream_d.f b/Versions/stream_d.f
new file mode 100644
index 0000000..cff1125
--- /dev/null
+++ b/Versions/stream_d.f
@@ -0,0 +1,451 @@
+*=======================================================================
+* Program: STREAM
+* Programmer: John D. McCalpin
+* Revision: 5.3, January 10, 2003
+*-----------------------------------------------------------------------
+* Copyright 1991-2003: John D. McCalpin
+*-----------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "STREAM benchmark results",
+*         published results must be in conformance to the STREAM
+*         Run Rules, (briefly reviewed below) published at
+*         http://www.cs.virginia.edu/stream/ref.html
+*         and incorporated herein by reference.
+*         As the copyright holder, John McCalpin retains the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned STREAM benchmark results" 
+*         "based on a variant of the STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+*-----------------------------------------------------------------------
+* This program measures sustained memory transfer rates in MB/s for
+* simple computational kernels coded in FORTRAN.
+*
+* The intent is to demonstrate the extent to which ordinary user
+* code can exploit the main memory bandwidth of the system under
+* test.
+*=======================================================================
+* The STREAM web page is at:
+*          http://www.streambench.org
+*
+* Most of the content is currently hosted at:
+*          http://www.cs.virginia.edu/stream/
+*
+* BRIEF INSTRUCTIONS: 
+*       0) See http://www.cs.virginia.edu/stream/ref.html for details
+*       1) STREAM requires a timing function called mysecond().
+*          Several examples are provided in this directory.
+*          "CPU" timers are only allowed for uniprocessor runs.
+*          "Wall-clock" timers are required for all multiprocessor runs.
+*       2) The STREAM array sizes must be set to size the test.
+*          The value "N" must be chosen so that each of the three
+*          arrays is at least 4x larger than the sum of all the last-
+*          level caches used in the run, or 1 million elements, which-
+*          ever is larger.
+*          ------------------------------------------------------------
+*          Note that you are free to use any array length and offset
+*          that makes each array 4x larger than the last-level cache.
+*          The intent is to determine the *best* sustainable bandwidth
+*          available with this simple coding.  Of course, lower values
+*          are usually fairly easy to obtain on cached machines, but 
+*          by keeping the test to the *best* results, the answers are
+*          easier to interpret.
+*          You may put the arrays in common or not, at your discretion.
+*          There is a commented-out COMMON statement below.
+*          Fortran90 "allocatable" arrays are fine, too.
+*          ------------------------------------------------------------
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me.
+*          Please let me know if this happens.
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*          Please let me know if you do not want your name posted along
+*          with the submitted results.
+*       5) See the web page for more comments about the run rules and
+*          about interpretation of the results.
+*
+* Thanks,
+*   Dr. Bandwidth
+*=========================================================================
+*
+      PROGRAM stream
+*     IMPLICIT NONE
+C     .. Parameters ..
+      INTEGER n,offset,ndim,ntimes
+      PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
+C     n = elements per array, ntimes = repetitions of each kernel.
+C     .. Local Scalars ..
+      DOUBLE PRECISION scalar,t
+      INTEGER j,k,nbpw,quantum
+C     nbpw = bytes per word, quantum = timer tick in microseconds.
+C     .. Local Arrays ..
+      DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
+     $                 times(4,ntimes)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     times(j,k) = time measured for kernel j on repetition k.
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      INTEGER checktick,realsize
+      EXTERNAL mysecond,checktick,realsize
+!$    INTEGER omp_get_num_threads
+!$    EXTERNAL omp_get_num_threads
+C     ..
+C     .. Intrinsic Functions ..
+C
+      INTRINSIC dble,int,max,min
+C     ..
+C     .. Arrays in Common ..
+      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
+C     ..
+C     .. Common blocks ..
+*     COMMON a,b,c
+C     ..
+C     .. Data statements ..
+      DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Copy:      ','Scale:     ','Add:       ',
+     $     'Triad:     '/
+      DATA bytes/2,2,3,3/
+C     bytes(j) = words moved per array element by kernel j.
+*     Times the Copy, Scale, Add and Triad kernels and prints MB/s.
+*       --- SETUP --- determine precision and check timing ---
+
+      nbpw = realsize()
+*     Sizes are formed in double precision so large n cannot overflow.
+      WRITE (*,FMT=9010) 'Array size = ',n
+      WRITE (*,FMT=9010) 'Offset     = ',offset
+      WRITE (*,FMT=9020) 'The total memory requirement is ',
+     $  int(3.0d0*nbpw*n/ (1024*1024)),' MB'
+      WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
+      WRITE (*,FMT=9030) '--'
+      WRITE (*,FMT=9030) 'The *best* time for each test is used'
+      WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
+!$OMP PARALLEL
+!$OMP MASTER
+!$    PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS()
+!$OMP END MASTER
+!$OMP END PARALLEL
+!$OMP PARALLEL DO
+      DO 10 j = 1,n
+          a(j) = 2.0d0
+          b(j) = 0.5D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = mysecond()
+!$OMP PARALLEL DO
+      DO 20 j = 1,n
+          a(j) = 0.5d0*a(j)
+   20 CONTINUE
+      t = mysecond() - t
+      PRINT *,'----------------------------------------------------'
+      quantum = checktick()
+      WRITE (*,FMT=9000)
+     $  'Your clock granularity/precision appears to be ',quantum,
+     $  ' microseconds'
+      PRINT *,'----------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      scalar = 0.5d0*a(1)
+      DO 70 k = 1,ntimes
+*         Adding t to end elements is there to confuse the optimizer.
+          t = mysecond()
+          a(1) = a(1) + t
+!$OMP PARALLEL DO
+          DO 30 j = 1,n
+              c(j) = a(j)
+   30     CONTINUE
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(1,k) = t
+
+          t = mysecond()
+          c(1) = c(1) + t
+!$OMP PARALLEL DO
+          DO 40 j = 1,n
+              b(j) = scalar*c(j)
+   40     CONTINUE
+          t = mysecond() - t
+          b(n) = b(n) + t
+          times(2,k) = t
+
+          t = mysecond()
+          a(1) = a(1) + t
+!$OMP PARALLEL DO
+          DO 50 j = 1,n
+              c(j) = a(j) + b(j)
+   50     CONTINUE
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(3,k) = t
+
+          t = mysecond()
+          b(1) = b(1) + t
+!$OMP PARALLEL DO
+          DO 60 j = 1,n
+              a(j) = b(j) + scalar*c(j)
+   60     CONTINUE
+          t = mysecond() - t
+          a(n) = a(n) + t
+          times(4,k) = t
+   70 CONTINUE
+
+*       --- SUMMARY ---
+      DO 90 k = 2,ntimes-1
+          DO 80 j = 1,4
+              avgtime(j) = avgtime(j) + times(j,k)
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   80     CONTINUE
+   90 CONTINUE
+      WRITE (*,FMT=9040)
+      DO 100 j = 1,4
+          avgtime(j) = avgtime(j)/dble(ntimes-2)
+          WRITE (*,FMT=9050) label(j),dble(n)*bytes(j)*nbpw/
+     $      mintime(j)/1.0D6,avgtime(j),mintime(j),maxtime(j)
+  100 CONTINUE
+      PRINT *,'----------------------------------------------------'
+      CALL checksums (a,b,c,n,ntimes)
+      PRINT *,'----------------------------------------------------'
+
+ 9000 FORMAT (1x,a,i6,a)
+ 9010 FORMAT (1x,a,i10)
+ 9020 FORMAT (1x,a,i4,a)
+ 9030 FORMAT (1x,a,i3,a,a)
+ 9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
+     $       )
+ 9050 FORMAT (a,4 (f10.4,2x))
+      END
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLE PRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLE PRECISION
+* number occupies.
+*
+      INTEGER FUNCTION realsize()
+*     IMPLICIT NONE
+*     Returns 4 or 8: the assumed bytes per DOUBLE PRECISION value.
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ..
+C     .. External Subroutines ..
+      EXTERNAL confuse
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC abs,acos,log10,sqrt
+C     ..
+
+C       Find the first j for which 1.0d0+10**(-j) compares equal to 1
+
+   10 DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0** (-j)
+   20 CONTINUE
+
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL confuse(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 40
+          END IF
+   30 CONTINUE
+      GO TO 50
+
+   40 WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLE PRECISION word'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      RETURN
+
+   50 PRINT *,'Hmmmm.  I am unable to determine the size.'
+   60 PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense.'
+          PRINT *,'Try again.'
+*         Keep asking: any other value would corrupt the memory
+*         size and bandwidth figures computed from it.
+          GO TO 60
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per DOUBLE PRECISION number'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      END
+
+      SUBROUTINE confuse(q,r)
+*     Stores cos(q) in r and leaves q untouched.  realsize() calls
+*     this between building a test value and comparing it.
+      DOUBLE PRECISION q
+      DOUBLE PRECISION r
+C     .. Intrinsic Functions ..
+      INTRINSIC cos
+C     ..
+      r = cos(q)
+C     Control goes back to the caller at END.
+      END
+
+* A semi-portable way to determine the clock granularity
+* Adapted from a code by John Henning of Digital Equipment Corporation
+*
+      INTEGER FUNCTION checktick()
+*     IMPLICIT NONE
+*     Returns the timer granularity estimate in microseconds (>= 1).
+C     .. Parameters ..
+      INTEGER n
+      PARAMETER (n=20)
+C     ..
+C     .. Local Scalars ..
+      DOUBLE PRECISION t1,t2
+      INTEGER i,j,jmin
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION timesfound(n)
+C     ..
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      EXTERNAL mysecond
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC max,min,nint
+C     Prime t1 so the first comparison below uses a defined value.
+      t1 = mysecond()
+      i = 0
+   10 t2 = mysecond()
+      IF (t2.EQ.t1) GO TO 10
+C     t2 differs from the previous reading: record it, n in total.
+      t1 = t2
+      i = i + 1
+      timesfound(i) = t1
+      IF (i.LT.n) GO TO 10
+C     The smallest gap between neighbouring readings is the estimate.
+      jmin = 1000000
+      DO 20 i = 2,n
+          j = nint((timesfound(i)-timesfound(i-1))*1d6)
+          jmin = min(jmin,max(j,0))
+   20 CONTINUE
+
+      IF (jmin.GT.0) THEN
+          checktick = jmin
+      ELSE
+          PRINT *,'Your clock granularity appears to be less ',
+     $      'than one microsecond'
+          checktick = 1
+      END IF
+      RETURN
+
+*      PRINT 14, timesfound(1)*1d6
+*      DO 20 i=2,n
+*         PRINT 14, timesfound(i)*1d6,
+*     &       nint((timesfound(i)-timesfound(i-1))*1d6)
+*   14    FORMAT (1X, F18.4, 1X, i8)
+*   20 CONTINUE
+
+      END
+
+
+
+
+      SUBROUTINE checksums(a,b,c,n,ntimes)
+*     Validation step: sums the interior of a, b and c and compares
+*     each sum with the value predicted by replaying the kernels on
+*     scalars.  Prints the first mismatch found, or a success line.
+      DOUBLE PRECISION a(*),b(*),c(*)
+      INTEGER n,ntimes
+C     Predicted values, kernel multiplier and relative tolerance.
+      DOUBLE PRECISION refa,refb,refc,fac,tol
+C     Sums actually found in the arrays.
+      DOUBLE PRECISION tota,totb,totc
+      INTEGER i,iter
+
+C     Add up elements 2..n-1 of each array.  The first and last
+C     elements are excluded because they are modified using the
+C     timing results to confuse aggressive optimizers.
+
+      tota = 0.0d0
+      totb = 0.0d0
+      totc = 0.0d0
+!$OMP PARALLEL DO REDUCTION(+:tota,totb,totc)
+      DO i = 2,n-1
+          tota = tota + a(i)
+          totb = totb + b(i)
+          totc = totc + c(i)
+      END DO
+
+C     Replay the initialisation and the four kernels on single
+C     values, then scale by the n-2 elements that were summed.
+
+      refa = 2.0D0
+      refb = 0.5D0
+      refc = 0.0D0
+      refa = 0.5D0*refa
+      fac = 0.5d0*refa
+      DO iter = 1,ntimes
+          refc = refa
+          refb = fac*refc
+          refc = refa + refb
+          refa = refb + fac*refc
+      END DO
+      refa = refa*DBLE(n-2)
+      refb = refb*DBLE(n-2)
+      refc = refc*DBLE(n-2)
+
+      tol = 1.D-6
+
+C     Only the first array that fails is reported.
+      IF (ABS(tota-refa)/tota .GT. tol) THEN
+          PRINT *,'Failed Validation on array a()'
+          PRINT *,'Target   Sum of a is = ',refa
+          PRINT *,'Computed Sum of a is = ',tota
+      ELSE IF (ABS(totb-refb)/totb .GT. tol) THEN
+          PRINT *,'Failed Validation on array b()'
+          PRINT *,'Target   Sum of b is = ',refb
+          PRINT *,'Computed Sum of b is = ',totb
+      ELSE IF (ABS(totc-refc)/totc .GT. tol) THEN
+          PRINT *,'Failed Validation on array c()'
+          PRINT *,'Target   Sum of c is = ',refc
+          PRINT *,'Computed Sum of c is = ',totc
+      ELSE
+          PRINT *,'Solution Validates!'
+      END IF
+
+      END
+
diff --git a/Versions/stream_mpi.c b/Versions/stream_mpi.c
new file mode 100644
index 0000000..ae4e4a1
--- /dev/null
+++ b/Versions/stream_mpi.c
@@ -0,0 +1,831 @@
+/*-----------------------------------------------------------------------*/
+/* Program: STREAM                                                       */
+/* Revision: $Id: stream_mpi.c,v 1.8 2016/07/28 16:00:50 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2013: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*           "tuned STREAM benchmark results"                            */
+/*           "based on a variant of the STREAM benchmark code"           */
+/*         Other comparable, clear, and reasonable labelling is          */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+
+# define _XOPEN_SOURCE 600
+
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <math.h>
+# include <float.h>
+# include <string.h>
+# include <limits.h>
+# include <sys/time.h>
+# include "mpi.h"
+
+/*-----------------------------------------------------------------------
+ * INSTRUCTIONS:
+ *
+ *	1) STREAM requires different amounts of memory to run on different
+ *           systems, depending on both the system cache size(s) and the
+ *           granularity of the system timer.
+ *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
+ *           to meet *both* of the following criteria:
+ *       (a) Each array must be at least 4 times the size of the
+ *           available cache memory. I don't worry about the difference
+ *           between 10^6 and 2^20, so in practice the minimum array size
+ *           is about 3.8 times the cache size.
+ *           Example 1: One Xeon E3 with 8 MB L3 cache
+ *               STREAM_ARRAY_SIZE should be >= 4 million, giving
+ *               an array size of 30.5 MB and a total memory requirement
+ *               of 91.5 MB.  
+ *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
+ *               STREAM_ARRAY_SIZE should be >= 20 million, giving
+ *               an array size of 153 MB and a total memory requirement
+ *               of 458 MB.  
+ *       (b) The size should be large enough so that the 'timing calibration'
+ *           output by the program is at least 20 clock-ticks.  
+ *           Example: most versions of Windows have a 10 millisecond timer
+ *               granularity.  20 "ticks" at 10 ms/tick is 200 milliseconds.
+ *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
+ *               This means that each array must be at least 1 GB, or 128M elements.
+ *
+ *      Version 5.10 increases the default array size from 2 million
+ *          elements to 10 million elements in response to the increasing
+ *          size of L3 caches.  The new default size is large enough for caches
+ *          up to 20 MB. 
+ *      Version 5.10 changes the loop index variables from "register int"
+ *          to "ssize_t", which allows array indices >2^32 (4 billion)
+ *          on properly configured 64-bit systems.  Additional compiler options
+ *          (such as "-mcmodel=medium") may be required for large memory runs.
+ *
+ *      Array size can be set at compile time without modifying the source
+ *          code for the (many) compilers that support preprocessor definitions
+ *          on the compile line.  E.g.,
+ *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
+ *          will override the default size of 10M with a new size of 100M elements
+ *          per array.
+ */
+
+// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------
+// For the MPI version of STREAM, the three arrays with this many elements
+// each will be *distributed* across the MPI ranks.  
+//
+// Be careful when computing the array size needed for a particular target
+// system to meet the minimum size requirement to ensure overflowing the caches.
+//
+// Example:
+//    Assume 4 nodes with two Intel Xeon E5-2680 processors (20 MiB L3) each.
+//    The *total* L3 cache size is 4*2*20 = 160 MiB, so each array must be
+//    at least 640 MiB, or at least 80 million 8 Byte elements. 
+// Note that it does not matter whether you use one MPI rank per node or 
+//    16 MPI ranks per node -- only the total array size and the total
+//    cache size matter.
+//
+#ifndef STREAM_ARRAY_SIZE
+#   define STREAM_ARRAY_SIZE	10000000
+#endif
+
+/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
+ *         for any iteration after the first, therefore the minimum value
+ *         for NTIMES is 2.
+ *      There are no rules on maximum allowable values for NTIMES, but
+ *         values larger than the default are unlikely to noticeably
+ *         increase the reported performance.
+ *      NTIMES can also be set on the compile line without changing the source
+ *         code using, for example, "-DNTIMES=7".
+ */
+#ifdef NTIMES
+#if NTIMES<=1
+#   undef NTIMES	/* unusable count: drop it so the default below applies */
+#endif
+#endif
+#ifndef NTIMES
+#   define NTIMES	10
+#endif
+
+// Make the scalar coefficient modifiable at compile time.
+// The old value of 3.0 caused floating-point overflows after a relatively small
+// number of iterations.  The new default of 0.42 allows over 2000 iterations for
+// 32-bit IEEE arithmetic and over 18000 iterations for 64-bit IEEE arithmetic.
+// The growth in the solution can be eliminated (almost) completely by setting 
+// the scalar value to 0.41421445, but this also means that the error checking
+// code no longer triggers an error if the code does not actually execute the
+// correct number of iterations!
+#ifndef SCALAR
+#define SCALAR 0.42
+#endif
+
+
+// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------
+// The OFFSET preprocessor variable is not used in this version of the benchmark.
+// The user must change the code at or after the "posix_memalign" array allocations
+//    to change the relative alignment of the pointers.
+// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------
+#ifndef OFFSET
+#   define OFFSET	0
+#endif
+
+
+/*
+ *	3) Compile the code with optimization.  Many compilers generate
+ *       unreasonably bad code before the optimizer tightens things up.  
+ *     If the results are unreasonably good, on the other hand, the
+ *       optimizer might be too smart for me!
+ *
+ *     For a simple single-core version, try compiling with:
+ *            cc -O stream.c -o stream
+ *     This is known to work on many, many systems....
+ *
+ *     To use multiple cores, you need to tell the compiler to obey the OpenMP
+ *       directives in the code.  This varies by compiler, but a common example is
+ *            gcc -O -fopenmp stream.c -o stream_omp
+ *       The environment variable OMP_NUM_THREADS allows runtime control of the 
+ *         number of threads/cores used when the resulting "stream_omp" program
+ *         is executed.
+ *
+ *     To run with single-precision variables and arithmetic, simply add
+ *         -DSTREAM_TYPE=float
+ *     to the compile line.
+ *     Note that this changes the minimum array sizes required --- see (1) above.
+ *
+ *     The preprocessor directive "TUNED" does not do much -- it simply causes the 
+ *       code to call separate functions to execute each kernel.  Trivial versions
+ *       of these functions are provided, but they are *not* tuned -- they just 
+ *       provide predefined interfaces to be replaced with tuned code.
+ *
+ *
+ *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include info that will help me understand:
+ *		a) the computer hardware configuration (e.g., processor model, memory type)
+ *		b) the compiler name/version and compilation flags
+ *      c) any run-time information (such as OMP_NUM_THREADS)
+ *		d) all of the output from the test case.
+ *
+ * Thanks!
+ *
+ *-----------------------------------------------------------------------*/
+
+# define HLINE "-------------------------------------------------------------\n"
+
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))
+# endif
+
+#ifndef STREAM_TYPE
+#define STREAM_TYPE double
+#endif
+
+//static STREAM_TYPE	a[STREAM_ARRAY_SIZE+OFFSET],
+//			b[STREAM_ARRAY_SIZE+OFFSET],
+//			c[STREAM_ARRAY_SIZE+OFFSET];
+
+// STREAM_TYPE (not double) so -DSTREAM_TYPE=float matches the allocation size; some compilers need an extra keyword for "restrict".
+STREAM_TYPE * restrict a, * restrict b, * restrict c;
+
+size_t		array_elements, array_bytes, array_alignment;	// set in main(): elements per rank, bytes per array, posix_memalign alignment
+static double	avgtime[4] = {0}, maxtime[4] = {0},
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};	// NOTE(review): presumably per-kernel timing statistics; the code that fills them is outside this excerpt
+// Four fixed-width kernel names -- presumably the row labels of the results table (TODO confirm against the summary code).
+static char	*label[4] = {"Copy:      ", "Scale:     ",
+    "Add:       ", "Triad:     "};
+// Byte counts built from the aggregate STREAM_ARRAY_SIZE (not the per-rank share): two arrays' worth for entries 0-1, three for entries 2-3.
+static double	bytes[4] = {
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
+    };
+// Declarations of routines whose definitions are not part of this excerpt.
+extern void checkSTREAMresults(STREAM_TYPE *AvgErrByRank, int numranks);
+extern void computeSTREAMerrors(STREAM_TYPE *aAvgErr, STREAM_TYPE *bAvgErr, STREAM_TYPE *cAvgErr);
+#ifdef TUNED
+extern void tuned_STREAM_Copy();
+extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
+extern void tuned_STREAM_Add();
+extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
+#endif
+#ifdef _OPENMP
+extern int omp_get_num_threads();	// declared by hand; this file does not include <omp.h> in the visible include list
+#endif
+int
+main()
+    {
+    int			quantum, checktick();
+    int			BytesPerWord;
+    int			i,k;
+    ssize_t		j;
+    STREAM_TYPE		scalar;
+    double		t, times[4][NTIMES];
+	double		*TimesByRank;
+	double		t0,t1,tmin;
+	int         rc, numranks, myrank;
+	STREAM_TYPE	AvgError[3] = {0.0,0.0,0.0};
+	STREAM_TYPE *AvgErrByRank;
+
+    /* --- SETUP --- call MPI_Init() before anything else! --- */
+
+    rc = MPI_Init(NULL, NULL);
+	t0 = MPI_Wtime();
+    if (rc != MPI_SUCCESS) {
+       printf("ERROR: MPI Initialization failed with return code %d\n",rc);
+       exit(1);
+    }
+	// if either of these fail there is something really screwed up!
+	MPI_Comm_size(MPI_COMM_WORLD, &numranks);
+	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    /* --- NEW FEATURE --- distribute requested storage across MPI ranks --- */
+	array_elements = STREAM_ARRAY_SIZE / numranks;		// don't worry about rounding vs truncation
+    array_alignment = 64;						// Can be modified -- provides partial support for adjusting relative alignment
+
+	// Dynamically allocate the three arrays using "posix_memalign()"
+	// NOTE that the OFFSET parameter is not used in this version of the code!
+    array_bytes = array_elements * sizeof(STREAM_TYPE);
+    k = posix_memalign((void **)&a, array_alignment, array_bytes);
+    if (k != 0) {
+        printf("Rank %d: Allocation of array a failed, return code is %d\n",myrank,k);
+		MPI_Abort(MPI_COMM_WORLD, 2);
+        exit(1);
+    }
+    k = posix_memalign((void **)&b, array_alignment, array_bytes);
+    if (k != 0) {
+        printf("Rank %d: Allocation of array b failed, return code is %d\n",myrank,k);
+		MPI_Abort(MPI_COMM_WORLD, 2);
+        exit(1);
+    }
+    k = posix_memalign((void **)&c, array_alignment, array_bytes);
+    if (k != 0) {
+        printf("Rank %d: Allocation of array c failed, return code is %d\n",myrank,k);
+		MPI_Abort(MPI_COMM_WORLD, 2);
+        exit(1);
+    }
+
+	// Initial informational printouts -- rank 0 handles all the output
+	if (myrank == 0) {
+		printf(HLINE);
+		printf("STREAM version $Revision: 1.8 $\n");
+		printf(HLINE);
+		BytesPerWord = sizeof(STREAM_TYPE);
+		printf("This system uses %d bytes per array element.\n",
+		BytesPerWord);
+
+		printf(HLINE);
+#ifdef N
+		printf("*****  WARNING: ******\n");
+		printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
+		printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
+		printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
+		printf("*****  WARNING: ******\n");
+#endif
+		if (OFFSET != 0) {
+			printf("*****  WARNING: ******\n");
+			printf("   This version ignores the OFFSET parameter.\n");
+			printf("*****  WARNING: ******\n");
+		}
+
+		printf("Total Aggregate Array size = %llu (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE);
+		printf("Total Aggregate Memory per array = %.1f MiB (= %.1f GiB).\n", 
+			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
+			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
+		printf("Total Aggregate memory required = %.1f MiB (= %.1f GiB).\n",
+			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
+			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
+		printf("Data is distributed across %d MPI ranks\n",numranks);
+		printf("   Array size per MPI rank = %llu (elements)\n" , (unsigned long long) array_elements);
+		printf("   Memory per array per MPI rank = %.1f MiB (= %.1f GiB).\n", 
+			BytesPerWord * ( (double) array_elements / 1024.0/1024.0),
+			BytesPerWord * ( (double) array_elements / 1024.0/1024.0/1024.0));
+		printf("   Total memory per MPI rank = %.1f MiB (= %.1f GiB).\n",
+			(3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024.),
+			(3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024./1024.));
+
+		printf(HLINE);
+		printf("Each kernel will be executed %d times.\n", NTIMES);
+		printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
+		printf(" will be used to compute the reported bandwidth.\n");
+		printf("The SCALAR value used for this run is %f\n",SCALAR);
+
+#ifdef _OPENMP
+		printf(HLINE);
+#pragma omp parallel 
+		{
+#pragma omp master
+		{
+			k = omp_get_num_threads();
+			printf ("Number of Threads requested for each MPI rank = %i\n",k);
+			}
+		}
+#endif
+
+#ifdef _OPENMP
+		k = 0;
+#pragma omp parallel
+#pragma omp atomic 
+			k++;
+		printf ("Number of Threads counted for rank 0 = %i\n",k);
+#endif
+
+	}
+
+    /* --- SETUP --- initialize arrays and estimate precision of timer --- */
+
+#pragma omp parallel for
+    for (j=0; j<array_elements; j++) {
+	    a[j] = 1.0;
+	    b[j] = 2.0;
+	    c[j] = 0.0;
+	}
+
+	// Rank 0 needs to allocate arrays to hold error data and timing data from
+	// all ranks for analysis and output.
+	// Allocate and instantiate the arrays here -- after the primary arrays 
+	// have been instantiated -- so there is no possibility of having these 
+	// auxiliary arrays mess up the NUMA placement of the primary arrays.
+
+	if (myrank == 0) {
+		// There are 3 average error values for each rank (using STREAM_TYPE).
+		AvgErrByRank = (double *) malloc(3 * sizeof(STREAM_TYPE) * numranks);
+		if (AvgErrByRank == NULL) {
+			printf("Ooops -- allocation of arrays to collect errors on MPI rank 0 failed\n");
+			MPI_Abort(MPI_COMM_WORLD, 2);
+		}
+		memset(AvgErrByRank,0,3*sizeof(STREAM_TYPE)*numranks);
+
+		// There are 4*NTIMES timing values for each rank (always doubles)
+		TimesByRank = (double *) malloc(4 * NTIMES * sizeof(double) * numranks);
+		if (TimesByRank == NULL) {
+			printf("Ooops -- allocation of arrays to collect timing data on MPI rank 0 failed\n");
+			MPI_Abort(MPI_COMM_WORLD, 3);
+		}
+		memset(TimesByRank,0,4*NTIMES*sizeof(double)*numranks);
+	}
+
+	// Simple check for granularity of the timer being used
+	if (myrank == 0) {
+		printf(HLINE);
+
+		if  ( (quantum = checktick()) >= 1) 
+		printf("Your timer granularity/precision appears to be "
+			"%d microseconds.\n", quantum);
+		else {
+		printf("Your timer granularity appears to be "
+			"less than one microsecond.\n");
+		quantum = 1;
+		}
+	}
+
+    /* Get initial timing estimate to compare to timer granularity. */
+	/* All ranks need to run this code since it changes the values in array a */
+    t = MPI_Wtime();
+#pragma omp parallel for
+    for (j = 0; j < array_elements; j++)
+		a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (MPI_Wtime() - t);
+
+	if (myrank == 0) {
+		printf("Each test below will take on the order"
+		" of %d microseconds.\n", (int) t  );
+		printf("   (= %d timer ticks)\n", (int) (t/quantum) );
+		printf("Increase the size of the arrays if this shows that\n");
+		printf("you are not getting at least 20 timer ticks per test.\n");
+
+		printf(HLINE);
+
+		printf("WARNING -- The above is only a rough guideline.\n");
+		printf("For best results, please be sure you know the\n");
+		printf("precision of your system timer.\n");
+		printf(HLINE);
+#ifdef VERBOSE
+		t1 = MPI_Wtime();
+		printf("VERBOSE: total setup time for rank 0 = %f seconds\n",t1-t0);
+		printf(HLINE);
+#endif
+	}
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+    // This code has more barriers and timing calls than are actually needed, but
+    // this should not cause a problem for arrays that are large enough to satisfy
+    // the STREAM run rules.
+	// MAJOR FIX!!!  Version 1.7 had the start timer for each loop *after* the
+	// MPI_Barrier(), when it should have been *before* the MPI_Barrier().
+    // 
+
+    scalar = SCALAR;
+    for (k=0; k<NTIMES; k++)
+	{
+		// kernel 1: Copy
+		t0 = MPI_Wtime();
+		MPI_Barrier(MPI_COMM_WORLD);
+#ifdef TUNED
+        tuned_STREAM_Copy();
+#else
+#pragma omp parallel for
+		for (j=0; j<array_elements; j++)
+			c[j] = a[j];
+#endif
+		MPI_Barrier(MPI_COMM_WORLD);
+		t1 = MPI_Wtime();
+		times[0][k] = t1 - t0;
+
+		// kernel 2: Scale
+		t0 = MPI_Wtime();
+		MPI_Barrier(MPI_COMM_WORLD);
+#ifdef TUNED
+        tuned_STREAM_Scale(scalar);
+#else
+#pragma omp parallel for
+		for (j=0; j<array_elements; j++)
+			b[j] = scalar*c[j];
+#endif
+		MPI_Barrier(MPI_COMM_WORLD);
+		t1 = MPI_Wtime();
+		times[1][k] = t1-t0;
+	
+		// kernel 3: Add
+		t0 = MPI_Wtime();
+		MPI_Barrier(MPI_COMM_WORLD);
+#ifdef TUNED
+        tuned_STREAM_Add();
+#else
+#pragma omp parallel for
+		for (j=0; j<array_elements; j++)
+			c[j] = a[j]+b[j];
+#endif
+		MPI_Barrier(MPI_COMM_WORLD);
+		t1 = MPI_Wtime();
+		times[2][k] = t1-t0;
+	
+		// kernel 4: Triad
+		t0 = MPI_Wtime();
+		MPI_Barrier(MPI_COMM_WORLD);
+#ifdef TUNED
+        tuned_STREAM_Triad(scalar);
+#else
+#pragma omp parallel for
+		for (j=0; j<array_elements; j++)
+			a[j] = b[j]+scalar*c[j];
+#endif
+		MPI_Barrier(MPI_COMM_WORLD);
+		t1 = MPI_Wtime();
+		times[3][k] = t1-t0;
+	}
+
+	t0 = MPI_Wtime();
+
+    /*	--- SUMMARY --- */
+
+	// Because of the MPI_Barrier() calls, the timings from any thread are equally valid. 
+    // The best estimate of the maximum performance is the minimum of the "outside the barrier"
+    // timings across all the MPI ranks.
+
+	// Gather all timing data to MPI rank 0
+	MPI_Gather(times, 4*NTIMES, MPI_DOUBLE, TimesByRank, 4*NTIMES, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+	// Rank 0 processes all timing data
+	if (myrank == 0) {
+		// for each iteration and each kernel, collect the minimum time across all MPI ranks
+		// and overwrite the rank 0 "times" variable with the minimum so the original post-
+		// processing code can still be used.
+		for (k=0; k<NTIMES; k++) {
+			for (j=0; j<4; j++) {
+				tmin = 1.0e36;
+				for (i=0; i<numranks; i++) {
+					// printf("DEBUG: Timing: iter %d, kernel %lu, rank %d, tmin %f, TbyRank %f\n",k,j,i,tmin,TimesByRank[4*NTIMES*i+j*NTIMES+k]);
+					tmin = MIN(tmin, TimesByRank[4*NTIMES*i+j*NTIMES+k]);
+				}
+				// printf("DEBUG: Final Timing: iter %d, kernel %lu, final tmin %f\n",k,j,tmin);
+				times[j][k] = tmin;
+			}
+		}
+
+	// Back to the original code, but now using the minimum global timing across all ranks
+		for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+		{
+		for (j=0; j<4; j++)
+			{
+			avgtime[j] = avgtime[j] + times[j][k];
+			mintime[j] = MIN(mintime[j], times[j][k]);
+			maxtime[j] = MAX(maxtime[j], times[j][k]);
+			}
+		}
+    
+		// note that "bytes[j]" is the aggregate array size, so no "numranks" is needed here
+		printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
+		for (j=0; j<4; j++) {
+			avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+
+			printf("%s%11.1f  %11.6f  %11.6f  %11.6f\n", label[j],
+			   1.0E-06 * bytes[j]/mintime[j],
+			   avgtime[j],
+			   mintime[j],
+			   maxtime[j]);
+		}
+		printf(HLINE);
+	}
+
+    /* --- Every Rank Checks its Results --- */
+#ifdef INJECTERROR
+	a[11] = 100.0 * a[11];
+#endif
+	computeSTREAMerrors(&AvgError[0], &AvgError[1], &AvgError[2]);
+	/* --- Collect the Average Errors for Each Array on Rank 0 --- */
+	MPI_Gather(AvgError, 3, MPI_DOUBLE, AvgErrByRank, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+	/* -- Combined averaged errors and report on Rank 0 only --- */
+	if (myrank == 0) {
+#ifdef VERBOSE
+		for (k=0; k<numranks; k++) {
+			printf("VERBOSE: rank %d, AvgErrors %e %e %e\n",k,AvgErrByRank[3*k+0],
+				AvgErrByRank[3*k+1],AvgErrByRank[3*k+2]);
+		}
+#endif
+		checkSTREAMresults(AvgErrByRank,numranks);
+		printf(HLINE);
+	}
+
+#ifdef VERBOSE
+	if (myrank == 0) {
+		t1 = MPI_Wtime();
+		printf("VERBOSE: total shutdown time for rank %d = %f seconds\n",myrank,t1-t0);
+	}
+#endif
+
+	free(a);
+	free(b);
+	free(c);
+	if (myrank == 0) {
+		free(TimesByRank);
+		free(AvgErrByRank);
+	}
+
+    MPI_Finalize();
+	return(0);
+}
+
+# define	M	20
+
+int
+checktick()
+{
+	/* Estimate the granularity of MPI_Wtime() in whole microseconds. */
+	double	samples[M];	/* M successive timer readings */
+	double	start, now;
+	int	n, gap, smallest;
+
+	/*
+	 * Take M readings.  For each one, spin until the timer has moved
+	 * at least one microsecond past a freshly sampled start value, and
+	 * record the reading that ended the spin.
+	 */
+	for (n = 0; n < M; n++) {
+		start = MPI_Wtime();
+		do {
+			now = MPI_Wtime();
+		} while ((now - start) < 1.0E-6);
+		samples[n] = now;
+	}
+
+	/* The smallest gap between neighbouring readings is the estimate. */
+	smallest = 1000000;
+	for (n = 1; n < M; n++) {
+		gap = (int)(1.0E6 * (samples[n] - samples[n-1]));
+		smallest = MIN(smallest, MAX(gap, 0));
+	}
+	return smallest;
+}
+
+
+// ----------------------------------------------------------------------------------
+// For the MPI code I separate the computation of errors from the error
+// reporting output functions (which are handled by MPI rank 0).
+// ----------------------------------------------------------------------------------
+#ifndef abs
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+#endif
+void computeSTREAMerrors(STREAM_TYPE *aAvgErr, STREAM_TYPE *bAvgErr, STREAM_TYPE *cAvgErr)
+{
+	/*
+	 * Replay the benchmark on three scalars to get the value every element
+	 * of a[], b[] and c[] should hold, then store the mean absolute
+	 * deviation of this rank's arrays from those values in *aAvgErr,
+	 * *bAvgErr and *cAvgErr.
+	 */
+	STREAM_TYPE wantA = 1.0, wantB = 2.0, wantC = 0.0;	/* initial fill */
+	STREAM_TYPE devA = 0.0, devB = 0.0, devC = 0.0;		/* error sums */
+	STREAM_TYPE factor = SCALAR;
+	ssize_t	idx;
+	int	pass;
+
+	/* the timer-calibration loop doubles a[] once before the kernels run */
+	wantA = 2.0E0 * wantA;
+
+	/* one Copy, Scale, Add, Triad sequence per benchmark iteration */
+	for (pass = 0; pass < NTIMES; pass++) {
+		wantC = wantA;
+		wantB = factor*wantC;
+		wantC = wantA+wantB;
+		wantA = wantB+factor*wantC;
+	}
+
+	/* sum |observed - expected| over the local portion of each array */
+	for (idx = 0; idx < array_elements; idx++) {
+		devA += abs(a[idx] - wantA);
+		devB += abs(b[idx] - wantB);
+		devC += abs(c[idx] - wantC);
+	}
+
+	*aAvgErr = devA / (STREAM_TYPE) array_elements;
+	*bAvgErr = devB / (STREAM_TYPE) array_elements;
+	*cAvgErr = devC / (STREAM_TYPE) array_elements;
+}
+
+
+
+void checkSTREAMresults (STREAM_TYPE *AvgErrByRank, int numranks)
+{
+	/*
+	 * Validate the run and print the verdict (main calls this on rank 0).
+	 * AvgErrByRank holds three values per rank: the average absolute
+	 * errors of a[], b[] and c[].  Their mean over numranks ranks, taken
+	 * relative to the expected element value, must not exceed an epsilon
+	 * chosen from sizeof(STREAM_TYPE).  For a failing array the caller's
+	 * own elements are rescanned and the out-of-tolerance ones counted.
+	 */
+	STREAM_TYPE aj,bj,cj,scalar;
+	STREAM_TYPE aSumErr,bSumErr,cSumErr;
+	STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
+	double epsilon;
+	ssize_t	j;
+	int	k,ierr,err;
+
+	/* expected values: initial fill, a[] doubled by the timing check, NTIMES kernel passes */
+	aj = 1.0;
+	bj = 2.0;
+	cj = 0.0;
+	aj = 2.0E0 * aj;
+	scalar = SCALAR;
+	for (k=0; k<NTIMES; k++) {
+		cj = aj;
+		bj = scalar*cj;
+		cj = aj+bj;
+		aj = bj+scalar*cj;
+	}
+
+	/* average of the average errors contributed by each MPI rank */
+	aSumErr = bSumErr = cSumErr = 0.0;
+	for (k=0; k<numranks; k++) {
+		aSumErr += AvgErrByRank[3*k + 0];
+		bSumErr += AvgErrByRank[3*k + 1];
+		cSumErr += AvgErrByRank[3*k + 2];
+	}
+	aAvgErr = aSumErr / (STREAM_TYPE) numranks;
+	bAvgErr = bSumErr / (STREAM_TYPE) numranks;
+	cAvgErr = cSumErr / (STREAM_TYPE) numranks;
+
+	if (sizeof(STREAM_TYPE) == 4) {
+		epsilon = 1.e-6;
+	} else if (sizeof(STREAM_TYPE) == 8) {
+		epsilon = 1.e-13;
+	} else {
+		printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",(unsigned long) sizeof(STREAM_TYPE));
+		epsilon = 1.e-6;
+	}
+
+	err = 0;
+	if (abs(aAvgErr/aj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
+		ierr = 0;
+		for (j=0; j<array_elements; j++) {
+			if (abs(a[j]/aj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {	/* detail the first nine only; error is relative to aj */
+					printf("         array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						(long) j,aj,a[j],abs((aj-a[j])/aj));
+				}
+#endif
+			}
+		}
+		printf("     For array a[], %d errors were found.\n",ierr);
+	}
+	if (abs(bAvgErr/bj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<array_elements; j++) {
+			if (abs(b[j]/bj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {	/* detail the first nine only; error is relative to bj */
+					printf("         array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						(long) j,bj,b[j],abs((bj-b[j])/bj));
+				}
+#endif
+			}
+		}
+		printf("     For array b[], %d errors were found.\n",ierr);
+	}
+	if (abs(cAvgErr/cj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<array_elements; j++) {
+			if (abs(c[j]/cj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {	/* detail the first nine only; error is relative to cj */
+					printf("         array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						(long) j,cj,c[j],abs((cj-c[j])/cj));
+				}
+#endif
+			}
+		}
+		printf("     For array c[], %d errors were found.\n",ierr);
+	}
+	if (err == 0) {
+		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
+	}
+#ifdef VERBOSE
+	printf ("Results Validation Verbose Results: \n");
+	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
+	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
+	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
+#endif
+}
+
+#ifdef TUNED
+/* stubs for "tuned" versions of the kernels */
+void tuned_STREAM_Copy()
+{
+	ssize_t elem;	/* Copy stub: c[] = a[] over array_elements entries */
+#pragma omp parallel for
+	for (elem = 0; elem < array_elements; ++elem)
+		c[elem] = a[elem];
+}
+
+void tuned_STREAM_Scale(STREAM_TYPE scalar)
+{
+	ssize_t elem;	/* Scale stub: b[] = scalar * c[] over array_elements entries */
+#pragma omp parallel for
+	for (elem = 0; elem < array_elements; ++elem)
+		b[elem] = scalar*c[elem];
+}
+
+void tuned_STREAM_Add()
+{
+	ssize_t elem;	/* Add stub: c[] = a[] + b[] over array_elements entries */
+#pragma omp parallel for
+	for (elem = 0; elem < array_elements; ++elem)
+		c[elem] = a[elem]+b[elem];
+}
+
+void tuned_STREAM_Triad(STREAM_TYPE scalar)
+{
+	ssize_t elem;	/* Triad stub: a[] = b[] + scalar * c[] over array_elements entries */
+#pragma omp parallel for
+	for (elem = 0; elem < array_elements; ++elem)
+		a[elem] = b[elem]+scalar*c[elem];
+}
+/* end of stubs for the "tuned" versions of the kernels */
+#endif
diff --git a/Versions/stream_mpi.f b/Versions/stream_mpi.f
new file mode 100644
index 0000000..dc70cb3
--- /dev/null
+++ b/Versions/stream_mpi.f
@@ -0,0 +1,575 @@
+* Program: STREAM
+* Programmer: John D. McCalpin, P. Vezolle
+*
+*-----------------------------------------------------------------------
+* Copyright 1991-2003: John D. McCalpin
+*-----------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "STREAM benchmark results",
+*         published results must be in conformance to the STREAM
+*         Run Rules, (briefly reviewed below) published at
+*         http://www.cs.virginia.edu/stream/ref.html
+*         and incorporated herein by reference.
+*         As the copyright holder, John McCalpin retains the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned STREAM benchmark results"
+*         "based on a variant of the STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+*-----------------------------------------------------------------------
+*                            MPI VERSION
+*
+*  Latest Modification: February 14, 2014 (John D. McCalpin)
+*     Minor update to output formatting to prevent overflows.
+*  Latest Modification: May 17, 2002 (P. Vezolle)
+*  Based on STREAM version 5.0, July 30, 2000
+*
+* This version has been shown to work under the MPI environment
+* of IBM's AIX 5.1 --- comments or suggestions on how to improve
+* portability are welcome!    mailto:john@mccalpin.com
+* 
+* This program measures memory transfer rates in MB/s for simple
+* computational kernels coded in Fortran.  
+* The intent is to demonstrate the extent to which ordinary user
+* code can exploit the main memory bandwidth of the system under
+* test.
+*=========================================================================
+* The STREAM web page is at:
+*          http://www.streambench.org
+*
+* Most of the content is currently hosted at:
+*          http://www.cs.virginia.edu/stream/
+*
+* BRIEF INSTRUCTIONS: 
+*       0) See http://www.cs.virginia.edu/stream/ref.html for details
+*       1) STREAM requires a timing function called mysecond().
+*          Several examples are provided in this directory.
+*          "CPU" timers are only allowed for uniprocessor runs.
+*          "Wall-clock" timers are required for all multiprocessor runs.
+*       2) The STREAM array sizes must be set to size the test.
+*          The value "N" must be chosen so that each of the three
+*          arrays is at least 4x larger than the sum of all the last-
+*          level caches used in the run, or 1 million elements, which-
+*          ever is larger.
+*          ------------------------------------------------------------
+*          Note that you are free to use any array length and offset
+*          that makes each array 4x larger than the last-level cache.
+*          The intent is to determine the *best* sustainable bandwidth
+*          available with this simple coding.  Of course, lower values
+*          are usually fairly easy to obtain on cached machines, but 
+*          by keeping the test to the *best* results, the answers are
+*          easier to interpret.
+*          You may put the arrays in common or not, at your discretion.
+*          There is a commented-out COMMON statement below.
+*          Fortran90 "allocatable" arrays are fine, too.
+*          ------------------------------------------------------------
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me.
+*          Please let me know if this happens.
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*          Please let me know if you do not want your name posted along
+*          with the submitted results.
+*       5) See the web page for more comments about the run rules and
+*          about interpretation of the results.
+*
+* Thanks,
+*   Dr. Bandwidth
+*=========================================================================
+*
+      PROGRAM stream_mpi
+*     IMPLICIT NONE
+*     Every MPI task times Copy, Scale, Add and Triad on its own a, b, c.
+      include 'mpif.h'
+
+C     .. Parameters: n = elements per array on each task ..
+      INTEGER  n,offset,ndim,ntimes
+      PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
+
+*     .. MPI Variables: task count, rank of this task, MPI error code ..
+      integer numtask, rank
+      integer rc
+      DOUBLE PRECISION rc1
+
+C     ..
+C     .. Local Scalars ..
+      DOUBLE PRECISION scalar,t
+      INTEGER j,k,nbpw,quantum
+C     ..
+C     .. Local Arrays: times(j,k) = seconds for kernel j, iteration k ..
+      DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
+     $                 times(4,ntimes)
+      INTEGER bytes(4)
+      CHARACTER*11 label(4)
+C     ..
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      INTEGER checktick,realsize
+      EXTERNAL mysecond,checktick,realsize
+C     ..
+C     .. Intrinsic Functions ..
+C
+      INTRINSIC dble,max,min,nint,sqrt
+C     ..
+C     .. Arrays in Common ..
+      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
+C     ..
+C     .. Common blocks ..
+*     COMMON a,b,c
+C     ..
+C     .. Data: bytes(j) is multiplied by n*nbpw to get bytes moved ..
+      DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Copy:      ','Scale:     ','Add:       ',
+     $     'Triad:     '/
+      DATA bytes/2,2,3,3/
+C     ..
+
+*       --- SETUP --- determine precision and check timing ---
+
+*     .. MPI Initialization ..
+
+      call MPI_INIT ( rc )
+      if ( rc .ne. 0 ) then
+         WRITE(*,*) ' MPI Initialization problem, error code: ',rc
+         stop
+      endif
+      call MPI_COMM_RANK ( MPI_COMM_WORLD, rank, rc )
+      call MPI_COMM_SIZE ( MPI_COMM_WORLD, numtask, rc )
+*     Task 0 alone calls realsize and prints the run configuration.
+      if ( rank .eq. 0 ) then
+        nbpw = realsize()
+        WRITE (*,*)        'Number of processors = ', numtask
+        WRITE (*,FMT=9010) 'Array size = ',n
+        WRITE (*,FMT=9010) 'Offset     = ',offset
+        WRITE (*,FMT=9060) 'The total memory requirement is ',
+     $     3.0*nbpw*n*numtask/(1024*1024),' MB',
+     $     ' (',3.0*nbpw*n/(1024*1024),'MB/task)'
+        WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
+        WRITE (*,FMT=9030) '--'
+        WRITE (*,FMT=9030) 'The *best* time for each test is used'
+        WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
+      endif
+*     Fill the arrays, then time one pass that halves a (t is not printed).
+      DO 10 j = 1,n
+          a(j) = 2.0d0
+          b(j) = 0.5D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = mysecond()
+      DO 20 j = 1,n
+          a(j) = 0.5d0*a(j)
+   20 CONTINUE
+      t = mysecond() - t
+      if ( rank .eq. 0 ) then
+        PRINT *,'----------------------------------------------------'
+        quantum = checktick()
+        WRITE (*,FMT=9000)
+     $    'Your clock granularity/precision appears to be ',quantum,
+     $    ' microseconds'
+        PRINT *,'----------------------------------------------------'
+      endif
+
+      call MPI_BARRIER( MPI_COMM_WORLD, rc)
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      scalar = 0.5d0*a(1)
+      DO 70 k = 1,ntimes
+*     Each timer starts after one barrier and stops after the next one.
+!********** COPY
+*     t is added to a first/last element to confuse aggressive optimizers.
+        call MPI_BARRIER( MPI_COMM_WORLD, rc)
+          t = mysecond()
+          a(1) = a(1) + t
+          DO 30 j = 1,n
+              c(j) = a(j)
+   30     CONTINUE
+        call MPI_BARRIER( MPI_COMM_WORLD, rc)
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(1,k) = t
+
+!********** SCALE
+
+        call MPI_BARRIER( MPI_COMM_WORLD, rc)
+          t = mysecond()
+          c(1) = c(1) + t
+          DO 40 j = 1,n
+              b(j) = scalar*c(j)
+   40     CONTINUE
+        call MPI_BARRIER( MPI_COMM_WORLD, rc)
+          t = mysecond() - t
+          b(n) = b(n) + t
+          times(2,k) = t
+
+!********** ADD
+
+        call MPI_BARRIER( MPI_COMM_WORLD, rc)
+          t = mysecond()
+          a(1) = a(1) + t
+          DO 50 j = 1,n
+              c(j) = a(j) + b(j)
+   50     CONTINUE
+        call MPI_BARRIER( MPI_COMM_WORLD, rc)
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(3,k) = t
+
+!********** TRIAD
+
+        call MPI_BARRIER( MPI_COMM_WORLD, rc)
+          t = mysecond()
+          b(1) = b(1) + t
+          DO 60 j = 1,n
+              a(j) = b(j) + scalar*c(j)
+   60     CONTINUE
+        call MPI_BARRIER( MPI_COMM_WORLD, rc)
+          t = mysecond() - t
+          a(n) = a(n) + t
+          times(4,k) = t
+   70 CONTINUE
+*     Task 0 reports from its own times(); rc1 is per task, scaled by numtask.
+*       --- SUMMARY ---
+      if ( rank .eq. 0 ) then
+        DO 90 k = 2,ntimes-1
+          DO 80 j = 1,4
+              avgtime(j) = avgtime(j) + times(j,k)
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   80     CONTINUE
+   90   CONTINUE
+        WRITE (*,FMT=9040)
+        DO 100 j = 1,4
+          avgtime(j) = avgtime(j)/dble(ntimes-2)
+          rc1 = 1.*n*bytes(j)*nbpw/mintime(j)/1.0D6
+          WRITE (*,FMT=9050) label(j),rc1*numtask,
+     $      avgtime(j),mintime(j),maxtime(j)
+  100   CONTINUE
+       endif
+       CALL checksums (a,b,c,n,ntimes,rank,numtask)
+
+ 1001 continue
+
+      call MPI_FINALIZE ( rc )
+      stop
+
+ 9000 FORMAT (1x,a,i6,a)
+ 9010 FORMAT (1x,a,i10)
+ 9020 FORMAT (1x,a,i4,a)
+ 9030 FORMAT (1x,a,i3,a,a)
+ 9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
+     $       )
+ 9050 FORMAT (a,f12.1,3 (f11.6,2x))
+ 9060 FORMAT (1x,a,f9.1,a,a,f9.1,a)
+      END
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLE PRECISION
+* in Fortran, used to guess how many bytes of storage a DOUBLE
+* PRECISION number occupies: 4 if at most 8 decimal digits are
+* detected, otherwise 8.  If detection fails the size is read from input.
+*
+      INTEGER FUNCTION realsize()
+*     IMPLICIT NONE
+
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ..
+C     .. External Subroutines ..
+      EXTERNAL confuse
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC abs,acos,log10,sqrt
+C     ..
+C       Test #1 - compare 1.0d0+10**(-j) to 1.0d0 for j = 1..30
+C     NOTE(review): confuse writes only to result, which is unused here;
+C     presumably the call stops the compiler folding the test -- confirm.
+   10 DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0** (-j)
+   20 CONTINUE
+C     ndigits = first j whose sum compares equal to 1.0d0.
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL confuse(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 40
+          END IF
+   30 CONTINUE
+      GO TO 50
+C     Precision found: report it and pick the word size.
+   40 WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLE PRECISION word'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      RETURN
+C     No j compared equal to 1.0d0: ask the user, re-prompting once only.
+   50 PRINT *,'Hmmmm.  I am unable to determine the size.'
+      PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense.'
+          PRINT *,'Try again.'
+          PRINT *,'Please enter the number of Bytes per ',
+     $      'DOUBLE PRECISION number : '
+          READ (*,FMT=*) realsize
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per DOUBLE PRECISION number'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      END
+
+      SUBROUTINE confuse(q,r)
+*     IMPLICIT NONE
+*
+*     Store the cosine of q in r.  q itself is only read.
+*
+      DOUBLE PRECISION q,r
+      INTRINSIC DCOS
+*
+      r = DCOS(q)
+*     (control returns to the caller at END)
+      END
+
+* A semi-portable way to determine the clock granularity
+* Adapted from a code by John Henning of Digital Equipment Corporation
+*
+      INTEGER FUNCTION checktick()
+*     IMPLICIT NONE
+*
+*     Estimate the granularity of mysecond() in whole microseconds:
+*     record n readings that each differ from the one before, and
+*     return the smallest gap between neighbours.  A gap that rounds
+*     to zero is reported with a message and returned as 1.
+*
+      INTEGER n
+      PARAMETER (n=20)
+      DOUBLE PRECISION mysecond
+      EXTERNAL mysecond
+      INTRINSIC max,min,nint
+*
+      DOUBLE PRECISION timesfound(n)
+      DOUBLE PRECISION tlast,tnow
+      INTEGER i,gap,smallest
+*
+*     Poll the timer until its value changes, n times over.
+      tlast = mysecond()
+      DO 15 i = 1,n
+          tnow = mysecond()
+          DO WHILE (tnow.EQ.tlast)
+              tnow = mysecond()
+          END DO
+          tlast = tnow
+          timesfound(i) = tnow
+   15 CONTINUE
+*
+*     Smallest difference between consecutive readings, clamped at 0.
+      smallest = 1000000
+      DO 20 i = 2,n
+          gap = nint((timesfound(i)-timesfound(i-1))*1d6)
+          smallest = min(smallest,max(gap,0))
+   20 CONTINUE
+*
+*     A zero minimum means the timer resolves below one microsecond.
+      IF (smallest.LE.0) THEN
+          PRINT *,'Your clock granularity appears to be less ',
+     $      'than one microsecond'
+          checktick = 1
+      ELSE
+          checktick = smallest
+      END IF
+      RETURN
+*
+*     Optional debug listing of the readings (left disabled):
+*      PRINT 14, timesfound(1)*1d6
+*      DO 20 i=2,n
+*         PRINT 14, timesfound(i)*1d6,
+*     &       nint((timesfound(i)-timesfound(i-1))*1d6)
+*   14    FORMAT (1X, F18.4, 1X, i8)
+*   20 CONTINUE
+*
+      END
+
+
+      SUBROUTINE checksums(a,b,c,n,ntimes,rank,numtask)
+*     IMPLICIT NONE
+      include 'mpif.h'
+C     Checks the sums of a, b and c against a scalar replay of the run.
+C     .. Arguments (rc, Stat, Status and ival are in fact locals) ..
+      DOUBLE PRECISION a(*),b(*),c(*)
+      INTEGER n,ntimes
+      INTEGER rank,numtask,rc, Stat(MPI_STATUS_SIZE)
+      DOUBLE PRECISION Status(2), ival
+C     Status(1) = expected sum on failure, else 0; Status(2) set on failure
+C     .. Local Scalars ..
+      DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon
+      INTEGER k
+C     ..
+C     ival accumulates every Status(1) that task 0 sees.
+      ival = 0
+
+C     Repeat the main loop, but with scalars only.
+C     This is done to check the sum & make sure all
+C     iterations have been executed correctly.
+
+      aa = 2.0D0
+      bb = 0.5D0
+      cc = 0.0D0
+      aa = 0.5D0*aa
+      scalar = 0.5d0*aa
+      DO k = 1,ntimes
+          cc = aa
+          bb = scalar*cc
+          cc = aa + bb
+          aa = bb + scalar*cc
+      END DO
+      aa = aa*DBLE(n-2)
+      bb = bb*DBLE(n-2)
+      cc = cc*DBLE(n-2)
+
+C     Now sum up the arrays, excluding the first and last
+C     elements, which are modified using the timing results
+C     to confuse aggressive optimizers.
+
+      suma = 0.0d0
+      sumb = 0.0d0
+      sumc = 0.0d0
+!$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
+      DO 110 j = 2,n-1
+          suma = suma + a(j)
+          sumb = sumb + b(j)
+          sumc = sumc + c(j)
+  110 CONTINUE
+
+      epsilon = 1.D-6
+C     epsilon is the relative tolerance applied to each of the three sums.
+*     .. Array a: every task tests its sum; task 0 gathers (tag = rank)
+      IF (ABS(suma-aa)/suma .GT. epsilon) THEN
+          Status(1) = aa
+          Status(2) = suma
+      ELSE
+           Status(1) = 0
+      ENDIF
+      if ( rank .ne. 0 ) then
+          call MPI_SEND(Status, 2, MPI_DOUBLE_PRECISION, 0, rank,
+     $                  MPI_COMM_WORLD, rc)
+      else
+         ival = ival + Status(1)
+         if ( Status(1) .ne. 0 ) then
+            PRINT *,'Failed Validation on array a(), Process ',rank
+            PRINT *,'Target   Sum of a is = ',Status(1)
+            PRINT *,'Computed Sum of a is = ',Status(2)
+         endif
+         do i=1,numtask-1
+          call MPI_RECV(Status, 2, MPI_DOUBLE_PRECISION, i, i,
+     s                  MPI_COMM_WORLD, Stat, rc);
+          ival = ival + Status(1)
+          if ( Status(1) .ne. 0 ) then
+            PRINT *,'Failed Validation on array a(), Process ',i
+            PRINT *,'Target   Sum of a is = ',Status(1)
+            PRINT *,'Computed Sum of a is = ',Status(2)
+          endif
+         enddo
+      endif
+*     .. Array b: same protocol as for a, with message tag 2*rank
+      IF (ABS(sumb-bb)/sumb .GT. epsilon) THEN
+          Status(1) = bb
+          Status(2) = sumb
+      ELSE
+           Status(1) = 0
+      ENDIF
+      if ( rank .ne. 0 ) then
+          call MPI_SEND(Status, 2, MPI_DOUBLE_PRECISION, 0, 2*rank,
+     $                  MPI_COMM_WORLD, rc)
+      else
+         ival = ival + Status(1)
+         if ( Status(1) .ne. 0 ) then
+            PRINT *,'Failed Validation on array b(), Process ',rank
+            PRINT *,'Target   Sum of b is = ',Status(1)
+            PRINT *,'Computed Sum of b is = ',Status(2)
+         endif
+         do i=1,numtask-1
+          call MPI_RECV(Status, 2, MPI_DOUBLE_PRECISION, i, 2*i,
+     s                  MPI_COMM_WORLD, Stat, rc);
+          ival = ival + Status(1)
+          if ( Status(1) .ne. 0 ) then
+            PRINT *,'Failed Validation on array b(), Process ',i
+            PRINT *,'Target   Sum of b is = ',Status(1)
+            PRINT *,'Computed Sum of b is = ',Status(2)
+          endif
+         enddo
+      endif
+*     .. Array c: same protocol as for a, with message tag 3*rank
+      IF (ABS(sumc-cc)/sumc .GT. epsilon) THEN
+          Status(1) = cc
+          Status(2) = sumc
+      ELSE
+           Status(1) = 0
+      ENDIF
+      if ( rank .ne. 0 ) then
+          call MPI_SEND(Status, 2, MPI_DOUBLE_PRECISION, 0, 3*rank,
+     $                  MPI_COMM_WORLD, rc)
+      else
+         ival = ival + Status(1)
+         if ( Status(1) .ne. 0 ) then
+            PRINT *,'Failed Validation on array c(), Process ',rank
+            PRINT *,'Target   Sum of c is = ',Status(1)
+            PRINT *,'Computed Sum of c is = ',Status(2)
+         endif
+         do i=1,numtask-1
+          call MPI_RECV(Status, 2, MPI_DOUBLE_PRECISION, i, 3*i,
+     s                  MPI_COMM_WORLD, Stat, rc);
+          ival = ival + Status(1)
+          if ( Status(1) .ne. 0 ) then
+            PRINT *,'Failed Validation on array c(), Process ',i
+            PRINT *,'Target   Sum of c is = ',Status(1)
+            PRINT *,'Computed Sum of c is = ',Status(2)
+          endif
+         enddo
+*     A zero total of the reported Status(1) values is taken as success.
+         if ( ival .eq. 0. ) then
+           PRINT *,'-----------------------------------------------'
+           PRINT*,'Solution Validates!'
+           PRINT *,'-----------------------------------------------'
+         endif
+      endif
+
+      END
diff --git a/Versions/stream_omp.c b/Versions/stream_omp.c
new file mode 100644
index 0000000..c1f34c1
--- /dev/null
+++ b/Versions/stream_omp.c
@@ -0,0 +1,402 @@
+/*-----------------------------------------------------------------------*/
+/* Program: Stream                                                       */
+/* Revision: $Id: stream_omp.c,v 5.4 2009/02/19 13:57:12 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2003: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*         "tuned STREAM benchmark results"                              */
+/*         "based on a variant of the STREAM benchmark code"             */
+/*         Other comparable, clear and reasonable labelling is           */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/* INSTRUCTIONS:
+ *
+ *	1) Stream requires a good bit of memory to run.  Adjust the
+ *          value of 'N' (below) to give a 'timing calibration' of 
+ *          at least 20 clock-ticks.  This will provide rate estimates
+ *          that should be good to about 5% precision.
+ */
+
+# define N	2000000	/* elements of a[], b[], c[] processed by every loop */
+# define NTIMES	10	/* how many times the four kernels are repeated */
+# define OFFSET	0	/* extra elements added to each array's declared length */
+
+/*
+ *	3) Compile the code with full optimization.  Many compilers
+ *	   generate unreasonably bad code before the optimizer tightens
+ *	   things up.  If the results are unreasonably good, on the
+ *	   other hand, the optimizer might be too smart for me!
+ *
+ *         Try compiling with:
+ *               cc -O stream_omp.c -o stream_omp
+ *
+ *         This is known to work on Cray, SGI, IBM, and Sun machines.
+ *
+ *
+ *	4) Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include:
+ *		a) computer hardware model number and software revision
+ *		b) the compiler flags
+ *		c) all of the output from the test case.
+ * Thanks!
+ *
+ */
+
+# define HLINE "-------------------------------------------------------------\n"
+/* HLINE (above) is the separator row printed between report sections. */
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))	/* NB: one argument is evaluated twice */
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))	/* NB: one argument is evaluated twice */
+# endif
+
+static double	a[N+OFFSET],	/* the three work arrays the kernels read and write */
+		b[N+OFFSET],
+		c[N+OFFSET];
+/* Per-kernel timing statistics in seconds; index order Copy, Scale, Add, Triad. */
+static double	avgtime[4] = {0}, maxtime[4] = {0},
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+/* Row labels of the results table, in the same kernel order. */
+static char	*label[4] = {"Copy:      ", "Scale:     ",
+    "Add:       ", "Triad:     "};
+/* Bytes moved by one pass of each kernel: arrays touched * sizeof(double) * N. */
+static double	bytes[4] = {
+    2 * sizeof(double) * N,	/* Copy:  reads a, writes c */
+    2 * sizeof(double) * N,	/* Scale: reads c, writes b */
+    3 * sizeof(double) * N,	/* Add:   reads a and b, writes c */
+    3 * sizeof(double) * N	/* Triad: reads b and c, writes a */
+    };
+/* Defined later in this file; main() calls the tuned_* kernels only when TUNED is defined. */
+extern double mysecond();
+extern void checkSTREAMresults();
+#ifdef TUNED
+extern void tuned_STREAM_Copy();
+extern void tuned_STREAM_Scale(double scalar);
+extern void tuned_STREAM_Add();
+extern void tuned_STREAM_Triad(double scalar);
+#endif
+/*
+ * Benchmark driver: prints the configuration, estimates the timer granularity,
+ * times the Copy/Scale/Add/Triad kernels NTIMES times, prints the rate table
+ * and validates the arrays.  Always returns 0. */
+int
+main()
+    {
+    int			quantum, checktick();
+    int			omp_get_num_threads();	/* <omp.h> is not included by this file */
+    int			BytesPerWord;
+    register int	j, k;
+    double		scalar, t, times[4][NTIMES];
+
+    /* --- SETUP --- determine precision and check timing --- */
+    printf(HLINE);
+    BytesPerWord = sizeof(double);
+    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
+	BytesPerWord);
+
+    printf(HLINE);
+    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
+    printf("Total memory required = %.1f MB.\n",
+	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
+    printf("Each test is run %d times, but only\n", NTIMES);
+    printf("the *best* time for each is used.\n");
+
+#ifdef _OPENMP
+    printf(HLINE);
+#pragma omp parallel
+    {	/* master only: report the team size once instead of once per thread */
+#pragma omp master
+    printf ("Number of Threads requested = %i\n", omp_get_num_threads());
+    }
+#endif
+
+    /* Initialize the three arrays (in a parallel loop, like the kernels). */
+#pragma omp parallel for
+    for (j=0; j<N; j++) {
+	a[j] = 1.0;
+	b[j] = 2.0;
+	c[j] = 0.0;
+	}
+
+    printf(HLINE);
+    if  ( (quantum = checktick()) >= 1) 
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+	quantum = 1;	/* keeps the t/quantum tick estimate below finite */
+	}
+
+    t = mysecond();
+#pragma omp parallel for
+    for (j = 0; j < N; j++)
+	a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);
+
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+    scalar = 3.0;
+    for (k=0; k<NTIMES; k++)
+	{
+	times[0][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Copy();
+#else
+#pragma omp parallel for
+	for (j=0; j<N; j++)
+	    c[j] = a[j];
+#endif
+	times[0][k] = mysecond() - times[0][k];
+	
+	times[1][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Scale(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<N; j++)
+	    b[j] = scalar*c[j];
+#endif
+	times[1][k] = mysecond() - times[1][k];
+	
+	times[2][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Add();
+#else
+#pragma omp parallel for
+	for (j=0; j<N; j++)
+	    c[j] = a[j]+b[j];
+#endif
+	times[2][k] = mysecond() - times[2][k];
+	
+	times[3][k] = mysecond();
+#ifdef TUNED
+        tuned_STREAM_Triad(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<N; j++)
+	    a[j] = b[j]+scalar*c[j];
+#endif
+	times[3][k] = mysecond() - times[3][k];
+	}
+
+    /*	--- SUMMARY --- */
+    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+	{
+	for (j=0; j<4; j++)
+	    {
+	    avgtime[j] = avgtime[j] + times[j][k];
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+    
+    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+	avgtime[j] = avgtime[j]/(double)(NTIMES-1);
+	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
+	       1.0E-06 * bytes[j]/mintime[j],
+	       avgtime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    printf(HLINE);
+    /* --- Check Results --- */
+    checkSTREAMresults();
+    printf(HLINE);
+    return 0;
+}
+
+# define	M	20	/* how many timer readings checktick() gathers */
+
+int
+checktick()
+{
+    /* Estimate the timer resolution: returns the smallest gap, in whole
+     * microseconds, between M successive mysecond() readings (never
+     * negative and never more than 1000000). */
+    double	stamp[M];
+    double	start, now;
+    int		idx, gap, smallest = 1000000;
+
+    /* Phase 1: take M readings, each accepted once the clock is at least
+     * one microsecond past a freshly sampled starting value. */
+    for (idx = 0; idx < M; idx++) {
+	start = mysecond();
+	do {
+	    now = mysecond();
+	} while ((now - start) < 1.0E-6);
+	stamp[idx] = now;
+    }
+
+    /* Phase 2: the smallest non-negative difference between neighbouring
+     * readings is the granularity estimate. */
+    for (idx = 1; idx < M; idx++) {
+	gap = (int)( 1.0E6 * (stamp[idx] - stamp[idx-1]));
+	if (gap < 0) gap = 0;
+	if (gap < smallest) smallest = gap;
+    }
+    return smallest;
+}
+
+
+
+/* A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems.  */
+
+#include <sys/time.h>
+
+double mysecond()
+{
+        /* Wall-clock time from gettimeofday(), returned as seconds in a double.
+         * The timezone argument is NULL: POSIX leaves a non-null one unspecified.
+         * tp starts zeroed so a failed call yields 0.0, not indeterminate data. */
+        struct timeval tp = {0, 0};
+        (void) gettimeofday(&tp, NULL);
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+void checkSTREAMresults ()
+{
+	/*
+	 * Validate a[], b[], c[] after the benchmark.  The kernel sequence is
+	 * replayed on three scalars to get the value each element should hold;
+	 * N times those values is compared with the observed array sums using
+	 * a relative tolerance.  Only the first failing array (tested in the
+	 * order a, b, c) is reported; if none fails, success is printed.
+	 */
+	const double tolerance = 1.e-8;
+	double expectA = 1.0, expectB = 2.0, expectC = 0.0;	/* initial fill values */
+	double sumA = 0.0, sumB = 0.0, sumC = 0.0;
+	double factor = 3.0;
+	int	pass, idx;
+
+	/* a[] is doubled once by the timing-calibration loop in main() */
+	expectA = 2.0E0 * expectA;
+	/* replay the NTIMES rounds of Copy, Scale, Add, Triad */
+	for (pass = 0; pass < NTIMES; pass++) {
+		expectC = expectA;
+		expectB = factor*expectC;
+		expectC = expectA+expectB;
+		expectA = expectB+factor*expectC;
+	}
+	expectA *= (double) (N);
+	expectB *= (double) (N);
+	expectC *= (double) (N);
+
+	/* observed sums over the first N elements of each array */
+	for (idx = 0; idx < N; idx++) {
+		sumA += a[idx];
+		sumB += b[idx];
+		sumC += c[idx];
+	}
+#ifdef VERBOSE
+	printf ("Results Comparison: \n");
+	printf ("        Expected  : %f %f %f \n",expectA,expectB,expectC);
+	printf ("        Observed  : %f %f %f \n",sumA,sumB,sumC);
+#endif
+
+	/* abs() remains a macro defined from this point on, exactly as before */
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+	if (abs(expectA-sumA)/sumA > tolerance) {
+		printf ("Failed Validation on array a[]\n");
+		printf ("        Expected  : %f \n",expectA);
+		printf ("        Observed  : %f \n",sumA);
+		return;
+	}
+	if (abs(expectB-sumB)/sumB > tolerance) {
+		printf ("Failed Validation on array b[]\n");
+		printf ("        Expected  : %f \n",expectB);
+		printf ("        Observed  : %f \n",sumB);
+		return;
+	}
+	if (abs(expectC-sumC)/sumC > tolerance) {
+		printf ("Failed Validation on array c[]\n");
+		printf ("        Expected  : %f \n",expectC);
+		printf ("        Observed  : %f \n",sumC);
+		return;
+	}
+	printf ("Solution Validates\n");
+}
+
+void tuned_STREAM_Copy()	/* Copy kernel: c[i] = a[i] for i in [0, N) */
+{
+	int i;
+#pragma omp parallel for
+	for (i = 0; i < N; ++i)
+	    c[i] = a[i];
+}
+
+void tuned_STREAM_Scale(double scalar)	/* Scale kernel: b[i] = scalar * c[i] */
+{
+	int i;
+#pragma omp parallel for
+	for (i = 0; i < N; ++i)
+	    b[i] = scalar*c[i];
+}
+
+void tuned_STREAM_Add()	/* Add kernel: c[i] = a[i] + b[i] for i in [0, N) */
+{
+	int i;
+#pragma omp parallel for
+	for (i = 0; i < N; ++i)
+	    c[i] = a[i]+b[i];
+}
+
+void tuned_STREAM_Triad(double scalar)	/* Triad kernel: a[i] = b[i] + scalar * c[i] */
+{
+	int i;
+#pragma omp parallel for
+	for (i = 0; i < N; ++i)
+	    a[i] = b[i]+scalar*c[i];
+}
diff --git a/Versions/stream_tuned.f b/Versions/stream_tuned.f
new file mode 100644
index 0000000..5c51d4a
--- /dev/null
+++ b/Versions/stream_tuned.f
@@ -0,0 +1,485 @@
+*=========================================================================
+* Program: STREAM_TUNED
+* Programmer: John D. McCalpin
+* Revision: 1.0, November 6, 2002
+*-----------------------------------------------------------------------
+* Copyright 1991-2003: John D. McCalpin
+*-----------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "STREAM benchmark results",
+*         published results must be in conformance to the STREAM
+*         Run Rules, (briefly reviewed below) published at
+*         http://www.cs.virginia.edu/stream/ref.html
+*         and incorporated herein by reference.
+*         As the copyright holder, John McCalpin retains the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned STREAM benchmark results"
+*         "based on a variant of the STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+*-----------------------------------------------------------------------
+* STREAM measures memory transfer rates in MB/s for simple
+* computational kernels coded in Fortran.  
+*
+* The intent is to demonstrate the extent to which ordinary user
+* code can exploit the main memory bandwidth of the system under
+* test.
+*
+* This version is a simple harness to allow code optimization
+* in the context of the data flow and result checking of the
+* basic STREAM version 5.0 code.   Each of the four kernel loops
+* has been moved to a separate subroutine to allow easy code 
+* optimization or replacement.
+*
+*=========================================================================
+* THIS IS JUST A STARTING POINT --- IT HAS NOT BEEN OPTIMIZED YET!!!
+*=========================================================================
+* The STREAM web page is at:
+*          http://www.streambench.org
+*
+* Most of the content is currently hosted at:
+*          http://www.cs.virginia.edu/stream/
+*
+* BRIEF INSTRUCTIONS: 
+*       0) See http://www.cs.virginia.edu/stream/ref.html for details
+*       1) STREAM requires a timing function called mysecond().
+*          Several examples are provided in this directory.
+*          "CPU" timers are only allowed for uniprocessor runs.
+*          "Wall-clock" timers are required for all multiprocessor runs.
+*       2) The STREAM array sizes must be set to size the test.
+*          The value "N" must be chosen so that each of the three
+*          arrays is at least 4x larger than the sum of all the last-
+*          level caches used in the run, or 1 million elements, which-
+*          ever is larger.
+*          ------------------------------------------------------------
+*          Note that you are free to use any array length and offset
+*          that makes each array 4x larger than the last-level cache.
+*          The intent is to determine the *best* sustainable bandwidth
+*          available with this simple coding.  Of course, lower values
+*          are usually fairly easy to obtain on cached machines, but 
+*          by keeping the test to the *best* results, the answers are
+*          easier to interpret.
+*          You may put the arrays in common or not, at your discretion.
+*          There is a commented-out COMMON statement below.
+*          Fortran90 "allocatable" arrays are fine, too.
+*          ------------------------------------------------------------
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me
+*          Please let me know if this happens.
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*          Please let me know if you do not want your name posted along
+*          with the submitted results.
+*       5) See the web page for more comments about the run rules and
+*          about interpretation of the results.
+*
+* Thanks,
+*   Dr. Bandwidth
+*=========================================================================
+* Main program: times the four STREAM kernels and prints the rate table.
+      PROGRAM stream
+*     IMPLICIT NONE
+C     .. Parameters ..
+      INTEGER n,offset,ndim,ntimes
+      PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
+C     n = elements used per array, ndim = declared length, ntimes = repetitions
+C     .. Local Scalars ..
+      DOUBLE PRECISION scalar,t
+      INTEGER j,k,nbpw,quantum
+C     nbpw = bytes per word from realsize(), quantum = clock tick in microseconds
+C     .. Local Arrays ..
+      DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
+     $                 times(4,ntimes)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     times(j,k) = elapsed mysecond() time of kernel j in repetition k
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      INTEGER checktick,realsize
+      EXTERNAL mysecond,checktick,realsize
+C     ..
+C     .. Intrinsic Functions ..
+C
+      INTRINSIC dble,max,min,nint,sqrt
+C     ..
+C     .. Arrays in Common ..
+      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
+C     ..
+C     .. Common blocks ..
+*     COMMON a,b,c
+C     ..
+C     .. Data statements ..
+      DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Copy:      ','Scale:     ','Add:       ',
+     $     'Triad:     '/
+      DATA bytes/2,2,3,3/
+C     bytes(j) = number of arrays kernel j reads or writes per element
+
+*       --- SETUP --- determine precision and check timing ---
+
+      nbpw = realsize()
+
+      WRITE (*,FMT=9010) 'Array size = ',n
+      WRITE (*,FMT=9010) 'Offset     = ',offset
+      WRITE (*,FMT=9020) 'The total memory requirement is ',
+     $  3*nbpw*n/ (1024*1024),' MB'
+      WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
+      WRITE (*,FMT=9030) '--'
+      WRITE (*,FMT=9030) 'The *best* time for each test is used'
+      WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
+*       Fill a, b, c; loop 20 then halves a while t brackets it with the timer.
+!$OMP PARALLEL DO
+      DO 10 j = 1,n
+          a(j) = 2.0d0
+          b(j) = 0.5D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = mysecond()
+!$OMP PARALLEL DO
+      DO 20 j = 1,n
+          a(j) = 0.5d0*a(j)
+   20 CONTINUE
+      t = mysecond() - t
+      PRINT *,'----------------------------------------------------'
+      quantum = checktick()
+      WRITE (*,FMT=9000)
+     $  'Your clock granularity/precision appears to be ',quantum,
+     $  ' microseconds'
+      PRINT *,'----------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      scalar = 0.5d0*a(1)
+      DO 70 k = 1,ntimes
+*         Timer values are added to elements 1 and n around every kernel call.
+          t = mysecond()
+          a(1) = a(1) + t
+          call stream_copy (c, a, n)
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(1,k) = t
+
+          t = mysecond()
+          c(1) = c(1) + t
+          call stream_scale (b, c, scalar, n)
+          t = mysecond() - t
+          b(n) = b(n) + t
+          times(2,k) = t
+
+          t = mysecond()
+          a(1) = a(1) + t
+          call stream_add (c, a, b, n)
+          t = mysecond() - t
+          c(n) = c(n) + t
+          times(3,k) = t
+
+          t = mysecond()
+          b(1) = b(1) + t
+          call stream_triad (a, b, c, scalar, n)
+          t = mysecond() - t
+          a(n) = a(n) + t
+          times(4,k) = t
+   70 CONTINUE
+*       Statistics use repetitions 2..ntimes-1 only (first and last are dropped).
+*       --- SUMMARY ---
+      DO 90 k = 2,ntimes-1
+          DO 80 j = 1,4
+              avgtime(j) = avgtime(j) + times(j,k)
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   80     CONTINUE
+   90 CONTINUE
+      WRITE (*,FMT=9040)
+      DO 100 j = 1,4
+          avgtime(j) = avgtime(j)/dble(ntimes-2)
+          WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6,
+     $      avgtime(j),mintime(j),maxtime(j)
+  100 CONTINUE
+      PRINT *,'----------------------------------------------------'
+      CALL checksums (a,b,c,n,ntimes)
+      PRINT *,'----------------------------------------------------'
+
+ 9000 FORMAT (1x,a,i6,a)
+ 9010 FORMAT (1x,a,i10)
+ 9020 FORMAT (1x,a,i4,a)
+ 9030 FORMAT (1x,a,i3,a,a)
+ 9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
+     $       )
+ 9050 FORMAT (a,4 (f10.4,2x))
+      END
+
+*-------------------------------------
+* INTEGER FUNCTION realsize()
+*
+* A semi-portable way to determine the precision of DOUBLE PRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLE PRECISION
+* number occupies.
+*
+      INTEGER FUNCTION realsize()
+*     IMPLICIT NONE
+*     Returns the assumed bytes per DOUBLE PRECISION word (4 or 8), or a typed-in value.
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ndigits = first j for which ref(j) compares equal to 1.0D0
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ref(j) holds 1 + 10**(-j)
+C     .. External Subroutines ..
+      EXTERNAL confuse
+C     (confuse's output "result" is never read in this function)
+C     .. Intrinsic Functions ..
+      INTRINSIC abs,acos,log10,sqrt
+C     ..
+
+C       Test #1 - compare single(1.0d0+delta) to 1.0d0
+
+   10 DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0** (-j)
+   20 CONTINUE
+
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL confuse(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 40
+          END IF
+   30 CONTINUE
+      GO TO 50
+C     Label 40: a matching j was found; 8 or fewer digits means 4 bytes, else 8.
+   40 WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLE PRECISION word'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      RETURN
+C     Label 50: no j matched; ask the user (one retry, the second answer is unchecked).
+   50 PRINT *,'Hmmmm.  I am unable to determine the size.'
+      PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense.'
+          PRINT *,'Try again.'
+          PRINT *,'Please enter the number of Bytes per ',
+     $      'DOUBLE PRECISION number : '
+          READ (*,FMT=*) realsize
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per DOUBLE PRECISION number'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      END
+
+      SUBROUTINE confuse(q,r)
+*     Store the cosine of q in r; q itself is left unchanged.
+*     IMPLICIT NONE
+C     .. Intrinsic Functions ..
+      INTRINSIC cos
+C     .. Scalar Arguments: q is read, r is written ..
+      DOUBLE PRECISION q,r
+C     ..
+      r = cos(q)
+C     Reaching END returns control to the caller.
+      END
+
+* A semi-portable way to determine the clock granularity
+* Adapted from a code by John Henning of Digital Equipment Corporation
+*
+      INTEGER FUNCTION checktick()
+*     IMPLICIT NONE
+*     Returns the smallest gap in microseconds (at least 1) between clock changes.
+C     .. Parameters ..
+      INTEGER n
+      PARAMETER (n=20)
+C     n = number of distinct clock readings collected
+C     .. Local Scalars ..
+      DOUBLE PRECISION t1,t2
+      INTEGER i,j,jmin
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION timesfound(n)
+C     ..
+C     .. External Functions ..
+      DOUBLE PRECISION mysecond
+      EXTERNAL mysecond
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC max,min,nint
+C     ..
+      i = 0
+      t1 = mysecond()
+C     Spin at label 10 until mysecond() changes, store the reading, repeat n times.
+   10 t2 = mysecond()
+      IF (t2.EQ.t1) GO TO 10
+
+      t1 = t2
+      i = i + 1
+      timesfound(i) = t1
+      IF (i.LT.n) GO TO 10
+C     jmin = smallest non-negative neighbour difference, rounded to whole microseconds.
+      jmin = 1000000
+      DO 20 i = 2,n
+          j = nint((timesfound(i)-timesfound(i-1))*1d6)
+          jmin = min(jmin,max(j,0))
+   20 CONTINUE
+C     A zero jmin is reported as sub-microsecond and returned as 1.
+      IF (jmin.GT.0) THEN
+          checktick = jmin
+      ELSE
+          PRINT *,'Your clock granularity appears to be less ',
+     $      'than one microsecond'
+          checktick = 1
+      END IF
+      RETURN
+C     The lines below are commented-out code that would print the readings.
+*      PRINT 14, timesfound(1)*1d6
+*      DO 20 i=2,n
+*         PRINT 14, timesfound(i)*1d6,
+*     &       nint((timesfound(i)-timesfound(i-1))*1d6)
+*   14    FORMAT (1X, F18.4, 1X, i8)
+*   20 CONTINUE
+
+      END
+
+
+
+
+      SUBROUTINE checksums(a,b,c,n,ntimes)
+*     IMPLICIT NONE
+C     Compares the sums of a, b, c over elements 2..n-1 with scalar-replayed targets.
+C     .. Arguments ..
+      DOUBLE PRECISION a(*),b(*),c(*)
+      INTEGER n,ntimes
+C     a,b,c = arrays to validate; n = elements in use; ntimes = kernel repetitions
+C     .. Local Scalars ..
+      DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon
+      INTEGER k
+C     aa,bb,cc = target sums; suma,sumb,sumc = sums actually found in the arrays
+
+C     Repeat the main loop, but with scalars only.
+C     This is done to check the sum & make sure all
+C     iterations have been executed correctly.
+C     The starting values and scalar below mirror the setup in PROGRAM stream.
+      aa = 2.0D0
+      bb = 0.5D0
+      cc = 0.0D0
+      aa = 0.5D0*aa
+      scalar = 0.5d0*aa
+      DO k = 1,ntimes
+          cc = aa
+          bb = scalar*cc
+          cc = aa + bb
+          aa = bb + scalar*cc
+      END DO
+      aa = aa*DBLE(n-2)
+      bb = bb*DBLE(n-2)
+      cc = cc*DBLE(n-2)
+
+C     Now sum up the arrays, excluding the first and last
+C     elements, which are modified using the timing results
+C     to confuse aggressive optimizers.
+C     NOTE(review): loop index j is implicitly typed (absent from the INTEGER list).
+      suma = 0.0d0
+      sumb = 0.0d0
+      sumc = 0.0d0
+!$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
+      DO 110 j = 2,n-1
+          suma = suma + a(j)
+          sumb = sumb + b(j)
+          sumc = sumc + c(j)
+  110 CONTINUE
+C     Relative-error test; only the first failing array (order a, b, c) is printed.
+      epsilon = 1.D-6
+
+      IF (ABS(suma-aa)/suma .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array a()'
+          PRINT *,'Target   Sum of a is = ',aa
+          PRINT *,'Computed Sum of a is = ',suma
+      ELSEIF (ABS(sumb-bb)/sumb .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array b()'
+          PRINT *,'Target   Sum of b is = ',bb
+          PRINT *,'Computed Sum of b is = ',sumb
+      ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array c()'
+          PRINT *,'Target   Sum of c is = ',cc
+          PRINT *,'Computed Sum of c is = ',sumc
+      ELSE
+          PRINT *,'Solution Validates!'
+      ENDIF
+
+      END
+
+
+*=========================================================================
+* This version is a simple harness to allow code optimization
+* in the context of the data flow and result checking of the
+* basic STREAM version 5.0 code.   Each of the four kernel loops
+* has been moved to a separate subroutine to allow easy code 
+* optimization or replacement.
+*=========================================================================
+* THESE ARE JUST STARTING POINTS --- THEY HAVE NOT BEEN OPTIMIZED YET!!!
+*=========================================================================
+
+          SUBROUTINE stream_copy (c, a, n)    ! Copy: c(i) = a(i), i = 1..n
+          REAL*8 a(*), c(*)
+!$OMP PARALLEL DO
+          DO i = 1, n
+              c(i) = a(i)
+          END DO
+          END
+
+          SUBROUTINE stream_scale (b, c, scalar, n)    ! b(i) = scalar*c(i)
+          REAL*8 scalar, b(*), c(*)
+!$OMP PARALLEL DO
+          DO i = 1, n
+              b(i) = scalar*c(i)
+          END DO
+          END
+
+          SUBROUTINE stream_add (c, a, b, n)    ! Add: c(i) = a(i) + b(i)
+          REAL*8 a(*), b(*), c(*)
+!$OMP PARALLEL DO
+          DO i = 1, n
+              c(i) = a(i) + b(i)
+          END DO
+          END
+
+          SUBROUTINE stream_triad (a, b, c, scalar, n)    ! a = b + scalar*c
+          REAL*8 scalar, a(*), b(*), c(*)
+!$OMP PARALLEL DO
+          DO i = 1, n
+              a(i) = b(i) + scalar*c(i)
+          END DO
+          END