diff --git a/Development/stream.c.5.10 b/Development/stream.c.5.10 new file mode 100644 index 0000000..b9fd19e --- /dev/null +++ b/Development/stream.c.5.10 @@ -0,0 +1,496 @@ +/*-----------------------------------------------------------------------*/ +/* Program: Stream */ +/* Revision: $Id: stream.c,v 5.10 2009/01/28 13:22:09 mccalpin Exp mccalpin $ */ +/* Original code developed by John D. McCalpin */ +/* Programmers: John D. McCalpin */ +/* Joe R. Zagar */ +/* */ +/* This program measures memory transfer rates in MB/s for simple */ +/* computational kernels coded in C. */ +/*-----------------------------------------------------------------------*/ +/* Copyright 1991-2005: John D. McCalpin */ +/*-----------------------------------------------------------------------*/ +/* License: */ +/* 1. You are free to use this program and/or to redistribute */ +/* this program. */ +/* 2. You are free to modify this program for your own use, */ +/* including commercial use, subject to the publication */ +/* restrictions in item 3. */ +/* 3. You are free to publish results obtained from running this */ +/* program, or from works that you derive from this program, */ +/* with the following limitations: */ +/* 3a. In order to be referred to as "STREAM benchmark results", */ +/* published results must be in conformance to the STREAM */ +/* Run Rules, (briefly reviewed below) published at */ +/* http://www.cs.virginia.edu/stream/ref.html */ +/* and incorporated herein by reference. */ +/* As the copyright holder, John McCalpin retains the */ +/* right to determine conformity with the Run Rules. */ +/* 3b. Results based on modified source code or on runs not in */ +/* accordance with the STREAM Run Rules must be clearly */ +/* labelled whenever they are published. Examples of */ +/* proper labelling include: */ +/* "tuned STREAM benchmark results" */ +/* "based on a variant of the STREAM benchmark code" */ +/* Other comparable, clear and reasonable labelling is */ +/* acceptable. 
*/ +/* 3c. Submission of results to the STREAM benchmark web site */ +/* is encouraged, but not required. */ +/* 4. Use of this program or creation of derived works based on this */ +/* program constitutes acceptance of these licensing restrictions. */ +/* 5. Absolutely no warranty is expressed or implied. */ +/*-----------------------------------------------------------------------*/ +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +/* INSTRUCTIONS: + * + * 1) Stream requires a good bit of memory to run. Adjust the + * value of 'N' (below) to give a 'timing calibration' of + * at least 20 clock-ticks. This will provide rate estimates + * that should be good to about 5% precision. + */ + +/* + * 3) Compile the code with full optimization. Many compilers + * generate unreasonably bad code before the optimizer tightens + * things up. If the results are unreasonably good, on the + * other hand, the optimizer might be too smart for me! + * + * Try compiling with: + * cc -O stream_omp.c -o stream_omp + * + * This is known to work on Cray, SGI, IBM, and Sun machines. + * + * + * 4) Mail the results to mccalpin@cs.virginia.edu + * Be sure to include: + * a) computer hardware model number and software revision + * b) the compiler flags + * c) all of the output from the test case. + * Thanks! 
+ * + */ + +#include "stream.h" + +#define MAXNTIMES 100 +#define MAXSEGS 8 + +int +main(int argc, char **argv) + { + long N, OFFSET, SIZE; + int quantum; + int largepage,shmflag,shmid[MAXSEGS]; + int BytesPerWord; + register int j, k; + double scalar, t, times[4][MAXNTIMES]; + double *a, *b, *c; + double avgtime[4] = {0}; + double maxtime[4] = {0}; + double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + double bytes[4] = {2 * sizeof(double), + 2 * sizeof(double), + 3 * sizeof(double), + 3 * sizeof(double)}; + + /* --- default options --- */ + N = 2000000; + OFFSET = 0; + NTIMES = 10; + largepage = 0; + + /* --- NEW --- parse command line arguments using getopt --- */ + while (1) { + int this_option_optind = optind ? optind : 1; + int option_index = 0; + static struct option long_options[] = { + {"largepage", 1, 0, 'l'}, + {"length", 1, 0, 'n'}, + {"offset", 1, 0, 'o'}, + {"repetitions", 1, 0, 'r'}, + {"tuned", 1, 0, 't'}, + {"help", 0, 0, 'h'}, + {0, 0, 0, 0} + }; + + j = getopt_long (argc, argv, "ln:o:r:t:h", + long_options, &option_index); + if (j == -1) /* finished parsing all command-line options */ + break; + + switch (j) { + case 0: /* this should not happen */ + printf ("option %s", long_options[option_index].name); + if (optarg) + printf (" with arg %s", optarg); + printf ("\n"); + break; + + case 'l': /* requesting data allocation on large pages */ + printf ("User requested data allocation on large pages\n"); + largepage=1; + break; + + case 'n': /* define vector length in 10^6 elements */ + printf ("User requested Array Size of %d * 10^6 elements\n", optarg); + N = atoi(optarg); + if ( N >= 2147 ) { + printf("Warning: Array Size exceeds 2GB - watch for anomalies\n"); + } + N = N * 1000 * 1000; + break; + + case 'o': /* define offset in elements */ + printf ("User requested Array Offset of %d elements\n", optarg); + OFFSET = atoi(optarg); + break; + + case 'r': /* specify number of 
repetitions */ + printf ("option r with value '%s'\n", optarg); + NTIMES = atoi(optarg); + if (NTIMES > MAXNTIMES) { + NTIMES = MAXNTIMES; + printf("Note: requested repetitions exceeds maximum allowed\n"); + printf(" repeat count reset to %d\n",MAXNTIMES); + } + break; + + case 't': /* selection tuned version of code -- not currently used */ + printf ("option t with value '%s'\n", optarg); + break; + + case 'h': + printf ("Usage: %s [options]\n",argv[0]); + printf ("Options:\n"); + printf (" [-l] <-- request data put on large pages (in a shared segment)\n"); + printf (" [-n, --length] n <-- n is array length in 10^6 elements\n"); + printf (" [-o, --offset] n <-- n is offset/padding in elements\n"); + printf (" [-r, --repetitions] n <-- n is number of repetitions for timing (1st is not counted)\n"); + printf (" [-t, --tuned] nnn <-- nnn is version number of tuned kernels to execute\n"); + printf (" [-h, --help] <-- gives this help message and exits\n"); + exit(0); + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + printf("STREAM version $Revision: 5.10 $\n"); + printf(HLINE); + BytesPerWord = sizeof(double); + printf("This system uses %d bytes per DOUBLE PRECISION word.\n", + BytesPerWord); + + printf(HLINE); + printf("Array size = %d, Offset = %d\n" , N, OFFSET); + printf("Total memory required = %.1f MB.\n", + (3.0 * BytesPerWord) * ( (double) (N+OFFSET) / 1048576.0)); + printf(HLINE); + printf("Allocating three arrays....\n"); + + if (largepage) { + SIZE = (3*N + 3*OFFSET) * sizeof(double); + printf("Data SIZE needed %llu (Bytes)\n",SIZE); + SIZE = ceil((double)SIZE/(2048.*1024.)) * 2048*1024; + printf("Data SIZE requested %llu (Bytes)\n",SIZE); + printf("attempting to create (shmget) a shared segment of size %llu\n",SIZE); + shmid[0] = shmget(IPC_PRIVATE,SIZE,IPC_CREAT|SHM_HUGETLB); + if (shmid[0] == -1) { + perror("ERROR: failed shmget:"); + printf("(usually caused by non-root user trying to get large pages)\n"); + exit(2); + } + printf("shmget returned a shmid of %d\n",shmid[0]); + a = shmat(shmid[0],0,SHM_RND); + printf("shmat returned a pointer to %p\n",a); + if (a == (double *)(-1)) { + perror("ERROR: failed shmat:"); + printf("Deleting shared segment\n"); + shmctl(shmid[0],IPC_RMID,NULL); + exit(3); + } + b = a + N + OFFSET; + c = a + 2 * (N + OFFSET); + } + else{ + a = malloc( (N+0*OFFSET) * sizeof(double)); + b = malloc( (N+1*OFFSET) * sizeof(double)); + c = malloc( (N+2*OFFSET) * sizeof(double)); + if ( (a==0) || (b==0) || (c==0) ) { + printf("Error: one or more mallocs failed!\n"); + printf(" a = %p\n",a); + printf(" b = %p\n",b); + printf(" c = %p\n",c); + } + /* Move the starting points of b and c to implement the OFFSET */ + b += OFFSET; + c += 2*OFFSET; + } + printf("Array 
Starting Locations: \n"); + printf(" a = %p\n",a); + printf(" b = %p\n",b); + printf(" c = %p\n",c); + + printf(HLINE); + printf("Each test is run %d times, but only\n", NTIMES); + printf("the *best* time for each is used.\n"); + +#ifdef _OPENMP + printf(HLINE); + printf("OpenMP conditional compilation is active\n"); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } + } +#endif + + printf(HLINE); +#pragma omp parallel + { + printf ("Printing one line per active thread....\n"); + } + + /* Get initial value for system clock. */ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < N; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + +#ifdef TUNED +#include "tuned.inc" +#else +#include "standard.inc" +#endif + + /* --- SUMMARY --- */ + + for (k=1; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +/* ----------------------------------------------- + Check the results to make sure all the loops + have actually been run. 
+ This revised version (in 5.9 and above) sums the + absolute errors across the arrays, rather than + summing the values in the arrays and comparing + with the expected sum. This version is much + less sensitive to accumulation of roundoff error. +-------------------------------------------------- */ +void checkSTREAMresults (long N, double *a, double *b, double *c) +{ + double aj,bj,cj,scalar; + double asum,bsum,csum; + double epsilon; + int j,k,fail=0; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k= 0 ? (a) : -(a)) +#endif + asum = 0.0; + bsum = 0.0; + csum = 0.0; + for (j=0; j epsilon) { + printf ("Failed Validation on array a[]\n"); + printf (" Max Allowable Error : %f \n",epsilon); + printf (" Observed Error : %f \n",asum); + fail = 1; + } + if (bsum > epsilon) { + printf ("Failed Validation on array b[]\n"); + printf (" Max Allowable Error : %f \n",epsilon); + printf (" Observed Error : %f \n",bsum); + fail = 1; + } + if (csum > epsilon) { + printf ("Failed Validation on array c[]\n"); + printf (" Max Allowable Error : %f \n",epsilon); + printf (" Observed Error : %f \n",csum); + fail = 1; + } + if (fail == 0) { + printf ("Solution Validates\n"); + } +} + diff --git a/TO_DO b/TO_DO new file mode 100644 index 0000000..363b9da --- /dev/null +++ b/TO_DO @@ -0,0 +1,20 @@ +2016-05-18: + +* Incorporate the timer code for Microsoft Windows into the mainstream version of stream.c + +* Incorporate alternate array allocation into the mainstream version of stream.c +** posix_memalign() +** malloc() +** mmap() +** shmget()/shmat() + +* Figure out if the "restrict" keyword can actually be made portable and useful + for C versions.... + +* Update the Fortran version of STREAM to catch up with the functionality + and features of the C version 5.10 +** Updated result checking code with minimal roundoff error. 
+** Large integer array indices (N > (1<<32)). + +* Update the Fortran MPI version of STREAM to catch up with the functionality + and features of the C version 5.10 diff --git a/Versions/Experimental/Parallel_jobs b/Versions/Experimental/Parallel_jobs new file mode 100644 index 0000000..02fce72 --- /dev/null +++ b/Versions/Experimental/Parallel_jobs @@ -0,0 +1,31 @@ +#!/bin/csh +# +# This program runs multiple copies of the stream_wall benchmark +# to measure how much they interfere with each other.... +# +# John D. McCalpin, mccalpin@cs.virginia.edu +# Mon May 2 18:51:19 EDT 1994 +# set verbose + +#switch ($#argv) +#case 1: +# breaksw +#default: +# echo "Usage: $0 " +# exit 1 +#endsw + +foreach k (1 2 4 6 8) + set NCPU=$k + set i=$k + echo "Starting $i jobs" + while (`expr $i - 1` >= 0) + echo stream_d >P${NCPU}.${i} & + set i=`expr $i - 1` + end + wait + cat P${NCPU}.[1-${NCPU}] >P${NCPU}.out + rm P${NCPU}.[1-${NCPU}] + echo "All jobs done.... Output is in P${NCPU}.out" +end +exit 0 diff --git a/Versions/Experimental/do_offsets b/Versions/Experimental/do_offsets new file mode 100644 index 0000000..34532de --- /dev/null +++ b/Versions/Experimental/do_offsets @@ -0,0 +1,7 @@ +#!/bin/csh +foreach OFFSET (0 7 8 9 15 16 17 31 32 33 63 64 65) + echo $OFFSET + sed "s/offset=0/offset=${OFFSET}/" q.f + make q + q >>LOG +end diff --git a/Versions/Old/1996-08-18/stream_d.c b/Versions/Old/1996-08-18/stream_d.c new file mode 100644 index 0000000..867af5b --- /dev/null +++ b/Versions/Old/1996-08-18/stream_d.c @@ -0,0 +1,143 @@ +/* +* Program: Stream +* Programmer: John D. McCalpin +* Revision: 2.1, August 30, 1995 +* +* This program measures memory transfer rates in MB/s for simple +* computational kernels coded in Fortran. These numbers reveal the +* quality of code generation for simple uncacheable kernels as well +* as showing the cost of floating-point operations relative to memory +* accesses. +* +* INSTRUCTIONS: +* 1) (fortran-specific, omitted.) 
+* 2) Stream requires a good bit of memory to run. +* Adjust the Parameter 'N' in the second line of the main +* program to give a 'timing calibration' of at least 20 clicks. +* This will provide rate estimates that should be good to +* about 5% precision. +* 3) Compile the code with full optimization. Many compilers +* generate unreasonably bad code before the optimizer tightens +* things up. If the results are unreasonably good, on the +* other hand, the optimizer might be too smart for me! +* 4) Mail the results to mccalpin@cs.virginia.edu +* Be sure to include: +* a) computer hardware model number and software revision +* b) the compiler flags +* c) all of the output from the test case. +* Thanks! +* +* this version was ported from fortran to c by mark hahn, hahn+@pitt.edu. +*/ + +#define N 1000000 +#define NTIMES 10 + +#ifdef __hpux +#define _HPUX_SOURCE 1 +#else +#define _INCLUDE_POSIX_SOURCE 1 +#endif +#include +#include +#include +#include + +#ifndef MIN +#define MIN(x,y) ((x)<(y)?(x):(y)) +#endif +#ifndef MAX +#define MAX(x,y) ((x)>(y)?(x):(y)) +#endif + +struct timeval tvStart; + +void utimeStart() { + struct timezone tz; + gettimeofday(&tvStart,&tz); +} + +float utime() { + struct timeval tv; + struct timezone tz; + float utime; + gettimeofday(&tv,&tz); + utime = 1e6 * (tv.tv_sec - tvStart.tv_sec) + tv.tv_usec - tvStart.tv_usec; + return utime; +} + +typedef double real; +static real a[N],b[N],c[N]; + +int main() { + int j,k; + float times[4][NTIMES]; + static float rmstime[4] = {0}; + static float mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + static float maxtime[4] = {0}; + static char *label[4] = {"Assignment:", + "Scaling :", + "Summing :", + "SAXPYing :"}; + static float bytes[4] = { 2 * sizeof(real) * N, + 2 * sizeof(real) * N, + 3 * sizeof(real) * N, + 3 * sizeof(real) * N}; + + /* --- SETUP --- determine precision and check timing --- */ + utimeStart(); + for (j=0; j +#include +#include +#include + +double mysecond() +{ + long sec; + double 
secx; + struct tms realbuf; + + times(&realbuf); + secx = ( realbuf.tms_stime + realbuf.tms_utime ) / (float) CLK_TCK; + return ((double) secx); +} diff --git a/Versions/Old/2003-04-08/second_cpu.f b/Versions/Old/2003-04-08/second_cpu.f new file mode 100644 index 0000000..cef7c36 --- /dev/null +++ b/Versions/Old/2003-04-08/second_cpu.f @@ -0,0 +1,18 @@ +*------------------------------------- +* Sample timing routine +* This code works on Sun and Silicon Graphics machines. +* DOUBLE PRECISION function mysecond() +* real arg(2) +* mysecond = etime(arg) +* end +* Sample timing routine +* This code works on IBM RS/6000 machines + DOUBLE PRECISION FUNCTION mysecond() +C .. +C .. External Functions .. + INTEGER mclock + EXTERNAL mclock +C .. + mysecond = mclock()*0.01D0 + END + diff --git a/Versions/Old/2003-04-08/second_wall.c b/Versions/Old/2003-04-08/second_wall.c new file mode 100644 index 0000000..a9b799a --- /dev/null +++ b/Versions/Old/2003-04-08/second_wall.c @@ -0,0 +1,27 @@ +/* A Fortran-callable gettimeofday routine to give access + to the wall clock timer. + + This subroutine may need to be modified slightly to get + it to link with Fortran on your computer. + The most common difference is adding/removing a trailing + underscore character to the function name. 
+*/ + +#include +/* int gettimeofday(struct timeval *tp, struct timezone *tzp); */ + +double mysecond_() +{ +/* struct timeval { long tv_sec; + long tv_usec; }; + +struct timezone { int tz_minuteswest; + int tz_dsttime; }; */ + + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} diff --git a/Versions/Old/Experimental/Parallel_jobs b/Versions/Old/Experimental/Parallel_jobs new file mode 100644 index 0000000..02fce72 --- /dev/null +++ b/Versions/Old/Experimental/Parallel_jobs @@ -0,0 +1,31 @@ +#!/bin/csh +# +# This program runs multiple copies of the stream_wall benchmark +# to measure how much they interfere with each other.... +# +# John D. McCalpin, mccalpin@cs.virginia.edu +# Mon May 2 18:51:19 EDT 1994 +# set verbose + +#switch ($#argv) +#case 1: +# breaksw +#default: +# echo "Usage: $0 " +# exit 1 +#endsw + +foreach k (1 2 4 6 8) + set NCPU=$k + set i=$k + echo "Starting $i jobs" + while (`expr $i - 1` >= 0) + echo stream_d >P${NCPU}.${i} & + set i=`expr $i - 1` + end + wait + cat P${NCPU}.[1-${NCPU}] >P${NCPU}.out + rm P${NCPU}.[1-${NCPU}] + echo "All jobs done.... 
Output is in P${NCPU}.out" +end +exit 0 diff --git a/Versions/Old/Experimental/do_offsets b/Versions/Old/Experimental/do_offsets new file mode 100644 index 0000000..34532de --- /dev/null +++ b/Versions/Old/Experimental/do_offsets @@ -0,0 +1,7 @@ +#!/bin/csh +foreach OFFSET (0 7 8 9 15 16 17 31 32 33 63 64 65) + echo $OFFSET + sed "s/offset=0/offset=${OFFSET}/" q.f + make q + q >>LOG +end diff --git a/Versions/Old/second_cpu.c b/Versions/Old/second_cpu.c new file mode 100644 index 0000000..d0338a9 --- /dev/null +++ b/Versions/Old/second_cpu.c @@ -0,0 +1,15 @@ +#include +#include +#include +#include + +double mysecond() +{ + long sec; + double secx; + struct tms realbuf; + + times(&realbuf); + secx = ( realbuf.tms_stime + realbuf.tms_utime ) / (float) CLK_TCK; + return ((double) secx); +} diff --git a/Versions/Old/second_cpu.f b/Versions/Old/second_cpu.f new file mode 100644 index 0000000..cef7c36 --- /dev/null +++ b/Versions/Old/second_cpu.f @@ -0,0 +1,18 @@ +*------------------------------------- +* Sample timing routine +* This code works on Sun and Silicon Graphics machines. +* DOUBLE PRECISION function mysecond() +* real arg(2) +* mysecond = etime(arg) +* end +* Sample timing routine +* This code works on IBM RS/6000 machines + DOUBLE PRECISION FUNCTION mysecond() +C .. +C .. External Functions .. + INTEGER mclock + EXTERNAL mclock +C .. + mysecond = mclock()*0.01D0 + END + diff --git a/Versions/Old/second_wall.c b/Versions/Old/second_wall.c new file mode 100644 index 0000000..a9b799a --- /dev/null +++ b/Versions/Old/second_wall.c @@ -0,0 +1,27 @@ +/* A Fortran-callable gettimeofday routine to give access + to the wall clock timer. + + This subroutine may need to be modified slightly to get + it to link with Fortran on your computer. + The most common difference is adding/removing a trailing + underscore character to the function name. 
+*/ + +#include +/* int gettimeofday(struct timeval *tp, struct timezone *tzp); */ + +double mysecond_() +{ +/* struct timeval { long tv_sec; + long tv_usec; }; + +struct timezone { int tz_minuteswest; + int tz_dsttime; }; */ + + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} diff --git a/Versions/Old/stream.c.5.10 b/Versions/Old/stream.c.5.10 new file mode 100644 index 0000000..b9fd19e --- /dev/null +++ b/Versions/Old/stream.c.5.10 @@ -0,0 +1,496 @@ +/*-----------------------------------------------------------------------*/ +/* Program: Stream */ +/* Revision: $Id: stream.c,v 5.10 2009/01/28 13:22:09 mccalpin Exp mccalpin $ */ +/* Original code developed by John D. McCalpin */ +/* Programmers: John D. McCalpin */ +/* Joe R. Zagar */ +/* */ +/* This program measures memory transfer rates in MB/s for simple */ +/* computational kernels coded in C. */ +/*-----------------------------------------------------------------------*/ +/* Copyright 1991-2005: John D. McCalpin */ +/*-----------------------------------------------------------------------*/ +/* License: */ +/* 1. You are free to use this program and/or to redistribute */ +/* this program. */ +/* 2. You are free to modify this program for your own use, */ +/* including commercial use, subject to the publication */ +/* restrictions in item 3. */ +/* 3. You are free to publish results obtained from running this */ +/* program, or from works that you derive from this program, */ +/* with the following limitations: */ +/* 3a. In order to be referred to as "STREAM benchmark results", */ +/* published results must be in conformance to the STREAM */ +/* Run Rules, (briefly reviewed below) published at */ +/* http://www.cs.virginia.edu/stream/ref.html */ +/* and incorporated herein by reference. */ +/* As the copyright holder, John McCalpin retains the */ +/* right to determine conformity with the Run Rules. */ +/* 3b. 
Results based on modified source code or on runs not in */ +/* accordance with the STREAM Run Rules must be clearly */ +/* labelled whenever they are published. Examples of */ +/* proper labelling include: */ +/* "tuned STREAM benchmark results" */ +/* "based on a variant of the STREAM benchmark code" */ +/* Other comparable, clear and reasonable labelling is */ +/* acceptable. */ +/* 3c. Submission of results to the STREAM benchmark web site */ +/* is encouraged, but not required. */ +/* 4. Use of this program or creation of derived works based on this */ +/* program constitutes acceptance of these licensing restrictions. */ +/* 5. Absolutely no warranty is expressed or implied. */ +/*-----------------------------------------------------------------------*/ +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +/* INSTRUCTIONS: + * + * 1) Stream requires a good bit of memory to run. Adjust the + * value of 'N' (below) to give a 'timing calibration' of + * at least 20 clock-ticks. This will provide rate estimates + * that should be good to about 5% precision. + */ + +/* + * 3) Compile the code with full optimization. Many compilers + * generate unreasonably bad code before the optimizer tightens + * things up. If the results are unreasonably good, on the + * other hand, the optimizer might be too smart for me! + * + * Try compiling with: + * cc -O stream_omp.c -o stream_omp + * + * This is known to work on Cray, SGI, IBM, and Sun machines. + * + * + * 4) Mail the results to mccalpin@cs.virginia.edu + * Be sure to include: + * a) computer hardware model number and software revision + * b) the compiler flags + * c) all of the output from the test case. + * Thanks! 
+ * + */ + +#include "stream.h" + +#define MAXNTIMES 100 +#define MAXSEGS 8 + +int +main(int argc, char **argv) + { + long N, OFFSET, SIZE; + int quantum; + int largepage,shmflag,shmid[MAXSEGS]; + int BytesPerWord; + register int j, k; + double scalar, t, times[4][MAXNTIMES]; + double *a, *b, *c; + double avgtime[4] = {0}; + double maxtime[4] = {0}; + double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + double bytes[4] = {2 * sizeof(double), + 2 * sizeof(double), + 3 * sizeof(double), + 3 * sizeof(double)}; + + /* --- default options --- */ + N = 2000000; + OFFSET = 0; + NTIMES = 10; + largepage = 0; + + /* --- NEW --- parse command line arguments using getopt --- */ + while (1) { + int this_option_optind = optind ? optind : 1; + int option_index = 0; + static struct option long_options[] = { + {"largepage", 1, 0, 'l'}, + {"length", 1, 0, 'n'}, + {"offset", 1, 0, 'o'}, + {"repetitions", 1, 0, 'r'}, + {"tuned", 1, 0, 't'}, + {"help", 0, 0, 'h'}, + {0, 0, 0, 0} + }; + + j = getopt_long (argc, argv, "ln:o:r:t:h", + long_options, &option_index); + if (j == -1) /* finished parsing all command-line options */ + break; + + switch (j) { + case 0: /* this should not happen */ + printf ("option %s", long_options[option_index].name); + if (optarg) + printf (" with arg %s", optarg); + printf ("\n"); + break; + + case 'l': /* requesting data allocation on large pages */ + printf ("User requested data allocation on large pages\n"); + largepage=1; + break; + + case 'n': /* define vector length in 10^6 elements */ + printf ("User requested Array Size of %d * 10^6 elements\n", optarg); + N = atoi(optarg); + if ( N >= 2147 ) { + printf("Warning: Array Size exceeds 2GB - watch for anomalies\n"); + } + N = N * 1000 * 1000; + break; + + case 'o': /* define offset in elements */ + printf ("User requested Array Offset of %d elements\n", optarg); + OFFSET = atoi(optarg); + break; + + case 'r': /* specify number of 
repetitions */ + printf ("option r with value '%s'\n", optarg); + NTIMES = atoi(optarg); + if (NTIMES > MAXNTIMES) { + NTIMES = MAXNTIMES; + printf("Note: requested repetitions exceeds maximum allowed\n"); + printf(" repeat count reset to %d\n",MAXNTIMES); + } + break; + + case 't': /* selection tuned version of code -- not currently used */ + printf ("option t with value '%s'\n", optarg); + break; + + case 'h': + printf ("Usage: %s [options]\n",argv[0]); + printf ("Options:\n"); + printf (" [-l] <-- request data put on large pages (in a shared segment)\n"); + printf (" [-n, --length] n <-- n is array length in 10^6 elements\n"); + printf (" [-o, --offset] n <-- n is offset/padding in elements\n"); + printf (" [-r, --repetitions] n <-- n is number of repetitions for timing (1st is not counted)\n"); + printf (" [-t, --tuned] nnn <-- nnn is version number of tuned kernels to execute\n"); + printf (" [-h, --help] <-- gives this help message and exits\n"); + exit(0); + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + printf("STREAM version $Revision: 5.10 $\n"); + printf(HLINE); + BytesPerWord = sizeof(double); + printf("This system uses %d bytes per DOUBLE PRECISION word.\n", + BytesPerWord); + + printf(HLINE); + printf("Array size = %d, Offset = %d\n" , N, OFFSET); + printf("Total memory required = %.1f MB.\n", + (3.0 * BytesPerWord) * ( (double) (N+OFFSET) / 1048576.0)); + printf(HLINE); + printf("Allocating three arrays....\n"); + + if (largepage) { + SIZE = (3*N + 3*OFFSET) * sizeof(double); + printf("Data SIZE needed %llu (Bytes)\n",SIZE); + SIZE = ceil((double)SIZE/(2048.*1024.)) * 2048*1024; + printf("Data SIZE requested %llu (Bytes)\n",SIZE); + printf("attempting to create (shmget) a shared segment of size %llu\n",SIZE); + shmid[0] = shmget(IPC_PRIVATE,SIZE,IPC_CREAT|SHM_HUGETLB); + if (shmid[0] == -1) { + perror("ERROR: failed shmget:"); + printf("(usually caused by non-root user trying to get large pages)\n"); + exit(2); + } + printf("shmget returned a shmid of %d\n",shmid[0]); + a = shmat(shmid[0],0,SHM_RND); + printf("shmat returned a pointer to %p\n",a); + if (a == (double *)(-1)) { + perror("ERROR: failed shmat:"); + printf("Deleting shared segment\n"); + shmctl(shmid[0],IPC_RMID,NULL); + exit(3); + } + b = a + N + OFFSET; + c = a + 2 * (N + OFFSET); + } + else{ + a = malloc( (N+0*OFFSET) * sizeof(double)); + b = malloc( (N+1*OFFSET) * sizeof(double)); + c = malloc( (N+2*OFFSET) * sizeof(double)); + if ( (a==0) || (b==0) || (c==0) ) { + printf("Error: one or more mallocs failed!\n"); + printf(" a = %p\n",a); + printf(" b = %p\n",b); + printf(" c = %p\n",c); + } + /* Move the starting points of b and c to implement the OFFSET */ + b += OFFSET; + c += 2*OFFSET; + } + printf("Array 
Starting Locations: \n"); + printf(" a = %p\n",a); + printf(" b = %p\n",b); + printf(" c = %p\n",c); + + printf(HLINE); + printf("Each test is run %d times, but only\n", NTIMES); + printf("the *best* time for each is used.\n"); + +#ifdef _OPENMP + printf(HLINE); + printf("OpenMP conditional compilation is active\n"); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } + } +#endif + + printf(HLINE); +#pragma omp parallel + { + printf ("Printing one line per active thread....\n"); + } + + /* Get initial value for system clock. */ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < N; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + +#ifdef TUNED +#include "tuned.inc" +#else +#include "standard.inc" +#endif + + /* --- SUMMARY --- */ + + for (k=1; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +/* ----------------------------------------------- + Check the results to make sure all the loops + have actually been run. 
+ This revised version (in 5.9 and above) sums the + absolute errors across the arrays, rather than + summing the values in the arrays and comparing + with the expected sum. This version is much + less sensitive to accumulation of roundoff error. +-------------------------------------------------- */ +void checkSTREAMresults (long N, double *a, double *b, double *c) +{ + double aj,bj,cj,scalar; + double asum,bsum,csum; + double epsilon; + int j,k,fail=0; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k= 0 ? (a) : -(a)) +#endif + asum = 0.0; + bsum = 0.0; + csum = 0.0; + for (j=0; j epsilon) { + printf ("Failed Validation on array a[]\n"); + printf (" Max Allowable Error : %f \n",epsilon); + printf (" Observed Error : %f \n",asum); + fail = 1; + } + if (bsum > epsilon) { + printf ("Failed Validation on array b[]\n"); + printf (" Max Allowable Error : %f \n",epsilon); + printf (" Observed Error : %f \n",bsum); + fail = 1; + } + if (csum > epsilon) { + printf ("Failed Validation on array c[]\n"); + printf (" Max Allowable Error : %f \n",epsilon); + printf (" Observed Error : %f \n",csum); + fail = 1; + } + if (fail == 0) { + printf ("Solution Validates\n"); + } +} + diff --git a/Versions/Old/stream_d.c b/Versions/Old/stream_d.c new file mode 100644 index 0000000..1f2bbe1 --- /dev/null +++ b/Versions/Old/stream_d.c @@ -0,0 +1,218 @@ +# include +# include +# include +# include +# include + +/* + * Program: Stream + * Programmer: Joe R. Zagar + * Revision: 4.0-BETA, October 24, 1995 + * Original code developed by John D. McCalpin + * + * This program measures memory transfer rates in MB/s for simple + * computational kernels coded in C. These numbers reveal the quality + * of code generation for simple uncacheable kernels as well as showing + * the cost of floating-point operations relative to memory accesses. 
+ * + * INSTRUCTIONS: + * + * 1) Stream requires a good bit of memory to run. Adjust the + * value of 'N' (below) to give a 'timing calibration' of + * at least 20 clock-ticks. This will provide rate estimates + * that should be good to about 5% precision. + */ + +# define N 2000000 +# define NTIMES 10 +# define OFFSET 0 + +/* + * 3) Compile the code with full optimization. Many compilers + * generate unreasonably bad code before the optimizer tightens + * things up. If the results are unreasonably good, on the + * other hand, the optimizer might be too smart for me! + * + * Try compiling with: + * cc -O stream_d.c second_wall.c -o stream_d -lm + * + * This is known to work on Cray, SGI, IBM, and Sun machines. + * + * + * 4) Mail the results to mccalpin@cs.virginia.edu + * Be sure to include: + * a) computer hardware model number and software revision + * b) the compiler flags + * c) all of the output from the test case. + * Thanks! + * + */ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +static double a[N+OFFSET], + b[N+OFFSET], + c[N+OFFSET]; + +static double rmstime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(double) * N, + 2 * sizeof(double) * N, + 3 * sizeof(double) * N, + 3 * sizeof(double) * N + }; + +extern double mysecond(); + +int +main() + { + int quantum, checktick(); + int BytesPerWord; + register int j, k; + double scalar, t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + BytesPerWord = sizeof(double); + printf("This system uses %d bytes per DOUBLE PRECISION word.\n", + BytesPerWord); + + printf(HLINE); + printf("Array size = %d, Offset = %d\n" , N, OFFSET); + printf("Total memory required = 
%.1f MB.\n", + (3 * N * BytesPerWord) / 1048576.0); + printf("Each test is run %d times, but only\n", NTIMES); + printf("the *best* time for each is used.\n"); + + /* Get initial value for system clock. */ + + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + + t = mysecond(); + for (j = 0; j < N; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k +# include +# include +# include +# include + +/* INSTRUCTIONS: + * + * 1) Stream requires a good bit of memory to run. Adjust the + * value of 'N' (below) to give a 'timing calibration' of + * at least 20 clock-ticks. This will provide rate estimates + * that should be good to about 5% precision. + */ + +# define N 2000000 +# define NTIMES 10 +# define OFFSET 0 + +/* + * 3) Compile the code with full optimization. Many compilers + * generate unreasonably bad code before the optimizer tightens + * things up. If the results are unreasonably good, on the + * other hand, the optimizer might be too smart for me! + * + * Try compiling with: + * cc -O stream_omp.c -o stream_omp + * + * This is known to work on Cray, SGI, IBM, and Sun machines. 
+ * + * + * 4) Mail the results to mccalpin@cs.virginia.edu + * Be sure to include: + * a) computer hardware model number and software revision + * b) the compiler flags + * c) all of the output from the test case. + * Thanks! + * + */ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +static double a[N+OFFSET], + b[N+OFFSET], + c[N+OFFSET]; + +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(double) * N, + 2 * sizeof(double) * N, + 3 * sizeof(double) * N, + 3 * sizeof(double) * N + }; + +extern double mysecond(); +extern void checkSTREAMresults(); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(double scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(double scalar); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + register int j, k; + double scalar, t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + BytesPerWord = sizeof(double); + printf("This system uses %d bytes per DOUBLE PRECISION word.\n", + BytesPerWord); + + printf(HLINE); + printf("Array size = %d, Offset = %d\n" , N, OFFSET); + printf("Total memory required = %.1f MB.\n", + (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); + printf("Each test is run %d times, but only\n", NTIMES); + printf("the *best* time for each is used.\n"); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel private(k) + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } +#endif + + /* Get initial value for system clock. 
*/ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < N; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +void checkSTREAMresults () +{ + double aj,bj,cj,scalar; + double asum,bsum,csum; + double epsilon; + int j,k; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k= 0 ? 
(a) : -(a)) + epsilon = 1.e-8; + + if (abs(aj-asum)/asum > epsilon) { + printf ("Failed Validation on array a[]\n"); + printf (" Expected : %f \n",aj); + printf (" Observed : %f \n",asum); + } + else if (abs(bj-bsum)/bsum > epsilon) { + printf ("Failed Validation on array b[]\n"); + printf (" Expected : %f \n",bj); + printf (" Observed : %f \n",bsum); + } + else if (abs(cj-csum)/csum > epsilon) { + printf ("Failed Validation on array c[]\n"); + printf (" Expected : %f \n",cj); + printf (" Observed : %f \n",csum); + } + else { + printf ("Solution Validates\n"); + } +} + +void tuned_STREAM_Copy() +{ + int j; +#pragma omp parallel for + for (j=0; j +#include +#include +#include + +double mysecond() +{ + long sec; + double secx; + struct tms realbuf; + + times(&realbuf); + secx = ( realbuf.tms_stime + realbuf.tms_utime ) / (float) CLK_TCK; + return ((double) secx); +} diff --git a/Versions/second_cpu.f b/Versions/second_cpu.f new file mode 100644 index 0000000..cef7c36 --- /dev/null +++ b/Versions/second_cpu.f @@ -0,0 +1,18 @@ +*------------------------------------- +* Sample timing routine +* This code works on Sun and Silicon Graphics machines. +* DOUBLE PRECISION function mysecond() +* real arg(2) +* mysecond = etime(arg) +* end +* Sample timing routine +* This code works on IBM RS/6000 machines + DOUBLE PRECISION FUNCTION mysecond() +C .. +C .. External Functions .. + INTEGER mclock + EXTERNAL mclock +C .. + mysecond = mclock()*0.01D0 + END + diff --git a/Versions/stream_5-10_posix_memalign.c b/Versions/stream_5-10_posix_memalign.c new file mode 100644 index 0000000..d2c722c --- /dev/null +++ b/Versions/stream_5-10_posix_memalign.c @@ -0,0 +1,609 @@ +/*-----------------------------------------------------------------------*/ +/* Program: STREAM */ +/* Revision: $Id: stream.c,v 5.10.1 2014/06/17 08:16:08 mccalpin Exp mccalpin $ */ +/* Original code developed by John D. McCalpin */ +/* Programmers: John D. McCalpin */ +/* Joe R. 
Zagar */ +/* */ +/* This program measures memory transfer rates in MB/s for simple */ +/* computational kernels coded in C. */ +/*-----------------------------------------------------------------------*/ +/* Copyright 1991-2013: John D. McCalpin */ +/*-----------------------------------------------------------------------*/ +/* License: */ +/* 1. You are free to use this program and/or to redistribute */ +/* this program. */ +/* 2. You are free to modify this program for your own use, */ +/* including commercial use, subject to the publication */ +/* restrictions in item 3. */ +/* 3. You are free to publish results obtained from running this */ +/* program, or from works that you derive from this program, */ +/* with the following limitations: */ +/* 3a. In order to be referred to as "STREAM benchmark results", */ +/* published results must be in conformance to the STREAM */ +/* Run Rules, (briefly reviewed below) published at */ +/* http://www.cs.virginia.edu/stream/ref.html */ +/* and incorporated herein by reference. */ +/* As the copyright holder, John McCalpin retains the */ +/* right to determine conformity with the Run Rules. */ +/* 3b. Results based on modified source code or on runs not in */ +/* accordance with the STREAM Run Rules must be clearly */ +/* labelled whenever they are published. Examples of */ +/* proper labelling include: */ +/* "tuned STREAM benchmark results" */ +/* "based on a variant of the STREAM benchmark code" */ +/* Other comparable, clear, and reasonable labelling is */ +/* acceptable. */ +/* 3c. Submission of results to the STREAM benchmark web site */ +/* is encouraged, but not required. */ +/* 4. Use of this program or creation of derived works based on this */ +/* program constitutes acceptance of these licensing restrictions. */ +/* 5. Absolutely no warranty is expressed or implied. 
*/ +/*-----------------------------------------------------------------------*/ +# include +# include +# include +# include +# include +# include +# include + +/*----------------------------------------------------------------------- + * INSTRUCTIONS: + * + * 1) STREAM requires different amounts of memory to run on different + * systems, depending on both the system cache size(s) and the + * granularity of the system timer. + * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) + * to meet *both* of the following criteria: + * (a) Each array must be at least 4 times the size of the + * available cache memory. I don't worry about the difference + * between 10^6 and 2^20, so in practice the minimum array size + * is about 3.8 times the cache size. + * Example 1: One Xeon E3 with 8 MB L3 cache + * STREAM_ARRAY_SIZE should be >= 4 million, giving + * an array size of 30.5 MB and a total memory requirement + * of 91.5 MB. + * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) + * STREAM_ARRAY_SIZE should be >= 20 million, giving + * an array size of 153 MB and a total memory requirement + * of 458 MB. + * (b) The size should be large enough so that the 'timing calibration' + * output by the program is at least 20 clock-ticks. + * Example: most versions of Windows have a 10 millisecond timer + * granularity. 20 "ticks" at 10 ms/tick is 200 milliseconds. + * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. + * This means that each array must be at least 1 GB, or 128M elements. + * + * Version 5.10 increases the default array size from 2 million + * elements to 10 million elements in response to the increasing + * size of L3 caches. The new default size is large enough for caches + * up to 20 MB. + * Version 5.10 changes the loop index variables from "register int" + * to "ssize_t", which allows array indices >2^32 (4 billion) + * on properly configured 64-bit systems.
Additional compiler options + * (such as "-mcmodel=medium") may be required for large memory runs. + * + * Array size can be set at compile time without modifying the source + * code for the (many) compilers that support preprocessor definitions + * on the compile line. E.g., + * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M + * will override the default size of 10M with a new size of 100M elements + * per array. + */ +#ifndef STREAM_ARRAY_SIZE +# define STREAM_ARRAY_SIZE 10000000 +#endif + +/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result + * for any iteration after the first, therefore the minimum value + * for NTIMES is 2. + * There are no rules on maximum allowable values for NTIMES, but + * when running with STREAM_TYPE=float, the results will overflow + * if NTIMES exceeds 32. Results will probably overflow at some + * point with STREAM_TYPE=double, but I have not checked the exact value. + * Values larger than the default are unlikely to noticeably + * increase the reported performance. + * NTIMES can also be set on the compile line without changing the source + * code using, for example, "-DNTIMES=7". + */ +#ifdef NTIMES +#if NTIMES<=1 +# define NTIMES 10 +#endif +#endif +#ifndef NTIMES +# define NTIMES 10 +#endif + +/* Users are allowed to modify the "OFFSET" variable, which *may* change the + * relative alignment of the arrays (though compilers may change the + * effective offset by making the arrays non-contiguous on some systems). + * Use of non-zero values for OFFSET can be especially helpful if the + * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. + * OFFSET can also be set on the compile line without changing the source + * code using, for example, "-DOFFSET=56". + */ +#ifndef OFFSET +# define OFFSET 0 +#endif + +/* + * 3) Compile the code with optimization. Many compilers generate + * unreasonably bad code before the optimizer tightens things up. 
+ * If the results are unreasonably good, on the other hand, the + * optimizer might be too smart for me! + * + * For a simple single-core version, try compiling with: + * cc -O stream.c -o stream + * This is known to work on many, many systems.... + * + * To use multiple cores, you need to tell the compiler to obey the OpenMP + * directives in the code. This varies by compiler, but a common example is + * gcc -O -fopenmp stream.c -o stream_omp + * The environment variable OMP_NUM_THREADS allows runtime control of the + * number of threads/cores used when the resulting "stream_omp" program + * is executed. + * + * To run with single-precision variables and arithmetic, simply add + * -DSTREAM_TYPE=float + * to the compile line. + * Note that this changes the minimum array sizes required --- see (1) above. + * + * The preprocessor directive "TUNED" does not do much -- it simply causes the + * code to call separate functions to execute each kernel. Trivial versions + * of these functions are provided, but they are *not* tuned -- they just + * provide predefined interfaces to be replaced with tuned code. + * + * + * 4) Optional: Mail the results to mccalpin@cs.virginia.edu + * Be sure to include info that will help me understand: + * a) the computer hardware configuration (e.g., processor model, memory type) + * b) the compiler name/version and compilation flags + * c) any run-time information (such as OMP_NUM_THREADS) + * d) all of the output from the test case. + * + * Thanks! 
+ * + *-----------------------------------------------------------------------*/ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +#ifndef STREAM_TYPE +#define STREAM_TYPE double +#endif + +//static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], +// b[STREAM_ARRAY_SIZE+OFFSET], +// c[STREAM_ARRAY_SIZE+OFFSET]; +double *a,*b,*c; + +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE + }; + +extern double mysecond(); +extern void checkSTREAMresults(); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(STREAM_TYPE scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(STREAM_TYPE scalar); +#endif +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + int k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + size_t arraybytes,arrayalignment; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + printf("STREAM version $Revision: 5.10 $\n"); + printf(HLINE); + BytesPerWord = sizeof(STREAM_TYPE); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + arraybytes = (STREAM_ARRAY_SIZE + OFFSET)*sizeof(STREAM_TYPE); + arrayalignment = 64; + k = posix_memalign((void **)&a, arrayalignment, arraybytes); + if (k != 0) { + printf("Allocation of array a failed, return code is %d\n",k); + exit(1); + } + k = posix_memalign((void **)&b, arrayalignment, arraybytes); + if (k != 0) { + printf("Allocation of array b 
failed, return code is %d\n",k); + exit(1); + } + k = posix_memalign((void **)&c, arrayalignment, arraybytes); + if (k != 0) { + printf("Allocation of array c failed, return code is %d\n",k); + exit(1); + } + + printf(HLINE); +#ifdef N + printf("***** WARNING: ******\n"); + printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); + printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); + printf("***** WARNING: ******\n"); +#endif + + printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); + printf("Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); + printf("Each kernel will be executed %d times.\n", NTIMES); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" will be used to compute the reported bandwidth.\n"); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } + } +#endif + +#ifdef _OPENMP + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted = %i\n",k); +#endif + + /* Get initial value for system clock. 
*/ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < STREAM_ARRAY_SIZE; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +#ifndef abs +#define abs(a) ((a) >= 0 ? 
(a) : -(a)) +#endif +void checkSTREAMresults () +{ + STREAM_TYPE aj,bj,cj,scalar; + STREAM_TYPE aSumErr,bSumErr,cSumErr; + STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; + double epsilon; + ssize_t j; + int k,ierr,err; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } +#endif + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } +#endif + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } +#endif + } + } + printf(" For array c[], %d errors were 
found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } +#ifdef VERBOSE + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); +#endif +} + +#ifdef TUNED +/* stubs for "tuned" versions of the kernels */ +void tuned_STREAM_Copy() +{ + ssize_t j; +#pragma omp parallel for + for (j=0; j +# include +# include +# include +# include + +/* + * Program: Stream + * Programmer: Joe R. Zagar + * Revision: 4.0-BETA, October 24, 1995 + * Original code developed by John D. McCalpin + * + * This program measures memory transfer rates in MB/s for simple + * computational kernels coded in C. These numbers reveal the quality + * of code generation for simple uncacheable kernels as well as showing + * the cost of floating-point operations relative to memory accesses. + * + * INSTRUCTIONS: + * + * 1) Stream requires a good bit of memory to run. Adjust the + * value of 'N' (below) to give a 'timing calibration' of + * at least 20 clock-ticks. This will provide rate estimates + * that should be good to about 5% precision. + */ + +# define N 2000000 +# define NTIMES 10 +# define OFFSET 0 + +/* + * 3) Compile the code with full optimization. Many compilers + * generate unreasonably bad code before the optimizer tightens + * things up. If the results are unreasonably good, on the + * other hand, the optimizer might be too smart for me! + * + * Try compiling with: + * cc -O stream_d.c second_wall.c -o stream_d -lm + * + * This is known to work on Cray, SGI, IBM, and Sun machines. + * + * + * 4) Mail the results to mccalpin@cs.virginia.edu + * Be sure to include: + * a) computer hardware model number and software revision + * b) the compiler flags + * c) all of the output from the test case. 
+ * Thanks! + * + */ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +static double a[N+OFFSET], + b[N+OFFSET], + c[N+OFFSET]; + +static double rmstime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(double) * N, + 2 * sizeof(double) * N, + 3 * sizeof(double) * N, + 3 * sizeof(double) * N + }; + +extern double mysecond(); + +int +main() + { + int quantum, checktick(); + int BytesPerWord; + register int j, k; + double scalar, t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + BytesPerWord = sizeof(double); + printf("This system uses %d bytes per DOUBLE PRECISION word.\n", + BytesPerWord); + + printf(HLINE); + printf("Array size = %d, Offset = %d\n" , N, OFFSET); + printf("Total memory required = %.1f MB.\n", + (3 * N * BytesPerWord) / 1048576.0); + printf("Each test is run %d times, but only\n", NTIMES); + printf("the *best* time for each is used.\n"); + + /* Get initial value for system clock. 
*/ + + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + + t = mysecond(); + for (j = 0; j < N; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k +# include +# include +# include +# include +# include +# include +# include +# include "mpi.h" + +/*----------------------------------------------------------------------- + * INSTRUCTIONS: + * + * 1) STREAM requires different amounts of memory to run on different + * systems, depending on both the system cache size(s) and the + * granularity of the system timer. + * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) + * to meet *both* of the following criteria: + * (a) Each array must be at least 4 times the size of the + * available cache memory. I don't worry about the difference + * between 10^6 and 2^20, so in practice the minimum array size + * is about 3.8 times the cache size. + * Example 1: One Xeon E3 with 8 MB L3 cache + * STREAM_ARRAY_SIZE should be >= 4 million, giving + * an array size of 30.5 MB and a total memory requirement + * of 91.5 MB. + * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) + * STREAM_ARRAY_SIZE should be >= 20 million, giving + * an array size of 153 MB and a total memory requirement + * of 458 MB. 
+ * (b) The size should be large enough so that the 'timing calibration' + * output by the program is at least 20 clock-ticks. + * Example: most versions of Windows have a 10 millisecond timer + * granularity. 20 "ticks" at 10 ms/tick is 200 milliseconds. + * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. + * This means that each array must be at least 1 GB, or 128M elements. + * + * Version 5.10 increases the default array size from 2 million + * elements to 10 million elements in response to the increasing + * size of L3 caches. The new default size is large enough for caches + * up to 20 MB. + * Version 5.10 changes the loop index variables from "register int" + * to "ssize_t", which allows array indices >2^32 (4 billion) + * on properly configured 64-bit systems. Additional compiler options + * (such as "-mcmodel=medium") may be required for large memory runs. + * + * Array size can be set at compile time without modifying the source + * code for the (many) compilers that support preprocessor definitions + * on the compile line. E.g., + * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M + * will override the default size of 10M with a new size of 100M elements + * per array. + */ + +// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------ +// For the MPI version of STREAM, the three arrays with this many elements +// each will be *distributed* across the MPI ranks. +// +// Be careful when computing the array size needed for a particular target +// system to meet the minimum size requirement to ensure overflowing the caches. +// +// Example: +// Assume 4 nodes with two Intel Xeon E5-2680 processors (20 MiB L3) each. +// The *total* L3 cache size is 4*2*20 = 160 MiB, so each array must be +// at least 640 MiB, or at least 80 million 8 Byte elements. +// Note that it does not matter whether you use one MPI rank per node or +// 16 MPI ranks per node -- only the total array size and the total +// cache size matter.
+// +#ifndef STREAM_ARRAY_SIZE +# define STREAM_ARRAY_SIZE 10000000 +#endif + +/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result + * for any iteration after the first, therefore the minimum value + * for NTIMES is 2. + * There are no rules on maximum allowable values for NTIMES, but + * values larger than the default are unlikely to noticeably + * increase the reported performance. + * NTIMES can also be set on the compile line without changing the source + * code using, for example, "-DNTIMES=7". + */ +#ifdef NTIMES +#if NTIMES<=1 +# define NTIMES 10 +#endif +#endif +#ifndef NTIMES +# define NTIMES 10 +#endif + +// Make the scalar coefficient modifiable at compile time. +// The old value of 3.0 caused floating-point overflows after a relatively small +// number of iterations. The new default of 0.42 allows over 2000 iterations for +// 32-bit IEEE arithmetic and over 18000 iterations for 64-bit IEEE arithmetic. +// The growth in the solution can be eliminated (almost) completely by setting +// the scalar value to 0.41421445, but this also means that the error checking +// code no longer triggers an error if the code does not actually execute the +// correct number of iterations! +#ifndef SCALAR +#define SCALAR 0.42 +#endif + + +// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------ +// The OFFSET preprocessor variable is not used in this version of the benchmark. +// The user must change the code at or after the "posix_memalign" array allocations +// to change the relative alignment of the pointers. +// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------ +#ifndef OFFSET +# define OFFSET 0 +#endif + + +/* + * 3) Compile the code with optimization. Many compilers generate + * unreasonably bad code before the optimizer tightens things up. + * If the results are unreasonably good, on the other hand, the + * optimizer might be too smart for me!
+ * + * For a simple single-core version, try compiling with: + * cc -O stream.c -o stream + * This is known to work on many, many systems.... + * + * To use multiple cores, you need to tell the compiler to obey the OpenMP + * directives in the code. This varies by compiler, but a common example is + * gcc -O -fopenmp stream.c -o stream_omp + * The environment variable OMP_NUM_THREADS allows runtime control of the + * number of threads/cores used when the resulting "stream_omp" program + * is executed. + * + * To run with single-precision variables and arithmetic, simply add + * -DSTREAM_TYPE=float + * to the compile line. + * Note that this changes the minimum array sizes required --- see (1) above. + * + * The preprocessor directive "TUNED" does not do much -- it simply causes the + * code to call separate functions to execute each kernel. Trivial versions + * of these functions are provided, but they are *not* tuned -- they just + * provide predefined interfaces to be replaced with tuned code. + * + * + * 4) Optional: Mail the results to mccalpin@cs.virginia.edu + * Be sure to include info that will help me understand: + * a) the computer hardware configuration (e.g., processor model, memory type) + * b) the compiler name/version and compilation flags + * c) any run-time information (such as OMP_NUM_THREADS) + * d) all of the output from the test case. + * + * Thanks! + * + *-----------------------------------------------------------------------*/ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +#ifndef STREAM_TYPE +#define STREAM_TYPE double +#endif + +//static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], +// b[STREAM_ARRAY_SIZE+OFFSET], +// c[STREAM_ARRAY_SIZE+OFFSET]; + +// Some compilers require an extra keyword to recognize the "restrict" qualifier. 
+double * restrict a, * restrict b, * restrict c; + +size_t array_elements, array_bytes, array_alignment; +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE + }; + +extern void checkSTREAMresults(STREAM_TYPE *AvgErrByRank, int numranks); +extern void computeSTREAMerrors(STREAM_TYPE *aAvgErr, STREAM_TYPE *bAvgErr, STREAM_TYPE *cAvgErr); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(STREAM_TYPE scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(STREAM_TYPE scalar); +#endif +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + int i,k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + double *TimesByRank; + double t0,t1,tmin; + int rc, numranks, myrank; + STREAM_TYPE AvgError[3] = {0.0,0.0,0.0}; + STREAM_TYPE *AvgErrByRank; + + /* --- SETUP --- call MPI_Init() before anything else! --- */ + + rc = MPI_Init(NULL, NULL); + t0 = MPI_Wtime(); + if (rc != MPI_SUCCESS) { + printf("ERROR: MPI Initialization failed with return code %d\n",rc); + exit(1); + } + // if either of these fail there is something really screwed up! 
+ MPI_Comm_size(MPI_COMM_WORLD, &numranks); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + /* --- NEW FEATURE --- distribute requested storage across MPI ranks --- */ + array_elements = STREAM_ARRAY_SIZE / numranks; // don't worry about rounding vs truncation + array_alignment = 64; // Can be modified -- provides partial support for adjusting relative alignment + + // Dynamically allocate the three arrays using "posix_memalign()" + // NOTE that the OFFSET parameter is not used in this version of the code! + array_bytes = array_elements * sizeof(STREAM_TYPE); + k = posix_memalign((void **)&a, array_alignment, array_bytes); + if (k != 0) { + printf("Rank %d: Allocation of array a failed, return code is %d\n",myrank,k); + MPI_Abort(MPI_COMM_WORLD, 2); + exit(1); + } + k = posix_memalign((void **)&b, array_alignment, array_bytes); + if (k != 0) { + printf("Rank %d: Allocation of array b failed, return code is %d\n",myrank,k); + MPI_Abort(MPI_COMM_WORLD, 2); + exit(1); + } + k = posix_memalign((void **)&c, array_alignment, array_bytes); + if (k != 0) { + printf("Rank %d: Allocation of array c failed, return code is %d\n",myrank,k); + MPI_Abort(MPI_COMM_WORLD, 2); + exit(1); + } + + // Initial informational printouts -- rank 0 handles all the output + if (myrank == 0) { + printf(HLINE); + printf("STREAM version $Revision: 1.8 $\n"); + printf(HLINE); + BytesPerWord = sizeof(STREAM_TYPE); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + printf(HLINE); +#ifdef N + printf("***** WARNING: ******\n"); + printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); + printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); + printf("***** WARNING: ******\n"); +#endif + if (OFFSET != 0) { + printf("***** WARNING: ******\n"); + printf(" This version ignores 
the OFFSET parameter.\n"); + printf("***** WARNING: ******\n"); + } + + printf("Total Aggregate Array size = %llu (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE); + printf("Total Aggregate Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total Aggregate memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); + printf("Data is distributed across %d MPI ranks\n",numranks); + printf(" Array size per MPI rank = %llu (elements)\n" , (unsigned long long) array_elements); + printf(" Memory per array per MPI rank = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) array_elements / 1024.0/1024.0), + BytesPerWord * ( (double) array_elements / 1024.0/1024.0/1024.0)); + printf(" Total memory per MPI rank = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024./1024.)); + + printf(HLINE); + printf("Each kernel will be executed %d times.\n", NTIMES); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" will be used to compute the reported bandwidth.\n"); + printf("The SCALAR value used for this run is %f\n",SCALAR); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested for each MPI rank = %i\n",k); + } + } +#endif + +#ifdef _OPENMP + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted for rank 0 = %i\n",k); +#endif + + } + + /* --- SETUP --- initialize arrays and estimate precision of timer --- */ + +#pragma omp parallel for + for (j=0; j= 1) + printf("Your timer granularity/precision appears to be " + "%d 
microseconds.\n", quantum); + else { + printf("Your timer granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + } + + /* Get initial timing estimate to compare to timer granularity. */ + /* All ranks need to run this code since it changes the values in array a */ + t = MPI_Wtime(); +#pragma omp parallel for + for (j = 0; j < array_elements; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (MPI_Wtime() - t); + + if (myrank == 0) { + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d timer ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 timer ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); +#ifdef VERBOSE + t1 = MPI_Wtime(); + printf("VERBOSE: total setup time for rank 0 = %f seconds\n",t1-t0); + printf(HLINE); +#endif + } + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + // This code has more barriers and timing calls than are actually needed, but + // this should not cause a problem for arrays that are large enough to satisfy + // the STREAM run rules. + // MAJOR FIX!!! Version 1.7 had the start timer for each loop *after* the + // MPI_Barrier(), when it should have been *before* the MPI_Barrier(). + // + + scalar = SCALAR; + for (k=0; k= 0 ? 
(a) : -(a)) +#endif +void computeSTREAMerrors(STREAM_TYPE *aAvgErr, STREAM_TYPE *bAvgErr, STREAM_TYPE *cAvgErr) +{ + STREAM_TYPE aj,bj,cj,scalar; + STREAM_TYPE aSumErr,bSumErr,cSumErr; + ssize_t j; + int k; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = SCALAR; + for (k=0; k epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } +#endif + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } +#endif + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } +#endif + } + } + printf(" For array c[], %d errors were 
found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } +#ifdef VERBOSE + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); +#endif +} + +#ifdef TUNED +/* stubs for "tuned" versions of the kernels */ +void tuned_STREAM_Copy() +{ + ssize_t j; +#pragma omp parallel for + for (j=0; j +# include +# include +# include +# include + +/* INSTRUCTIONS: + * + * 1) Stream requires a good bit of memory to run. Adjust the + * value of 'N' (below) to give a 'timing calibration' of + * at least 20 clock-ticks. This will provide rate estimates + * that should be good to about 5% precision. + */ + +# define N 2000000 +# define NTIMES 10 +# define OFFSET 0 + +/* + * 3) Compile the code with full optimization. Many compilers + * generate unreasonably bad code before the optimizer tightens + * things up. If the results are unreasonably good, on the + * other hand, the optimizer might be too smart for me! + * + * Try compiling with: + * cc -O stream_omp.c -o stream_omp + * + * This is known to work on Cray, SGI, IBM, and Sun machines. + * + * + * 4) Mail the results to mccalpin@cs.virginia.edu + * Be sure to include: + * a) computer hardware model number and software revision + * b) the compiler flags + * c) all of the output from the test case. + * Thanks! 
+ * + */ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +static double a[N+OFFSET], + b[N+OFFSET], + c[N+OFFSET]; + +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(double) * N, + 2 * sizeof(double) * N, + 3 * sizeof(double) * N, + 3 * sizeof(double) * N + }; + +extern double mysecond(); +extern void checkSTREAMresults(); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(double scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(double scalar); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + register int j, k; + double scalar, t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + BytesPerWord = sizeof(double); + printf("This system uses %d bytes per DOUBLE PRECISION word.\n", + BytesPerWord); + + printf(HLINE); + printf("Array size = %d, Offset = %d\n" , N, OFFSET); + printf("Total memory required = %.1f MB.\n", + (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); + printf("Each test is run %d times, but only\n", NTIMES); + printf("the *best* time for each is used.\n"); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel private(k) + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } +#endif + + /* Get initial value for system clock. 
*/ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < N; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +void checkSTREAMresults () +{ + double aj,bj,cj,scalar; + double asum,bsum,csum; + double epsilon; + int j,k; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k= 0 ? 
(a) : -(a)) + epsilon = 1.e-8; + + if (abs(aj-asum)/asum > epsilon) { + printf ("Failed Validation on array a[]\n"); + printf (" Expected : %f \n",aj); + printf (" Observed : %f \n",asum); + } + else if (abs(bj-bsum)/bsum > epsilon) { + printf ("Failed Validation on array b[]\n"); + printf (" Expected : %f \n",bj); + printf (" Observed : %f \n",bsum); + } + else if (abs(cj-csum)/csum > epsilon) { + printf ("Failed Validation on array c[]\n"); + printf (" Expected : %f \n",cj); + printf (" Observed : %f \n",csum); + } + else { + printf ("Solution Validates\n"); + } +} + +void tuned_STREAM_Copy() +{ + int j; +#pragma omp parallel for + for (j=0; j