diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index abc7b14b9..ef751105d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,75 +1,91 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +# DESCRIPTION: ############################################################################### # General GitLab pipelines configurations for supercomputers and Linux clusters # at Lawrence Livermore National Laboratory (LLNL). -# # This entire pipeline is LLNL-specific # -# Important note: This file is a copy of the template provided by -# llnl/radiuss-shared-ci. It should not require any change from the project to -# get started but could feature project-specific stages. +# Important note: This file is a template provided by llnl/radiuss-shared-ci. +# It remains to set variable values, change the reference to the +# radiuss-shared-ci repo, and opt in or out of optional features. The project +# can then extend it with additional stages. # -# Instead, each project should provide: -# - .gitlab/subscribed-pipelines.yml +# In addition, each project should copy over and complete: # - .gitlab/custom-jobs-and-variables.yml -# - .gitlab/${MACHINE}-build-and-test-extra.yml +# - .gitlab/subscribed-pipelines.yml +# +# The jobs should be specified in a file local to the project, +# - .gitlab/jobs/${CI_MACHINE}.yml +# or generated (see LLNL/Umpire for an example). ############################################################################### # We define the following GitLab pipeline variables: variables: -# Required information about GitHub repository - GITHUB_PROJECT_NAME: "RAJAPerf" - GITHUB_PROJECT_ORG: "LLNL" -# Use the umdev service user to run CI. This prevents from running pipelines as -# an actual user. +##### LC GITLAB CONFIGURATION +# Use an LLNL service user to run CI. This prevents pipelines from running as +# an actual user. LLNL_SERVICE_USER: rajasa # Use the service user workspace. Solves permission issues, stores everything # at the same location whoever triggers a pipeline. -# CUSTOM_CI_BUILDS_DIR: "" +# CUSTOM_CI_BUILDS_DIR: "/usr/workspace/rajasa/gitlab-runner" # Tells Gitlab to recursively update the submodules when cloning the project. GIT_SUBMODULE_STRATEGY: recursive -# We build the projects in the CI clone directory. -# TODO: add a clean-up mechanism + +##### PROJECT VARIABLES +# We build the projects in the CI clone directory (used in the +# scripts/gitlab/build_and_test.sh script). +# TODO: add a clean-up mechanism. BUILD_ROOT: ${CI_PROJECT_DIR} + +##### SHARED_CI CONFIGURATION +# Required information about GitHub repository + GITHUB_PROJECT_NAME: "RAJAPerf" + GITHUB_PROJECT_ORG: "LLNL" # Set the build-and-test command. - BUILD_AND_TEST_CMD: "./scripts/gitlab/build_and_test.sh" -# Override the list of branch that will skip the "draft PR test". -# Add protected branches here. Defaults to "develop main master". -# ALWAYS_RUN_LIST: "develop main" + JOB_CMD: + value: "./scripts/gitlab/build_and_test.sh" + expand: false +# Override the pattern describing branches that will skip the "draft PR filter +# test". Add protected branches here. See default value in +# preliminary-ignore-draft-pr.yml.
+# ALWAYS_RUN_PATTERN: "^develop$|^main$|^v[0-9.]*-RC$" -# We organize the CI on Gitlab in sub-pipelines. Each sub-pipeline corresponds -# to a test phase on a given machine. +# We organize the build-and-test stage with sub-pipelines. Each sub-pipeline +# corresponds to a test batch on a given machine. # High level stages stages: - - machine-checks + - prerequisites - build-and-test -# Template for jobs triggering a build-and-test sub-pipelines: +# Template for jobs triggering a build-and-test sub-pipeline: .build-and-test: stage: build-and-test trigger: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: v2023.06.0 - file: '${CI_MACHINE}-build-and-test.yml' - - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' + ref: 'v2024.06.0' + file: 'pipelines/${CI_MACHINE}.yml' + - artifact: '${CI_MACHINE}-jobs.yml' + job: 'generate-job-lists' strategy: depend forward: pipeline_variables: true include: - # checks preliminary to running the actual CI test (optional) + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' + # [Optional] checks preliminary to running the actual CI test #- project: 'radiuss/radiuss-shared-ci' - # ref: v2023.03.1 - # file: 'preliminary-ignore-draft-pr.yml' + # ref: 'v2024.06.0' + # file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index a4081efe1..931d1961b 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -15,19 +15,30 @@ variables: # Ruby # Arguments for top level allocation - RUBY_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=45 --nodes=1" + RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --time=40 --nodes=1" # Arguments for job level allocation - RUBY_BUILD_AND_TEST_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=30 --nodes=1" +# Note: We repeat the reservation, necessary when jobs are manually re-triggered. 
+ RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby - PROJECT_RUBY_DEPS: "" + PROJECT_RUBY_DEPS: "^blt@develop " + +# Poodle +# Arguments for top level allocation + POODLE_SHARED_ALLOC: "--exclusive --time=40 --nodes=1" +# Arguments for job level allocation + POODLE_JOB_ALLOC: "--nodes=1" +# Project specific variants for poodle + PROJECT_POODLE_VARIANTS: "~shared +openmp" +# Project specific deps for poodle + PROJECT_POODLE_DEPS: "^blt@develop " # Corona # Arguments for top level allocation - CORONA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + CORONA_SHARED_ALLOC: "--exclusive --time-limit=12m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation - CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=30m --nodes=1 --begin-time=+5s" + CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_CORONA_VARIANTS: "~shared ~openmp" # Project specific deps for corona @@ -35,28 +46,28 @@ variables: # Tioga # Arguments for top level allocation - TIOGA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + TIOGA_SHARED_ALLOC: "--queue=pci --exclusive --time-limit=26m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation - TIOGA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1 --begin-time=+5s" -# Project specific variants for corona - PROJECT_TIOGA_VARIANTS: "~shared ~openmp" -# Project specific deps for corona + TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" +# Project specific variants for tioga + PROJECT_TIOGA_VARIANTS: "~shared +openmp" +# Project specific deps for tioga PROJECT_TIOGA_DEPS: "^blt@develop " # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 30" + LASSEN_JOB_ALLOC: "1 -W 20 -q pci" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70" # Project specific deps for lassen - PROJECT_LASSEN_DEPS: "" + PROJECT_LASSEN_DEPS: "^blt@develop " # Configuration shared by build and test jobs specific to this project. # Not all configuration can be shared. Here projects can fine tune the # CI behavior. # See Umpire for an example (export junit test reports). -.custom_build_and_test: +.custom_job: artifacts: reports: junit: junit.xml diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/jobs/corona.yml similarity index 76% rename from .gitlab/corona-build-and-test-extra.yml rename to .gitlab/jobs/corona.yml index 03d67218a..8fec233c5 100644 --- a/.gitlab/corona-build-and-test-extra.yml +++ b/.gitlab/jobs/corona.yml @@ -1,11 +1,18 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################# +# Override reproducer section to define project specific variables. 
+.corona_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/jobs/lassen.yml similarity index 50% rename from .gitlab/lassen-build-and-test-extra.yml rename to .gitlab/jobs/lassen.yml index 68850e5e8..c6eacf864 100644 --- a/.gitlab/lassen-build-and-test-extra.yml +++ b/.gitlab/jobs/lassen.yml @@ -1,11 +1,18 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## +# Override reproducer section to define project specific variables. +.lassen_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -16,18 +23,10 @@ # Overriding shared spec: Longer allocation + extra flags xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@=16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.2.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" - extends: .build_and_test_on_lassen - -# Overriding shared spec: Longer allocation + extra flags -xl_2022_08_19_gcc_8_3_1_cuda_11_7_0: - variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.7.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" - MODULE_LIST: "cuda/11.7.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" - extends: .build_and_test_on_lassen + LASSEN_JOB_ALLOC: "1 -W 60 -q pci" + extends: .job_on_lassen ############ @@ -37,16 +36,24 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_7_0: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. 
-########## -# CUDA -########## +gcc_8_3_1: + variables: + SPEC: " ~shared +openmp %gcc@=8.3.1 ${PROJECT_LASSEN_DEPS}" + extends: .job_on_lassen gcc_8_3_1_cuda_11_5_0_ats_disabled: - extends: .build_and_test_on_lassen + extends: .job_on_lassen + variables: + SPEC: " ~shared +openmp +cuda %gcc@=8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + MODULE_LIST: "cuda/11.5.0" + LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" + +gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi: + extends: .job_on_lassen variables: - SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers" + SPEC: " ~shared +openmp +cuda +mpi %gcc@=8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.5.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 30" + LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" ########## # OTHERS ########## @@ -54,18 +61,18 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled: clang_13_0_1_libcpp: variables: - SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\"" - extends: .build_and_test_on_lassen + SPEC: " ~shared +openmp %clang@=13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ${PROJECT_LASSEN_DEPS}" + extends: .job_on_lassen #clang_14_0_5_asan: # variables: -# SPEC: " ~shared +openmp %clang@14.0.5 cxxflags==\"-fsanitize=address\"" +# SPEC: " ~shared +openmp %clang@=14.0.5 cxxflags==\"-fsanitize=address\" ${PROJECT_LASSEN_DEPS}" # ASAN_OPTIONS: "detect_leaks=1" # LSAN_OPTIONS: "suppressions=${CI_PROJECT_DIR}/tpl/RAJA/suppressions.asan" -# extends: .build_and_test_on_lassen +# extends: .job_on_lassen # Activated in RAJA, but we don't use desul atomics here #gcc_8_3_1_cuda_10_1_168_desul_atomics: # variables: -# SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" -# extends: .build_and_test_on_lassen +# SPEC: "+openmp +cuda +desul %gcc@=8.3.1 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" +# extends: .job_on_lassen diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml new file mode 100644 index 000000000..ed18f60f5 --- /dev/null +++ b/.gitlab/jobs/poodle.yml @@ -0,0 +1,55 @@ +############################################################################## +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +# Override reproducer section to define project specific variables. +.poodle_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} so that +# the comparison with the original job is easier.
+ +clang_14_0_6: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@=14.0.6 ${PROJECT_POODLE_DEPS}" + extends: .job_on_poodle + +gcc_10_3_1: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@=10.3.1 ${PROJECT_POODLE_DEPS}" + extends: .job_on_poodle + +intel_19_1_2_gcc_10_3_1: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" + extends: .job_on_poodle + +intel_2022_1_0: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=2022.1.0 ${PROJECT_POODLE_DEPS}" + allow_failure: true + extends: .job_on_poodle + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. + +intel_2022_1_0_mpi: + variables: + SPEC: "~shared +openmp +mpi %intel@=2022.1.0 ^mvapich2 ${PROJECT_POODLE_DEPS}" + allow_failure: true + extends: .job_on_poodle diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/jobs/ruby.yml similarity index 51% rename from .gitlab/ruby-build-and-test-extra.yml rename to .gitlab/jobs/ruby.yml index da320f4f8..3502ed3fb 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/jobs/ruby.yml @@ -1,35 +1,46 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## -######################## +# Override reproducer section to define project specific variables. +.ruby_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + +######################## # Overridden shared jobs ######################## # We duplicate the shared jobs description and add necessary changes for RAJA. # We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that # the comparison with the original job is easier. -# Overriding shared config for longer run and algorithm variants clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask %clang@14.0.6" - extends: .build_and_test_on_ruby + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@=14.0.6 ${PROJECT_RUBY_DEPS}" + extends: .job_on_ruby gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask %gcc@10.3.1" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@=10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" - extends: .build_and_test_on_ruby + extends: .job_on_ruby -intel_19_1_2_gcc_8_5_0: +intel_19_1_2_gcc_10_3_1: variables: - SPEC: " +openmp %intel@19.1.2.gcc.8.5.0" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" - extends: .build_and_test_on_ruby + extends: .job_on_ruby + +intel_2022_1_0: + variables: + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}" + extends: .job_on_ruby ############ # Extra jobs ############ @@ -37,3 +48,8 @@ intel_19_1_2_gcc_8_5_0: # We do not recommend using ${PROJECT__VARIANTS} and # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here.
+ +intel_2022_1_0_mpi: + variables: + SPEC: "~shared +openmp +mpi %intel@=2022.1.0 ^mvapich2 ${PROJECT_RUBY_DEPS}" + extends: .job_on_ruby diff --git a/.gitlab/tioga-build-and-test-extra.yml b/.gitlab/jobs/tioga.yml similarity index 57% rename from .gitlab/tioga-build-and-test-extra.yml rename to .gitlab/jobs/tioga.yml index 02a2feef6..bcf9eccb8 100644 --- a/.gitlab/tioga-build-and-test-extra.yml +++ b/.gitlab/jobs/tioga.yml @@ -1,11 +1,18 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################# +# Override reproducer section to define project specific variables. +.tioga_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -15,8 +22,6 @@ # No overridden jobs so far. -# In post-build phase, deallocate resources. - ############ # Extra jobs ############ @@ -24,11 +29,13 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -# With GitLab CI, included files cannot be empty. -#variables: -# INCLUDED_FILE_CANNOT_BE_EMPTY: "True" +rocmcc_6_1_1_hip_openmp: + variables: + SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ${PROJECT_TIOGA_DEPS}" + extends: .job_on_tioga -rocmcc_5_4_3_hip_openmp: +rocmcc_6_1_1_hip_openmp_mpi: variables: - SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.4.3 ^hip@5.4.3 ^blt@develop" - extends: .build_and_test_on_tioga + SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ${PROJECT_TIOGA_DEPS}" + extends: .job_on_tioga + allow_failure: true diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index 108e84a54..7e60a05e9 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -9,7 +9,7 @@ # The template job to test whether a machine is up. # Expects CI_MACHINE defined to machine name. .machine-check: - stage: machine-checks + stage: prerequisites tags: [shell, oslic] variables: GIT_STRATEGY: none @@ -30,6 +30,30 @@ # Comment the jobs for machines you don’t need. 
### +# One job to generate the job list for all the subpipelines +generate-job-lists: + stage: prerequisites + tags: [shell, oslic] + variables: + GIT_SUBMODULE_DEPTH: 2 + GIT_SUBMODULE_STRATEGY: recursive + GIT_SUBMODULE_PATHS: tpl/RAJA + RADIUSS_JOBS_PATH: "tpl/RAJA/scripts/radiuss-spack-configs/gitlab/radiuss-jobs" + LOCAL_JOBS_PATH: ".gitlab/jobs" + script: + - cat ${RADIUSS_JOBS_PATH}/ruby.yml ${LOCAL_JOBS_PATH}/ruby.yml > ruby-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/poodle.yml ${LOCAL_JOBS_PATH}/poodle.yml > poodle-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/lassen.yml ${LOCAL_JOBS_PATH}/lassen.yml > lassen-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/corona.yml ${LOCAL_JOBS_PATH}/corona.yml > corona-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/tioga.yml ${LOCAL_JOBS_PATH}/tioga.yml > tioga-jobs.yml + artifacts: + paths: + - ruby-jobs.yml + - poodle-jobs.yml + - lassen-jobs.yml + - corona-jobs.yml + - tioga-jobs.yml + # RUBY ruby-up-check: variables: @@ -39,7 +63,19 @@ ruby-up-check: ruby-build-and-test: variables: CI_MACHINE: "ruby" - needs: [ruby-up-check] + needs: [ruby-up-check, generate-job-lists] + extends: [.build-and-test] + +# POODLE +poodle-up-check: + variables: + CI_MACHINE: "poodle" + extends: [.machine-check] + +poodle-build-and-test: + variables: + CI_MACHINE: "poodle" + needs: [poodle-up-check, generate-job-lists] extends: [.build-and-test] # CORONA @@ -51,7 +87,7 @@ corona-up-check: corona-build-and-test: variables: CI_MACHINE: "corona" - needs: [corona-up-check] + needs: [corona-up-check, generate-job-lists] extends: [.build-and-test] # TIOGA @@ -63,7 +99,7 @@ tioga-up-check: tioga-build-and-test: variables: CI_MACHINE: "tioga" - needs: [tioga-up-check] + needs: [tioga-up-check, generate-job-lists] extends: [.build-and-test] # LASSEN @@ -75,7 +111,7 @@ lassen-up-check: lassen-build-and-test: variables: CI_MACHINE: "lassen" - needs: [lassen-up-check] + needs: [lassen-up-check, generate-job-lists] extends: [.build-and-test] diff --git a/.uberenv_config.json b/.uberenv_config.json index e2353e1c9..fda595d3a 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,7 +4,7 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_branch": "e4s-23.02", +"spack_branch": "develop-2024-05-26", "spack_activate" : {}, "spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", "spack_packages_path": "tpl/RAJA/scripts/radiuss-spack-configs/packages", diff --git a/CMakeLists.txt b/CMakeLists.txt index 812d339b0..b9d0bd3c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -16,7 +16,7 @@ else() endif() option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable -this, and all other variants, to run _only_ raw C loops." On) +this, and all other variants, to run _only_ base variants." 
On) option(ENABLE_KOKKOS "Include Kokkos implementations of the kernels in the RAJA Perfsuite" Off) # @@ -27,7 +27,7 @@ if (PERFSUITE_ENABLE_WARNINGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") endif() -if (ENABLE_KOKKOS) +if (ENABLE_KOKKOS OR ENABLE_SYCL) set(CMAKE_CXX_STANDARD 17) set(BLT_CXX_STD c++17) else() @@ -51,6 +51,12 @@ if (ENABLE_TESTS) endif() cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) +if (RAJA_PERFSUITE_ENABLE_MPI) +set(RAJA_PERFSUITE_NUM_MPI_TASKS 4 CACHE STRING "Number of MPI tasks in tests") +else() +set(RAJA_PERFSUITE_NUM_MPI_TASKS 0 CACHE INTERNAL "Number of MPI tasks in tests") +endif() +message(STATUS "Using RAJA_PERFSUITE_NUM_MPI_TASKS: ${RAJA_PERFSUITE_NUM_MPI_TASKS}") cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) @@ -67,12 +73,33 @@ set(ENABLE_TBB Off CACHE BOOL "") set(RAJA_USE_CHRONO On CACHE BOOL "") +set(RAJA_PERFSUITE_TUNING_CUDA_ARCH "0" CACHE STRING "CUDA arch to tune the execution for, ex '700' for sm_70") +set(RAJA_PERFSUITE_TUNING_HIP_ARCH "0" CACHE STRING "HIP arch to tune the execution for, ex '910' for gfx90a, '942' for gfx942") + set(RAJA_PERFSUITE_GPU_BLOCKSIZES "" CACHE STRING "Comma separated list of GPU block sizes, ex '256,1024'") +set(RAJA_PERFSUITE_ATOMIC_REPLICATIONS "" CACHE STRING "Comma separated list of atomic replications, ex '1,256,4096'") + +set(RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD "" CACHE STRING "Comma separated list of GPU items per thread, ex '1,2,4,8'") + set(RAJA_RANGE_ALIGN 4) set(RAJA_RANGE_MIN_LENGTH 32) set(RAJA_DATA_ALIGN 64) +string(LENGTH "${RAJA_PERFSUITE_TUNING_CUDA_ARCH}" CUDA_ARCH_LENGTH) +if (CUDA_ARCH_LENGTH GREATER 1) + message(STATUS "Using cuda tunings for arch: ${RAJA_PERFSUITE_TUNING_CUDA_ARCH}") +else() + message(STATUS "Using default cuda arch tunings") +endif() + +string(LENGTH "${RAJA_PERFSUITE_TUNING_HIP_ARCH}" HIP_ARCH_LENGTH) +if (HIP_ARCH_LENGTH GREATER 1) + message(STATUS "Using hip tunings for arch: ${RAJA_PERFSUITE_TUNING_HIP_ARCH}") +else() + message(STATUS "Using default hip arch tunings") +endif() + string(LENGTH "${RAJA_PERFSUITE_GPU_BLOCKSIZES}" BLOCKSIZES_LENGTH) if (BLOCKSIZES_LENGTH GREATER 0) message(STATUS "Using gpu block size(s): ${RAJA_PERFSUITE_GPU_BLOCKSIZES}") @@ -80,6 +107,20 @@ else() message(STATUS "Using default gpu block size(s)") endif() +string(LENGTH "${RAJA_PERFSUITE_ATOMIC_REPLICATIONS}" ATOMIC_REPLICATIONS_LENGTH) +if (ATOMIC_REPLICATIONS_LENGTH GREATER 0) + message(STATUS "Using atomic replication(s): ${RAJA_PERFSUITE_ATOMIC_REPLICATIONS}") +else() + message(STATUS "Using default atomic replication(s)") +endif() + +string(LENGTH "${RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD}" GPU_ITEMS_PER_THREAD_LENGTH) +if (GPU_ITEMS_PER_THREAD_LENGTH GREATER 0) + message(STATUS "Using gpu items per thread(s): ${RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD}") +else() + message(STATUS "Using default gpu items per thread(s)") +endif() + # exclude RAJA make targets from top-level build...
add_subdirectory(tpl/RAJA) @@ -95,8 +136,8 @@ if (ENABLE_OPENMP) add_definitions(-DRUN_OPENMP) endif () -set(RAJA_PERFSUITE_VERSION_MAJOR 2023) -set(RAJA_PERFSUITE_VERSION_MINOR 06) +set(RAJA_PERFSUITE_VERSION_MAJOR 2024) +set(RAJA_PERFSUITE_VERSION_MINOR 07) set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0) set(RAJA_PERFSUITE_DEPENDS RAJA) @@ -110,6 +151,9 @@ endif() if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) endif() +if (ENABLE_SYCL) + list(APPEND RAJA_PERFSUITE_DEPENDS sycl) +endif() # Kokkos requires hipcc as the CMAKE_CXX_COMPILER for HIP AMD/VEGA GPU # platforms, whereas RAJAPerf Suite uses blt/CMake FindHIP to set HIP compiler. diff --git a/Dockerfile b/Dockerfile index 9d7f6b197..b15e3c102 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ ############################################################################## -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -111,12 +111,12 @@ RUN . /opt/spack/share/spack/setup-env.sh && \ ## make -j 6 && \ ## cd .. && rm -rf build -FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-23.04:intel-2023.2.1 AS sycl ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 -DENABLE_TESTS=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp" && \ + ./bin/raja-perf.exe --checkrun --exclude-variants Base_SYCL RAJA_SYCL -sp" && \ cd .. && rm -rf build diff --git a/LICENSE b/LICENSE index 039a20b01..27c1ef431 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2017-2023, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index bf2eee850..04aeea048 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [comment]: # (#################################################################) -[comment]: # (Copyright 2017-23, Lawrence Livermore National Security, LLC) +[comment]: # (Copyright 2017-24, Lawrence Livermore National Security, LLC) [comment]: # (and RAJA Performance Suite project contributors.) [comment]: # (See the RAJAPerf/LICENSE file for details.) [comment]: # diff --git a/RELEASE b/RELEASE index 4b8dcac50..61fc02251 100644 --- a/RELEASE +++ b/RELEASE @@ -2,7 +2,7 @@ RAJA Performance Suite -Copyright (c) 2017-23, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-24, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. All rights reserved. See details in the RAJAPerf/LICENSE file. diff --git a/TODO/WIP-COUPLE.cpp b/TODO/WIP-COUPLE.cpp index 2e7c70197..6f0feeed8 100644 --- a/TODO/WIP-COUPLE.cpp +++ b/TODO/WIP-COUPLE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -110,7 +110,7 @@ void COUPLE::runKernel(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( + RAJA::forall( RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { COUPLE_BODY; }); diff --git a/TODO/WIP-COUPLE.hpp b/TODO/WIP-COUPLE.hpp index 33faa85cc..bf29503f3 100644 --- a/TODO/WIP-COUPLE.hpp +++ b/TODO/WIP-COUPLE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/azure-pipelines.yml b/azure-pipelines.yml index da8637d19..41f9c0cd7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -56,8 +56,8 @@ jobs: ## docker_target: nvcc11.1.1-debug ## hip5.1.3: ## docker_target: hip5.1.3 -## sycl: -## docker_target: sycl + sycl: + docker_target: sycl pool: vmImage: 'ubuntu-latest' variables: diff --git a/blt b/blt index 5a792c177..9ff77344f 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 5a792c1775e7a7628d84dcde31652a689f1df7b5 +Subproject commit 9ff77344f0b2a6ee345e452bddd6bfd46cbbfa35 diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index ac86f5bcc..9b4df01d6 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/docs/conf.py b/docs/conf.py index 6673fa10f..ee3729c8f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,16 +79,16 @@ # General information about the project. project = u'RAJAPerf' -copyright = u'2017-2023, Lawrence Livermore National Security, LLNS' +copyright = u'2017-2024, Lawrence Livermore National Security, LLNS' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u'2022.10' +version = u'2024.07' # The full version, including alpha/beta/rc tags. -release = u'2022.10.0' +release = u'2024.07.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index 12ec445a5..438c89f82 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/requirements.txt b/docs/requirements.txt index 6b1d35172..0a42ee80c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,3 @@ -docutils<0.20 +docutils +sphinx==6.2.1 +sphinx-rtd-theme==1.2.2 diff --git a/docs/sphinx/dev_guide/branch_development.rst b/docs/sphinx/dev_guide/branch_development.rst index 8d2e04437..318076584 100644 --- a/docs/sphinx/dev_guide/branch_development.rst +++ b/docs/sphinx/dev_guide/branch_development.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. 
## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/build_configurations.rst b/docs/sphinx/dev_guide/build_configurations.rst index 7ce70decf..4972d85a2 100644 --- a/docs/sphinx/dev_guide/build_configurations.rst +++ b/docs/sphinx/dev_guide/build_configurations.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst index 231b00ee3..1fdd1a55f 100644 --- a/docs/sphinx/dev_guide/ci.rst +++ b/docs/sphinx/dev_guide/ci.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst index 74f86d3cd..bdac32a30 100644 --- a/docs/sphinx/dev_guide/contributing.rst +++ b/docs/sphinx/dev_guide/contributing.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index c2c976ff3..d04aa25ab 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/kernel_class.rst b/docs/sphinx/dev_guide/kernel_class.rst index 5d544dd68..015b7592f 100644 --- a/docs/sphinx/dev_guide/kernel_class.rst +++ b/docs/sphinx/dev_guide/kernel_class.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 38d8274a0..05271dd8e 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/release_process.rst b/docs/sphinx/dev_guide/release_process.rst index 8b1942758..0542ec08e 100644 --- a/docs/sphinx/dev_guide/release_process.rst +++ b/docs/sphinx/dev_guide/release_process.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. 
## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/structure.rst b/docs/sphinx/dev_guide/structure.rst index 5c25ef2a2..bc11f9941 100644 --- a/docs/sphinx/dev_guide/structure.rst +++ b/docs/sphinx/dev_guide/structure.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/rajaperf_license.rst b/docs/sphinx/rajaperf_license.rst index a7985861f..5233fff7b 100644 --- a/docs/sphinx/rajaperf_license.rst +++ b/docs/sphinx/rajaperf_license.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## @@ -12,7 +12,7 @@ RAJA Performance Suite Copyright and License Information ========================================================== -Copyright (c) 2017-23, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-24, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. diff --git a/docs/sphinx/user_guide/CMakeLists.txt b/docs/sphinx/user_guide/CMakeLists.txt index 912f38a7a..e084390e8 100644 --- a/docs/sphinx/user_guide/CMakeLists.txt +++ b/docs/sphinx/user_guide/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA erformance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/docs/sphinx/user_guide/build.rst b/docs/sphinx/user_guide/build.rst index 082fb9f4e..372f495e9 100644 --- a/docs/sphinx/user_guide/build.rst +++ b/docs/sphinx/user_guide/build.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## @@ -201,7 +201,7 @@ multiple versions of GPU kernels that will run with different GPU thread-block sizes. The CMake option for this is ``-DRAJA_PERFSUITE_GPU_BLOCKSIZES=``. For example:: - $ mkdir my-gnu-build + $ mkdir my-gpu-build $ cd my-gpu-build $ cmake \ -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ @@ -211,6 +211,41 @@ sizes. The CMake option for this is will build versions of GPU kernels that use 64, 128, 256, 512, and 1024 threads per GPU thread-block. +Building with specific GPU atomic replication tunings +----------------------------------------------------- + +If desired, you can build a version of the RAJA Performance Suite code with +multiple versions of GPU kernels that will run with different GPU atomic +replication amounts. The CMake option for this is +``-DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=``. For example:: + + $ mkdir my-gpu-build + $ cd my-gpu-build + $ cmake \ + -DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=1,256,4096 \ + .. + $ make -j + +will build versions of GPU kernels that use 1, 256, and 4096 atomic +replications. 
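+ +These tuning options are independent CMake cache variables, so they can be +combined in a single configure step; a minimal sketch (the values shown are +illustrative only, not recommendations):: + + $ cmake \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=256,1024 \ + -DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=1,256,4096 \ + .. + $ make -j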
+ +Building with specific GPU items per thread tunings +----------------------------------------------------- + +If desired, you can build a version of the RAJA Performance Suite code with +multiple versions of GPU kernels that will run with different GPU items per +thread amounts. The CMake option for this is +``-DRAJA_PERFSUITE_GPU_ITEMS_PER_THREAD=``. For example:: + + $ mkdir my-gpu-build + $ cd my-gpu-build + $ cmake \ + -DRAJA_PERFSUITE_GPU_ITEMS_PER_THREAD=1,2,4,8 \ + .. + $ make -j + +will build versions of GPU kernels that use 1, 2, 4, and 8 items per thread. + Building with Caliper --------------------- diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index 0bd7d5570..33475a6b9 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst index 3d0879278..2af530e9a 100644 --- a/docs/sphinx/user_guide/output.rst +++ b/docs/sphinx/user_guide/output.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index 19a8917bd..083263d61 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh new file mode 100755 index 000000000..f002631f3 --- /dev/null +++ b/scripts/alcf-builds/sycl.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + + +BUILD_SUFFIX=sycl +: ${BUILD_TYPE:=RelWithDebInfo} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/alcf-builds/sycl.cmake + +rm -rf build_${BUILD_SUFFIX}_${USER} >/dev/null +mkdir build_${BUILD_SUFFIX}_${USER} && cd build_${BUILD_SUFFIX}_${USER} + +cmake \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_OPENMP=Off \ + -DENABLE_CUDA=Off \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ + -DENABLE_TARGET_OPENMP=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DENABLE_SYCL=On \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_LINKER=icpx \ + "$@" \ + .. 
+ +make -j 18 diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index de837bed9..430d50b4b 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -7,7 +7,7 @@ then fi ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC and RAJA +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC and RAJA # project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -21,17 +21,26 @@ hostname="$(hostname)" truehostname=${hostname//[0-9]/} project_dir="$(pwd)" -build_root=${BUILD_ROOT:-""} hostconfig=${HOST_CONFIG:-""} spec=${SPEC:-""} +module_list=${MODULE_LIST:-""} job_unique_id=${CI_JOB_ID:-""} +use_dev_shm=${USE_DEV_SHM:-true} + raja_version=${UPDATE_RAJA:-""} sys_type=${SYS_TYPE:-""} -use_dev_shm=${USE_DEV_SHM:-true} spack_upstream_path=${SPACK_UPSTREAM_PATH:-"/usr/workspace/umdev/RAJAPerf/upstream"} update_spack_upstream=${UPDATE_SPACK_UPSTREAM:-false} +if [[ -n ${module_list} ]] +then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Modules to load: ${module_list}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + module load ${module_list} +fi + prefix="" if [[ ${update_spack_upstream} == true ]] @@ -55,8 +64,9 @@ then prefix="${prefix}-${job_unique_id}" mkdir -p ${prefix} else - prefix="spack-and-build-root" - mkdir ${prefix} + # We set the prefix in the parent directory so that spack dependencies are not installed inside the source tree. + prefix="$(pwd)/../spack-and-build-root" + mkdir -p ${prefix} fi # Dependencies @@ -131,17 +141,8 @@ fi hostconfig=$(basename ${hostconfig_path}) # Build Directory -if [[ -z ${build_root} ]] -then - if [[ -d /dev/shm && ${use_dev_shm} == true ]] - then - build_root="${prefix}" - else - build_root="$(pwd)" - fi -else - build_root="${build_root}" -fi +# When using /dev/shm, we use prefix for both spack builds and source build, unless BUILD_ROOT was defined +build_root=${BUILD_ROOT:-"${prefix}"} build_dir="${build_root}/build_${hostconfig//.cmake/}" @@ -162,7 +163,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" # Map CPU core allocations - declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32) + declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["poodle"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32) # If using Multi-project, set up the submodule if [[ -n ${raja_version} ]] @@ -186,25 +187,34 @@ then rm -rf ${build_dir} 2>/dev/null mkdir -p ${build_dir} && cd ${build_dir} - date + # We set the MPI tests command to allow overlapping. + # Shared allocation: Allows build_and_test.sh to run within a sub-allocation (see CI config). + # Use /dev/shm: Prevent MPI tests from running on a node where the build dir doesn't exist. + cmake_options="" + if [[ "${truehostname}" == "ruby" || "${truehostname}" == "poodle" ]] + then + cmake_options="-DBLT_MPI_COMMAND_APPEND:STRING=--overlap" + fi + date if [[ "${truehostname}" == "corona" || "${truehostname}" == "tioga" ]] then module unload rocm fi $cmake_exe \ -C ${hostconfig_path} \ + ${cmake_options} \ ${project_dir} if ! $cmake_exe --build . -j ${core_counts[$truehostname]} then echo "ERROR: compilation failed, building with verbose output..." $cmake_exe --build . 
--verbose -j 1 fi + date echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ RAJA Perf Suite Built" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - date fi if [[ ! -d ${build_dir} ]] @@ -214,6 +224,7 @@ fi cd ${build_dir} +date echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ TESTING RAJAPERF SUITE" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" @@ -226,33 +237,12 @@ then # in case we want to make them disctinct in the future. # - if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - then - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun -n1 ... ctest --output-on-failure -T test" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test - else - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun -n1 ... ctest --output-on-failure -T test" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test - fi - else - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - then - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - ctest --output-on-failure -T test 2>&1 | tee tests_output.txt - else - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - ctest --output-on-failure -T test 2>&1 | tee tests_output.txt - fi - fi + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + date + ctest --output-on-failure -T test 2>&1 | tee tests_output.txt + date no_test_str="No tests were found!!!" if [[ "$(tail -n 1 tests_output.txt)" == "${no_test_str}" ]] diff --git a/scripts/install_llvm.sh b/scripts/install_llvm.sh index f1a16dcfa..b264f59de 100755 --- a/scripts/install_llvm.sh +++ b/scripts/install_llvm.sh @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index a6fc06451..15fde9bf1 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -1,17 +1,17 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_clang.sh 11.0.1" - echo " -or - " + echo " - or - " echo " blueos_clang.sh ibm-10.0.1-gcc-8.3.1" exit fi diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index 2f7fdf5e9..67ffdcf91 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -1,17 +1,17 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_clang_omptarget.sh 10.0.1-gcc-8.3.1" - echo " - or - " + echo " - or -" echo " blueos_clang_omptarget.sh ibm-10.0.1-gcc-8.3.1" exit fi diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index b51ad749a..fe71ddf77 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_gcc.sh 8.3.1" diff --git a/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh new file mode 100755 index 000000000..14118494a --- /dev/null +++ b/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 5 ]]; then + echo + echo "You must pass 5 arguments to the script (in this order): " + echo " 1) compiler version number for nvcc" + echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)" + echo " 3) compiler version number for clang. 
" + echo " 4) path to caliper cmake directory" + echo " 5) path to adiak cmake directory" + echo + echo "For example: " + echo " blueos_nvcc_clang-mpi_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak" + exit +fi + +COMP_NVCC_VER=$1 +COMP_ARCH=$2 +COMP_CLANG_VER=$3 +CALI_DIR=$4 +ADIAK_DIR=$5 +shift 5 + +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-mpi-${COMP_CLANG_VER}-caliper +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.20.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=128,256,512,1024 \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to disable CUDA GPU hooks when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -1 --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 9801459b9..59b74d923 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_CLANG_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh index b121d68c2..238b9a30e 100755 --- a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh +++ b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 5 ]]; then +if [[ $# -lt 5 ]]; then echo echo "You must pass 5 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" @@ -17,7 +17,7 @@ if [[ $# -ne 5 ]]; then echo " 5) path to adiak cmake directory" echo echo "For example: " - echo " blueos_nvcc_clang_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/adiak-lassen/lib/cmake/adiak" + echo " blueos_nvcc_clang_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak" exit fi @@ -28,7 +28,7 @@ CALI_DIR=$4 ADIAK_DIR=$5 shift 5 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo @@ -49,6 +49,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh b/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh new file mode 100755 index 000000000..9fdcdb3a7 --- /dev/null +++ b/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 5 ]]; then + echo + echo "You must pass 5 arguments to the script (in this order): " + echo " 1) compiler version number for nvcc" + echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)" + echo " 3) compiler version number for gcc" + echo " 4) path to caliper cmake directory" + echo " 5) path to adiak cmake directory" + echo + echo "For example: " + echo " blueos_nvcc_gcc-mpi_caliper.sh 10.2.89 70 8.3.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak" + exit +fi + +COMP_NVCC_VER=$1 +COMP_ARCH=$2 +COMP_GCC_VER=$3 +CALI_DIR=$4 +ADIAK_DIR=$5 +shift 5 + +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER}-mpi-caliper +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.20.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=128,256,512,1024 \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to disable CUDA GPU hooks when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -1 --smpiargs=\"-disable_gpu_hooks\" ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 200e86f9b..d1e24fdac 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details.
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_GCC_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake echo @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 9f2489694..1950dcadc 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_XL_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-xl${COMP_XL_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-xl-${COMP_XL_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_xl_X.cmake echo @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index c715d1c25..09e192fa5 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_pgi.sh 21.1" diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh index 631f8ef5c..d8a718229 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -26,7 +26,7 @@ COMP_ARCH=$3 COMP_CLANG_VER=$4 shift 4 -BUILD_SUFFIX=lc_blueos-spectrum${COMP_MPI_VER}-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-spectrum-${COMP_MPI_VER}-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo @@ -49,6 +49,7 @@ cmake \ -DENABLE_MPI=On \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh b/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh new file mode 100755 index 000000000..dd71dcc62 --- /dev/null +++ b/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 4 ]]; then + echo + echo "You must pass 4 arguments to the script (in this order): " + echo " 1) compiler version number for spectrum mpi" + echo " 2) compiler version number for nvcc (number only, not 'sm_70' for example)" + echo " 3) CUDA compute architecture" + echo " 4) compiler version number for gcc. 
" + echo + echo "For example: " + echo " blueos_spectrum_nvcc_gcc.sh rolling-release 10.2.89 70 8.3.1" + exit +fi + +COMP_MPI_VER=$1 +COMP_NVCC_VER=$2 +COMP_ARCH=$3 +COMP_GCC_VER=$4 +shift 4 + +BUILD_SUFFIX=lc_blueos-spectrum-${COMP_MPI_VER}-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.20.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_CXX_COMPILER=/usr/tce/packages/spectrum-mpi/spectrum-mpi-${COMP_MPI_VER}-gcc-${COMP_GCC_VER}/bin/mpig++ \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=On \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -n4 ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 5d30ab1ea..9729db57e 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_xl.sh 2021.03.31" diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 5f972f0dc..559c59900 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_xl_omptarget.sh 2022.08.19" diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh new file mode 100755 index 000000000..6dbeb9ee5 --- /dev/null +++ b/scripts/lc-builds/corona_sycl.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 1 ]]; then + echo + echo "You must pass 1 argument to the script: " + echo " 1) SYCL compiler installation path" + echo + echo "For example: " + echo " corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1" + exit +fi + +SYCL_PATH=$1 +shift 1 + +BUILD_SUFFIX=corona-sycl +: ${BUILD_TYPE:=RelWithDebInfo} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/corona_sycl.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX}_${USER} >/dev/null +mkdir build_${BUILD_SUFFIX}_${USER} && cd build_${BUILD_SUFFIX}_${USER} + +DATE=$(printf '%(%Y-%m-%d)T\n' -1) + +export PATH=${SYCL_PATH}/bin:$PATH +export LD_LIBRARY_PATH=${SYCL_PATH}/lib:${SYCL_PATH}/lib64:$LD_LIBRARY_PATH + +## NOTE: RAJA tests are turned off due to compilation issues. + +cmake \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DSYCL_LIB_PATH:STRING="${SYCL_PATH}/lib" \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_OPENMP=Off \ + -DENABLE_CUDA=Off \ + -DRAJA_ENABLE_TARGET_OPENMP=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DRAJA_ENABLE_SYCL=On \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_LINKER=clang++ \ + -DBLT_CXX_STD=c++17 \ + -DENABLE_TESTS=On \ + -DENABLE_EXAMPLES=On \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX}_${USER} and run make to build RAJA" +echo +echo "To run RAJA tests, exercises, etc. with the build, please do the following:" +echo +echo " 1) Load the ROCm module version matching the version in the compiler path" +echo " you passed to this script." +echo +echo " 2) Prefix the LD_LIBRARY_PATH environment variable with " +echo " SYCL_PATH/lib:SYCL_PATH/lib64" +echo +echo " where SYCL_PATH is set to the compiler installation path you passed" +echo " to this script (using the proper command for your shell)." +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_hipcc.sh b/scripts/lc-builds/toss3_hipcc.sh deleted file mode 100755 index b5d9b2760..000000000 --- a/scripts/lc-builds/toss3_hipcc.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env bash - -############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
-# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### - -if [[ $# -ne 2 ]]; then - echo - echo "You must pass 2 arguments to the script (in this order): " - echo " 1) compiler version number" - echo " 2) HIP compute architecture" - echo - echo "For example: " - echo " toss3_hipcc.sh 5.1.0 gfx906" - exit -fi - -COMP_VER=$1 -COMP_ARCH=$2 -shift 2 - -HIP_CLANG_FLAGS="--offload-arch=${COMP_ARCH}" -HOSTCONFIG="hip_3_X" - -if [[ ${COMP_VER} == 4.5.* ]] -then - HIP_CLANG_FLAGS="${HIP_CLANG_FLAGS} -mllvm -amdgpu-fixed-function-abi=1" - HOSTCONFIG="hip_4_5_link_X" -elif [[ ${COMP_VER} == 4.* ]] -then - HOSTCONFIG="hip_4_link_X" -elif [[ ${COMP_VER} == 3.* ]] -then - HOSTCONFIG="hip_3_X" -else - echo "Unknown hip version, using ${HOSTCONFIG} host-config" -fi - -BUILD_SUFFIX=lc_toss3-hipcc-${COMP_VER}-${COMP_ARCH} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/hip_link_X.cmake - -echo -echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" -echo "Configuration extra arguments:" -echo " $@" -echo - -rm -rf build_${BUILD_SUFFIX} >/dev/null -mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} - - -module load cmake/3.23.1 - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ - -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ - -DHIP_CLANG_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ - -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang \ - -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang++ \ - -DHIP_CLANG_FLAGS="${HIP_CLANG_FLAGS}" \ - -DBLT_CXX_STD=c++14 \ - -C ${RAJA_HOSTCONFIG} \ - -DENABLE_HIP=ON \ - -DENABLE_OPENMP=OFF \ - -DENABLE_CUDA=OFF \ - -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ - "$@" \ - .. - -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_pgi.sh b/scripts/lc-builds/toss3_pgi.sh deleted file mode 100755 index 9967dd769..000000000 --- a/scripts/lc-builds/toss3_pgi.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJAPERF/COPYRIGHT file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### - -if [ "$1" == "" ]; then - echo - echo "You must pass a compiler version number to script. For example," - echo " toss3_pgi.sh 20.1" - exit -fi - -COMP_VER=$1 -shift 1 - -BUILD_SUFFIX=lc_toss3-pgi-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/pgi_X.cmake - -echo -echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" -echo "Configuration extra arguments:" -echo " $@" -echo - -rm -rf build_${BUILD_SUFFIX} 2>/dev/null -mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} - -module load cmake/3.20.2 - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgc++ \ - -DCMAKE_C_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgcc \ - -DBLT_CXX_STD=c++14 \ - -C ${RAJA_HOSTCONFIG} \ - -DENABLE_OPENMP=On \ - -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ - "$@" \ - .. 
- -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 7d2de5397..c571e568d 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -15,7 +15,7 @@ if [[ $# -lt 2 ]]; then echo " 3...) optional arguments to cmake" echo echo "For example: " - echo " toss4_amdclang.sh 5.1.0 gfx906" + echo " toss4_amdclang.sh 5.7.0 gfx906" exit fi @@ -44,6 +44,12 @@ echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in i echo "Configuration extra arguments:" echo " $@" echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo rm -rf build_${BUILD_SUFFIX} >/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} @@ -53,23 +59,28 @@ module load cmake/3.23.1 # unload rocm to avoid configuration problems where the loaded rocm and COMP_VER # are inconsistent causing the rocprim from the module to be used unexpectedly -module unload rocm +# module unload rocm +if [[ ${COMP_VER} =~ .*magic.* ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" +else + ROCM_PATH="/usr/tce/packages/rocmcc-tce/rocmcc-${COMP_VER}" +fi cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ - -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ - -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ - -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ - -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH=${ROCM_PATH}/llvm/bin \ + -DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/amdclang \ + -DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/amdclang++ \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ - -DENABLE_OPENMP=OFF \ + -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ @@ -78,7 +89,7 @@ cmake \ echo echo "***********************************************************************" echo -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJAPerf" echo echo " Please note that you have to have a consistent build environment" echo " when you make RAJA as cmake may reconfigure; unload the rocm module" diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh new file mode 100755 index 000000000..015416e8e --- /dev/null +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, 
Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP compute architecture" + echo " 3...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_amdclang_asan.sh 5.7.0 gfx90a" + exit +fi + +COMP_VER=$1 +COMP_ARCH=$2 +shift 2 + +HOSTCONFIG="hip_3_X" + +if [[ ${COMP_VER} == 4.* ]] +then +##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1" + HOSTCONFIG="hip_4_link_X" +elif [[ ${COMP_VER} == 3.* ]] +then + HOSTCONFIG="hip_3_X" +else + echo "Unknown hip version, using ${HOSTCONFIG} host-config" +fi + +BUILD_SUFFIX=lc_toss4-amdclang-${COMP_VER}-${COMP_ARCH}-asan +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.23.1 + +# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER +# are inconsistent causing the rocprim from the module to be used unexpectedly +# module unload rocm + +if [[ ${COMP_VER} =~ .*magic.* ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" +else + ROCM_PATH="/usr/tce/packages/rocmcc-tce/rocmcc-${COMP_VER}" +fi + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH=${ROCM_PATH}/llvm/bin \ + -DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/amdclang \ + -DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/amdclang++ \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}:xnack+" \ + -DGPU_TARGETS="${COMP_ARCH}:xnack+" \ + -DAMDGPU_TARGETS="${COMP_ARCH}:xnack+" \ + -DCMAKE_C_FLAGS="-fsanitize=address -shared-libsan" \ + -DCMAKE_CXX_FLAGS="-fsanitize=address -shared-libsan" \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=ON \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJAPerf" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; load the appropriate" +echo " rocm and rocmcc modules (${COMP_VER}) when building." 
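An aside on the ROCM_PATH selection that both toss4 amdclang scripts above now share: the regex test keys off a "magic" suffix in the compiler version string to choose between the rocmcc and rocmcc-tce package trees. A minimal sketch of how the branch resolves, assuming a hypothetical version string:

    # Illustrative only: how the ROCM_PATH branch above resolves.
    COMP_VER="5.7.0-magic"   # hypothetical version string
    if [[ ${COMP_VER} =~ .*magic.* ]]; then
      ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}"
    else
      ROCM_PATH="/usr/tce/packages/rocmcc-tce/rocmcc-${COMP_VER}"
    fi
    echo "${ROCM_PATH}"   # -> /usr/tce/packages/rocmcc/rocmcc-5.7.0-magic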
+echo +echo " module load rocm/COMP_VER rocmcc/COMP_VER" +echo " srun -n1 make" +echo +echo " Run with these environment options when using asan" +echo " ASAN_OPTIONS=print_suppressions=0:detect_leaks=0" +echo " HSA_XNACK=1" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh new file mode 100755 index 000000000..072443ff8 --- /dev/null +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 3 ]]; then + echo + echo "You must pass 3 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP version" + echo " 3) HIP compute architecture" + echo " 4...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_cce_hip.sh 14.0.3 5.2.3 gfx90a" + exit +fi + +COMP_VER=$1 +HIP_VER=$2 +HIP_ARCH=$3 +shift 3 + +HOSTCONFIG="hip_3_X" + +BUILD_SUFFIX=lc_toss4-cce-${COMP_VER}-hip-${HIP_VER}-${HIP_ARCH} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.24.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER="/usr/tce/packages/cce-tce/cce-${COMP_VER}/bin/craycc" \ + -DCMAKE_CXX_COMPILER="/usr/tce/packages/cce-tce/cce-${COMP_VER}/bin/crayCC" \ + -DHIP_PATH=/opt/rocm-${HIP_VER}/hip \ + -DCMAKE_HIP_ARCHITECTURES=${HIP_ARCH} \ + -DGPU_TARGETS=${HIP_ARCH} \ + -DAMDGPU_TARGETS=${HIP_ARCH} \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=ON \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; load the appropriate" +echo " cce module (${COMP_VER}) when building." +echo +echo " module load cce-tce/${COMP_VER}" +echo " srun -n1 make" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_clang-mpi_caliper.sh b/scripts/lc-builds/toss4_clang-mpi_caliper.sh new file mode 100755 index 000000000..d3f4eb4bf --- /dev/null +++ b/scripts/lc-builds/toss4_clang-mpi_caliper.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 3 ]]; then + echo + echo "You must pass 3 arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) path to caliper cmake directory" + echo " 3) path to adiak cmake directory" + echo + echo "For example: " + echo " toss4_clang-mpi_caliper.sh 14.0.6 /usr/workspace/wsb/asde/caliper-quartz/share/cmake/caliper /usr/workspace/wsb/asde/caliper-quartz/lib/cmake/adiak" + exit +fi + +COMP_VER=$1 +CALI_DIR=$2 +ADIAK_DIR=$3 +shift 3 + +BUILD_SUFFIX=lc_toss4-clang-mpi-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DCMAKE_C_FLAGS="-g -O0" \ + -DCMAKE_CXX_FLAGS="-g -O0" \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_clang.sh b/scripts/lc-builds/toss4_clang.sh similarity index 65% rename from scripts/lc-builds/toss3_clang.sh rename to scripts/lc-builds/toss4_clang.sh index 7406363bc..64b11c012 100755 --- a/scripts/lc-builds/toss3_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -1,24 +1,24 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," - echo " toss3_clang.sh 10.0.1" + echo " toss4_clang.sh 10.3.1" exit fi COMP_VER=$1 shift 1 -BUILD_SUFFIX=lc_toss3-clang-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake +BUILD_SUFFIX=lc_toss4-clang-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -29,7 +29,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ @@ -40,8 +40,3 @@ cmake \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
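For orientation, a typical end-to-end use of the new toss4_clang-mpi_caliper.sh script might look like the sketch below; the Caliper and Adiak install prefixes are placeholders, and CALI_CONFIG=runtime-report is ordinary Caliper usage rather than something this patch adds:

    # Illustrative only: configure, build, and run with a Caliper report.
    ./scripts/lc-builds/toss4_clang-mpi_caliper.sh 14.0.6 \
        ${CALIPER_PREFIX}/share/cmake/caliper ${ADIAK_PREFIX}/lib/cmake/adiak
    cd build_lc_toss4-clang-mpi-14.0.6 && make -j
    CALI_CONFIG=runtime-report srun -n2 ./bin/raja-perf.exe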
- -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh index 273390561..89ece7b23 100755 --- a/scripts/lc-builds/toss4_clang_caliper.sh +++ b/scripts/lc-builds/toss4_clang_caliper.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number" @@ -25,7 +25,7 @@ ADIAK_DIR=$3 shift 3 BUILD_SUFFIX=lc_toss4-clang-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -36,7 +36,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.21.1 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 614f2caec..db9cafa5c 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) @@ -52,6 +52,25 @@ echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in i echo "Configuration extra arguments:" echo " $@" echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo +echo "To work around some issues where *_FUSED kernels crash add these options" +echo " -DCMAKE_CXX_FLAGS=\"-fgpu-rdc\"" +echo " -DCMAKE_EXE_LINKER_FLAGS=\"-fgpu-rdc\"" +echo +echo "To work around some issues where *_FUSED kernels perform poorly use this environment variable" +echo " env HSA_SCRATCH_SINGLE_LIMIT=4000000000" +echo +echo "To work around some issues where the build fails with a weird error about max or fmax add these options" +echo " -DCMAKE_CXX_FLAGS=\"--hip-version={hip_version:ex=6.1.2}\"" +echo " -DCMAKE_EXE_LINKER_FLAGS=\"--hip-version={hip_version:ex=6.1.2}\"" +echo + + rm -rf build_${BUILD_SUFFIX} >/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} @@ -61,18 +80,27 @@ module load cmake/3.23.1 # unload rocm to avoid configuration problems where the loaded rocm and COMP_VER # are inconsistent causing the rocprim from the module to be used unexpectedly -module unload rocm +module unload rocm rocmcc +if [[ "${COMP_VER}" == *-magic ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" + MPI_ROCM_PATH="/usr/tce/packages/cray-mpich/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}" +else + ROCM_PATH="/opt/rocm-${COMP_VER}" + MPI_ROCM_PATH=/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER} +fi cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DMPI_C_COMPILER="/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}/bin/mpiamdclang" \ - -DMPI_CXX_COMPILER="/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}/bin/mpiamdclang++" \ - -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ - -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ - -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ - -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ - -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ + -DMPI_C_COMPILER="${MPI_ROCM_PATH}/bin/mpiamdclang" \ + -DMPI_CXX_COMPILER="${MPI_ROCM_PATH}/bin/mpiamdclang++" \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/lib/cmake" \ + -DHIP_PLATFORM=amd \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH="${ROCM_PATH}/llvm/bin" \ + -DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/amdclang" \ + -DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/amdclang++" \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ @@ -98,10 +126,10 @@ echo echo " module unload rocm" echo " srun -n1 make" echo -echo " Please note that cray-mpich requires libmodules.so.1 from cce to run." +echo " Please note that rocm requires libpgmath.so from rocm/llvm to run." echo " Until this is handled transparently in the build system you may add " -echo " cce to your LD_LIBRARY_PATH." +echo " rocm/llvm to your LD_LIBRARY_PATH." 
echo -echo " export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/cce-tce/cce-13.0.2/cce/x86_64/lib/" +echo " export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm-${COMP_VER}/llvm/lib" echo echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_mvapich2_gcc.sh b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh similarity index 54% rename from scripts/lc-builds/toss3_mvapich2_gcc.sh rename to scripts/lc-builds/toss4_gcc-mpi_caliper.sh index 8c9e0662c..62389ea73 100755 --- a/scripts/lc-builds/toss3_mvapich2_gcc.sh +++ b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh @@ -1,25 +1,31 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 3 ]]; then echo - echo "You must pass a compiler version number to script. For example," - echo " toss3_mvapich2_gcc.sh 2.3 10.2.1" + echo "You must pass 3 arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) path to caliper cmake directory" + echo " 3) path to adiak cmake directory" + echo + echo "For example: " + echo " toss4_gcc-mpi_caliper.sh 10.3.1 /usr/workspace/wsb/asde/caliper-quartz/share/cmake/caliper /usr/workspace/wsb/asde/caliper-quartz/lib/cmake/adiak" exit fi -MPI_VER=$1 -COMP_VER=$2 -shift 2 +COMP_VER=$1 +CALI_DIR=$2 +ADIAK_DIR=$3 +shift 3 -BUILD_SUFFIX=lc_toss3-mvapich2-${MPI_VER}-gcc-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake +BUILD_SUFFIX=lc_toss4-gcc-mpi-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/gcc_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -30,28 +36,26 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DMPI_CXX_COMPILER=/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-gcc-${COMP_VER}/bin/mpic++ \ + -DCMAKE_C_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/gcc \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ - -DENABLE_MPI=On \ + -DENABLE_MPI=ON \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DCMAKE_C_FLAGS="-g -O0" \ + -DCMAKE_CXX_FLAGS="-g -O0" \ "$@" \ .. 
echo echo "***********************************************************************" -echo echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo -echo " Please note that you have to run with mpi when you run" -echo " the RAJA Perf Suite; for example," -echo -echo " srun -n2 ./bin/raja-perf.exe" -echo echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_gcc.sh b/scripts/lc-builds/toss4_gcc.sh similarity index 65% rename from scripts/lc-builds/toss3_gcc.sh rename to scripts/lc-builds/toss4_gcc.sh index 4e7bf6bc1..1d0a98af7 100755 --- a/scripts/lc-builds/toss3_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -1,24 +1,24 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," - echo " toss3_gcc.sh 8.3.1" + echo " toss4_gcc.sh 10.3.1" exit fi COMP_VER=$1 shift 1 -BUILD_SUFFIX=lc_toss3-gcc-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake +BUILD_SUFFIX=lc_toss4-gcc-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/gcc_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -29,7 +29,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ @@ -40,8 +40,3 @@ cmake \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. - -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_gcc_caliper.sh b/scripts/lc-builds/toss4_gcc_caliper.sh index 11fd22605..dad854b59 100755 --- a/scripts/lc-builds/toss4_gcc_caliper.sh +++ b/scripts/lc-builds/toss4_gcc_caliper.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number" @@ -36,7 +36,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.21.1 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh new file mode 100755 index 000000000..71642e1f1 --- /dev/null +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP compute architecture" + echo " 3...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_hipcc.sh 4.1.0 gfx906" + exit +fi + +COMP_VER=$1 +COMP_ARCH=$2 +shift 2 + +HOSTCONFIG="hip_3_X" + +if [[ ${COMP_VER} == 4.* ]] +then +##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1" + HOSTCONFIG="hip_4_link_X" +elif [[ ${COMP_VER} == 3.* ]] +then + HOSTCONFIG="hip_3_X" +else + echo "Unknown hip version, using ${HOSTCONFIG} host-config" +fi + +BUILD_SUFFIX=lc_toss4-hipcc-${COMP_VER}-${COMP_ARCH} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.23.1 + +# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER +# are inconsistent causing the rocprim from the module to be used unexpectedly +module unload rocm + + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ + -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ + -DHIP_PATH=/opt/rocm-${COMP_VER}/bin \ + -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \ + -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ + -DGPU_TARGETS="${COMP_ARCH}" \ + -DAMDGPU_TARGETS="${COMP_ARCH}" \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=ON \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; unload the rocm module" +echo " or load the appropriate rocm module (${COMP_VER}) when building."
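Stepping back to the version switch near the top of toss4_hipcc.sh: the fall-through default means an unrecognized ROCm version still gets the hip_3_X host-config, just with a warning. A small runnable sketch of the selection for a few sample (hypothetical) versions:

    # Illustrative only: replicate the script's host-config selection.
    for v in 3.9.0 4.1.0 5.7.0; do
      HC="hip_3_X"
      if [[ ${v} == 4.* ]]; then
        HC="hip_4_link_X"
      elif [[ ${v} == 3.* ]]; then
        HC="hip_3_X"
      else
        echo "Unknown hip version, using ${HC} host-config"
      fi
      echo "${v} -> ${HC}"   # 3.9.0 -> hip_3_X, 4.1.0 -> hip_4_link_X, 5.7.0 -> hip_3_X
    done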
+echo +echo " module unload rocm" +echo " srun -n1 make" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh new file mode 100755 index 000000000..dc042a369 --- /dev/null +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 1 ]]; then + echo + echo "You must pass a compiler version number to script. For example," + echo " toss4_icpc-classic.sh 19.1.2" + exit +fi + +COMP_VER=$1 +shift 1 + +BUILD_SUFFIX=lc_toss4-icpc-classic-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpc-classic_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. +## + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you may need to add some intel openmp libraries to your" +echo " LD_LIBRARY_PATH to run with openmp." +echo +echo " LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel/intel-${COMP_VER}/compiler/lib/intel64_lin" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss4_icpc.sh similarity index 66% rename from scripts/lc-builds/toss3_icpc.sh rename to scripts/lc-builds/toss4_icpc.sh index a8b7de2b9..77d81605f 100755 --- a/scripts/lc-builds/toss3_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -1,32 +1,24 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," - echo " toss3_icpc.sh 19.1.0" + echo " toss4_icpc.sh 2022.3" exit fi COMP_VER=$1 shift 1 -COMP_MAJOR_VER=${COMP_VER:0:2} -GCC_HEADER_VER=7 - -if [ ${COMP_MAJOR_VER} -gt 18 ] -then - GCC_HEADER_VER=8 -fi - -BUILD_SUFFIX=lc_toss3-icpc-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/icpc_X_gcc${GCC_HEADER_VER}headers.cmake +BUILD_SUFFIX=lc_toss4-icpc-${COMP_VER} +RAJA_HOST_CONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpc_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -37,10 +29,10 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 ## -# CMake option -DENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile # times at a potential cost of slower 'forall' execution. ## @@ -50,6 +42,7 @@ cmake \ -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ @@ -57,5 +50,12 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you may need to add some intel openmp libraries to your" +echo " LD_LIBRARY_PATH to run with openmp." +echo +echo " LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel/intel-${COMP_VER}/compiler/lib/intel64_lin" +echo echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh new file mode 100755 index 000000000..0a89683c3 --- /dev/null +++ b/scripts/lc-builds/toss4_icpx.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 1 ]]; then + echo + echo "You must pass a compiler version number to script. For example," + echo " toss4_icpx.sh 2022.1.0" + exit +fi + +COMP_VER=$1 +shift 1 + +BUILD_SUFFIX=lc_toss4-icpx-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpx_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. 
+## + +source /usr/tce/packages/intel/intel-${COMP_VER}/setvars.sh + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icx \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. diff --git a/scripts/lc-builds/toss4_mvapich2_icpx.sh b/scripts/lc-builds/toss4_mvapich2_icpx.sh new file mode 100755 index 000000000..def610fb2 --- /dev/null +++ b/scripts/lc-builds/toss4_mvapich2_icpx.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) mvapich2 compiler version number" + echo " 2) icpx compiler version number" + echo " 3...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_mvapich2_icpx.sh 2.3.7 2022.1.0" + exit +fi + +MPI_VER=$1 +COMP_VER=$2 +shift 2 + +BUILD_SUFFIX=lc_toss4-mvapich2-${MPI_VER}-icpx-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpx_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. +## + +source /usr/tce/packages/intel/intel-${COMP_VER}/setvars.sh + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_C_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicc" \ + -DMPI_CXX_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicxx" \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icx \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + ..
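Before building, it can be worth confirming that the mvapich2 wrapper and the icpx install the script points at actually agree; mpicxx -show (a standard MPICH-family option) prints the underlying compiler command. The versions below are hypothetical:

    # Illustrative only: sanity-check the toolchain pairing configured above.
    MPI_VER=2.3.7
    COMP_VER=2022.1.0
    /usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicxx -show
    /usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx --version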
+ +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " srun -n2 ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/make_release_tarball.sh b/scripts/make_release_tarball.sh index cd86cdc80..1956d0436 100755 --- a/scripts/make_release_tarball.sh +++ b/scripts/make_release_tarball.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/travis_build_and_test.sh b/scripts/travis_build_and_test.sh index 5ca692a49..027d41ed7 100755 --- a/scripts/travis_build_and_test.sh +++ b/scripts/travis_build_and_test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/ubuntu-builds/ubuntu_clang.sh b/scripts/ubuntu-builds/ubuntu_clang.sh index 68b722774..7ddba9a7d 100755 --- a/scripts/ubuntu-builds/ubuntu_clang.sh +++ b/scripts/ubuntu-builds/ubuntu_clang.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " ubuntu_clang.sh 10" @@ -22,6 +22,8 @@ RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/ubuntu-builds/clang_X.cmake echo echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null @@ -39,5 +41,5 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory ${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" echo "***********************************************************************" diff --git a/scripts/ubuntu-builds/ubuntu_gcc.sh b/scripts/ubuntu-builds/ubuntu_gcc.sh index 04c57fce7..e40c65482 100755 --- a/scripts/ubuntu-builds/ubuntu_gcc.sh +++ b/scripts/ubuntu-builds/ubuntu_gcc.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " ubuntu_gcc.sh 8" @@ -22,6 +22,8 @@ RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/ubuntu-builds/gcc_X.cmake echo echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null @@ -39,5 +41,5 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory ${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" echo "***********************************************************************" diff --git a/scripts/update_copyright.sh b/scripts/update_copyright.sh index d3bdeb170..527e42d43 100755 --- a/scripts/update_copyright.sh +++ b/scripts/update_copyright.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -17,7 +17,8 @@ # as well. # # IMPORTANT: Since this file is not modified (it is running the shell -# script commands), you must EDIT THE COPYRIGHT DATES ABOVE MANUALLY. +# script commands), you must EDIT THE COPYRIGHT DATES IN THE HEADER ABOVE +# MANUALLY. # # Edit the 'find' command below to change the set of files that will be # modified. @@ -46,18 +47,18 @@ for i in `cat files2change` do echo $i cp $i $i.sed.bak - sed "s/Copyright (c) 2017-22/Copyright (c) 2017-23/" $i.sed.bak > $i + sed "s/Copyright (c) 2017-23/Copyright (c) 2017-24/" $i.sed.bak > $i done echo LICENSE cp LICENSE LICENSE.sed.bak -sed "s/Copyright (c) 2017-2022/Copyright (c) 2017-2023/" LICENSE.sed.bak > LICENSE +sed "s/Copyright (c) 2017-2023/Copyright (c) 2017-2024/" LICENSE.sed.bak > LICENSE -for i in RELEASE README.md +for i in RELEASE README.md docs/conf.py do echo $i cp $i $i.sed.bak - sed "s/2017-22/2017-23/" $i.sed.bak > $i + sed "s/2017-23/2017-24/" $i.sed.bak > $i done #============================================================================= diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 24e0a0815..f60d14744 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# @@ -18,8 +18,9 @@ add_subdirectory(polybench) add_subdirectory(stream) add_subdirectory(stream-kokkos) add_subdirectory(algorithm) +add_subdirectory(comm) -set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS +set(RAJA_PERFSUITE_LIBS common apps basic @@ -29,7 +30,9 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS polybench stream stream-kokkos - algorithm) + algorithm + comm) +set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_LIBS}) list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) if(RAJA_ENABLE_TARGET_OPENMP) @@ -57,12 +60,6 @@ blt_add_executable( apps/PRESSURE.cpp apps/PRESSURE-Seq.cpp apps/PRESSURE-OMPTarget.cpp - apps/HALOEXCHANGE.cpp - apps/HALOEXCHANGE-Seq.cpp - apps/HALOEXCHANGE-OMPTarget.cpp - apps/HALOEXCHANGE_FUSED.cpp - apps/HALOEXCHANGE_FUSED-Seq.cpp - apps/HALOEXCHANGE_FUSED-OMPTarget.cpp apps/LTIMES.cpp apps/LTIMES-Seq.cpp apps/LTIMES-OMPTarget.cpp @@ -75,6 +72,9 @@ blt_add_executable( apps/MASS3DPA.cpp apps/MASS3DPA-Seq.cpp apps/MASS3DPA-OMPTarget.cpp + apps/MATVEC_3D_STENCIL.cpp + apps/MATVEC_3D_STENCIL-Seq.cpp + apps/MATVEC_3D_STENCIL-OMPTarget.cpp apps/NODAL_ACCUMULATION_3D.cpp apps/NODAL_ACCUMULATION_3D-Seq.cpp apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -138,6 +138,9 @@ blt_add_executable( basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp + basic/MULTI_REDUCE.cpp + basic/MULTI_REDUCE-Seq.cpp + basic/MULTI_REDUCE-OMPTarget.cpp lcals/DIFF_PREDICT.cpp lcals/DIFF_PREDICT-Seq.cpp lcals/DIFF_PREDICT-OMPTarget.cpp @@ -248,6 +251,28 @@ blt_add_executable( algorithm/MEMCPY.cpp algorithm/MEMCPY-Seq.cpp algorithm/MEMCPY-OMPTarget.cpp + algorithm/ATOMIC.cpp + algorithm/ATOMIC-Seq.cpp + algorithm/ATOMIC-OMPTarget.cpp + algorithm/HISTOGRAM.cpp + algorithm/HISTOGRAM-Seq.cpp + algorithm/HISTOGRAM-OMPTarget.cpp + comm/HALO_base.cpp + comm/HALO_PACKING.cpp + comm/HALO_PACKING-Seq.cpp + comm/HALO_PACKING-OMPTarget.cpp + comm/HALO_PACKING_FUSED.cpp + comm/HALO_PACKING_FUSED-Seq.cpp + comm/HALO_PACKING_FUSED-OMPTarget.cpp + comm/HALO_SENDRECV.cpp + comm/HALO_SENDRECV-Seq.cpp + comm/HALO_SENDRECV-OMPTarget.cpp + comm/HALO_EXCHANGE.cpp + comm/HALO_EXCHANGE-Seq.cpp + comm/HALO_EXCHANGE-OMPTarget.cpp + comm/HALO_EXCHANGE_FUSED.cpp + comm/HALO_EXCHANGE_FUSED-Seq.cpp + comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS} ) install( TARGETS raja-perf-omptarget.exe @@ -264,4 +289,7 @@ blt_add_executable( install( TARGETS raja-perf.exe RUNTIME DESTINATION bin ) +install( TARGETS ${RAJA_PERFSUITE_LIBS} + LIBRARY DESTINATION lib + ) endif() diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index 3ce688d29..7aa549262 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/ATOMIC-Cuda.cpp b/src/algorithm/ATOMIC-Cuda.cpp new file mode 100644 index 000000000..a286c60d2 --- /dev/null +++ b/src/algorithm/ATOMIC-Cuda.cpp @@ -0,0 +1,338 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include +#include + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +const size_t warp_size = 32; + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_thread(Real_ptr atomic, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i, ATOMIC_VALUE); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_warp(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage warp_reduce_storage; + val = WarpReduce(warp_reduce_storage).Sum(val); + if ((threadIdx.x % warp_size) == 0) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i/warp_size, val); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_block(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage block_reduce_storage; + val = BlockReduce(block_reduce_storage).Sum(val); + if (threadIdx.x == 0) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, blockIdx.x, val); + } +} + + +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_thread), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i, ATOMIC_VALUE); + }); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateWarp(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_warp), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << 
std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_block), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +void ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + if ( vid == Base_CUDA ) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateWarp(vid); + + } + + t += 1; + + } + + }); + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateBlock(vid); + + } + + t += 1; + + } + + }); + + } + + } + + }); + + } else { + + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void ATOMIC::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + if ( vid == Base_CUDA ) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_warp_"+std::to_string(block_size)); + + } + + }); + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_block_"+std::to_string(block_size)); + + } + + }); + + } + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/algorithm/ATOMIC-Hip.cpp 
b/src/algorithm/ATOMIC-Hip.cpp new file mode 100644 index 000000000..fbb103596 --- /dev/null +++ b/src/algorithm/ATOMIC-Hip.cpp @@ -0,0 +1,338 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include +#include + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +const size_t warp_size = 64; + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_thread(Real_ptr atomic, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i, ATOMIC_VALUE); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_warp(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using WarpReduce = rocprim::warp_reduce; + __shared__ typename WarpReduce::storage_type warp_reduce_storage; + WarpReduce().reduce(val, val, warp_reduce_storage); + if ((threadIdx.x % warp_size) == 0) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i/warp_size, val); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_block(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using BlockReduce = rocprim::block_reduce; + __shared__ typename BlockReduce::storage_type block_reduce_storage; + BlockReduce().reduce(val, val, block_reduce_storage); + if (threadIdx.x == 0) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, blockIdx.x, val); + } +} + + +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_thread), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i, ATOMIC_VALUE); + }); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateWarp(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + 
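+      // Warp-aggregated tuning: each in-bounds thread contributes ATOMIC_VALUE,
+      // the warp-level reduction sums those values across the 64-lane wavefront,
+      // and only lane 0 issues a global atomicAdd, cutting atomic traffic by
+      // roughly a factor of warp_size.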
startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_warp), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_block), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +void ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + if ( vid == Base_HIP ) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateWarp(vid); + + } + + t += 1; + + } + + }); + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateBlock(vid); + + } + + t += 1; + + } + + }); + + } + + } + + }); + + } else { + + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void ATOMIC::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + if ( vid == Base_HIP ) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_warp_"+std::to_string(block_size)); + + } + + }); + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() 
== 0u ||
+                run_params.validAtomicReplication(replication)) {
+
+              addVariantTuningName(vid, "replicate_"+std::to_string(replication)+
+                                        "_block_"+std::to_string(block_size));
+
+            }
+
+          });
+
+        }
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_HIP
diff --git a/src/algorithm/ATOMIC-OMP.cpp b/src/algorithm/ATOMIC-OMP.cpp
new file mode 100644
index 000000000..ae3863bb1
--- /dev/null
+++ b/src/algorithm/ATOMIC-OMP.cpp
@@ -0,0 +1,153 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+
+template < size_t replication >
+void ATOMIC::runOpenMPVariantReplicate(VariantID vid)
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  switch ( vid ) {
+
+    case Base_OpenMP : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          #pragma omp atomic
+          ATOMIC_BODY(i, ATOMIC_VALUE);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case Lambda_OpenMP : {
+
+      auto atomic_base_lam = [=](Index_type i) {
+        #pragma omp atomic
+        ATOMIC_BODY(i, ATOMIC_VALUE);
+      };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          atomic_base_lam(i);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_OpenMP : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::forall<RAJA::omp_parallel_for_exec>(
+          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+          ATOMIC_RAJA_BODY(RAJA::omp_atomic, i, ATOMIC_VALUE);
+        });
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    default : {
+      getCout() << "\n  ATOMIC : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+#else
+  RAJA_UNUSED_VAR(vid);
+#endif
+}
+
+
+void ATOMIC::runOpenMPVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runOpenMPVariantReplicate<replication>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown OpenMP variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setOpenMPTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
diff --git a/src/algorithm/ATOMIC-OMPTarget.cpp b/src/algorithm/ATOMIC-OMPTarget.cpp
new file mode 100644
index 000000000..2c7bb7203
--- /dev/null
+++ b/src/algorithm/ATOMIC-OMPTarget.cpp
@@ -0,0 +1,127 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+
+#include "common/OpenMPTargetDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+  //
+  // Define threads per team for target execution
+  //
+  const size_t threads_per_team = 256;
+
+template < size_t replication >
+void ATOMIC::runOpenMPTargetVariantReplicate(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  if ( vid == Base_OpenMPTarget ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      #pragma omp target is_device_ptr(atomic)
+      #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1)
+      for (Index_type i = ibegin; i < iend; ++i ) {
+        #pragma omp atomic
+        ATOMIC_BODY(i, ATOMIC_VALUE);
+      }
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_OpenMPTarget ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+        ATOMIC_RAJA_BODY(RAJA::omp_atomic, i, ATOMIC_VALUE);
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  ATOMIC : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+}
+
+void ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) {
+
+    seq_for(gpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runOpenMPTargetVariantReplicate<replication>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown OMP Target variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) {
+
+    seq_for(gpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_TARGET_OPENMP
diff --git a/src/algorithm/ATOMIC-Seq.cpp b/src/algorithm/ATOMIC-Seq.cpp
new file mode 100644
index 000000000..1cccb8a6b
--- /dev/null
+++ b/src/algorithm/ATOMIC-Seq.cpp
@@ -0,0 +1,146 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+
+template < size_t replication >
+void ATOMIC::runSeqVariantReplicate(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          ATOMIC_BODY(i, ATOMIC_VALUE);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto atomic_base_lam = [=](Index_type i) {
+        ATOMIC_BODY(i, ATOMIC_VALUE);
+      };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          atomic_base_lam(i);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+          [=](Index_type i) {
+          ATOMIC_RAJA_BODY(RAJA::seq_atomic, i, ATOMIC_VALUE);
+        });
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif
+
+    default : {
+      getCout() << "\n  ATOMIC : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+}
+
+
+void ATOMIC::runSeqVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_Seq || vid == Lambda_Seq || vid == RAJA_Seq ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runSeqVariantReplicate<replication>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown Seq variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setSeqTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_Seq || vid == Lambda_Seq || vid == RAJA_Seq ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp
new file mode 100644
index 000000000..8da1c2421
--- /dev/null
+++ b/src/algorithm/ATOMIC.cpp
@@ -0,0 +1,80 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace algorithm +{ + + +ATOMIC::ATOMIC(const RunParams& params) + : KernelBase(rajaperf::Algorithm_ATOMIC, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setFLOPsPerRep(getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( Lambda_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( Lambda_HIP ); + setVariantDefined( RAJA_HIP ); +} + +ATOMIC::~ATOMIC() +{ +} + +void ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + m_init = 0; + m_final = -static_cast(vid); +} + +void ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += static_cast(m_final); +} + +void ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/ATOMIC.hpp b/src/algorithm/ATOMIC.hpp new file mode 100644 index 000000000..55ab41ad8 --- /dev/null +++ b/src/algorithm/ATOMIC.hpp @@ -0,0 +1,113 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// ATOMIC kernel reference implementation: +/// Test atomic throughput with an amount of replication known at compile time. 
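+/// Updates are spread across 'replication' counters, so contention can be
+/// tuned from a single hot location (replication == 1) down to nearly none;
+/// the per-counter totals are summed back into the kernel checksum at
+/// teardown. The RAJA variants express the same update as, for example:
+///
+///   RAJA::atomicAdd<RAJA::seq_atomic>(&atomic[i % replication], 1.0);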
+/// +/// for (Index_type i = 0; i < N; ++i ) { +/// atomic[i%replication] += 1; +/// } +/// + +#ifndef RAJAPerf_Algorithm_ATOMIC_HPP +#define RAJAPerf_Algorithm_ATOMIC_HPP + +#define ATOMIC_DATA_SETUP(replication) \ + Real_type init = m_init; \ + Real_ptr atomic; \ + allocAndInitDataConst(atomic, replication, init, vid); + +#define ATOMIC_DATA_TEARDOWN(replication) \ + { \ + auto reset_atomic = scopedMoveData(atomic, replication, vid); \ + m_final = init; \ + for (size_t r = 0; r < replication; ++r ) { \ + m_final += atomic[r]; \ + } \ + } \ + deallocData(atomic, vid); + +#define ATOMIC_VALUE 1.0 + +#define ATOMIC_BODY(i, val) \ + atomic[(i)%replication] += (val) + +#define ATOMIC_RAJA_BODY(policy, i, val) \ + RAJA::atomicAdd(&atomic[(i)%replication], (val)) + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class ATOMIC : public KernelBase +{ +public: + + ATOMIC(const RunParams& params); + + ~ATOMIC(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + template < size_t replication > + void runSeqVariantReplicate(VariantID vid); + template < size_t replication > + void runOpenMPVariantReplicate(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateGlobal(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateGlobal(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateWarp(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateWarp(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateBlock(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateBlock(VariantID vid); + template < size_t replication > + void runOpenMPTargetVariantReplicate(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + static const size_t default_cpu_atomic_replication = 64; + using cpu_atomic_replications_type = integer::make_atomic_replication_list_type; + static const size_t default_atomic_replication = 4096; + using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; + + Real_type m_init; + Real_type m_final; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt index 54334242e..515c35baa 100644 --- a/src/algorithm/CMakeLists.txt +++ b/src/algorithm/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, 
Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -30,6 +30,7 @@ blt_add_library( REDUCE_SUM-Cuda.cpp REDUCE_SUM-OMP.cpp REDUCE_SUM-OMPTarget.cpp + REDUCE_SUM-Sycl.cpp MEMSET.cpp MEMSET-Seq.cpp MEMSET-Hip.cpp @@ -42,5 +43,17 @@ blt_add_library( MEMCPY-Cuda.cpp MEMCPY-OMP.cpp MEMCPY-OMPTarget.cpp + ATOMIC.cpp + ATOMIC-Seq.cpp + ATOMIC-Hip.cpp + ATOMIC-Cuda.cpp + ATOMIC-OMP.cpp + ATOMIC-OMPTarget.cpp + HISTOGRAM.cpp + HISTOGRAM-Seq.cpp + HISTOGRAM-Hip.cpp + HISTOGRAM-Cuda.cpp + HISTOGRAM-OMP.cpp + HISTOGRAM-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp new file mode 100644 index 000000000..0bc363ee3 --- /dev/null +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -0,0 +1,397 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "cub/device/device_histogram.cuh" +#include "cub/util_allocator.cuh" + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +constexpr Index_type warp_size = 32; + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) +{ + if (shared_replication > 0) { + + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_counts[t] = HISTOGRAM::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); + RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_counts[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], block_sum); + } + } + + } else { + + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], HISTOGRAM::Data_type(1)); + } + } +} + + +void HISTOGRAM::runCudaVariantLibrary(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + HISTOGRAM_DATA_SETUP; + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, 
num_bins, 1); + + RAJAPERF_UNUSED_VAR(counts_init); + + if ( vid == Base_CUDA ) { + + cudaStream_t stream = res.get_stream(); + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); + + // Allocate temporary storage + unsigned char* temp_storage; + allocData(DataSpace::CudaDevice, temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, 1); + HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, 1); + + } + stopTimer(); + + // Free temporary storage + deallocData(DataSpace::CudaDevice, temp_storage); + + } else { + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + +} + + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > +void HISTOGRAM::runCudaVariantAtomicRuntime(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + HISTOGRAM_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + auto* func = &histogram_atomic_runtime; + + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; + + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + + const Index_type max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + + RPlaunchCudaKernel( func, + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type count_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + count_final += hcounts[offset]; + } + counts_final[bin] = count_final; + } + + } + stopTimer(); + + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + + } else if ( vid == 
RAJA_CUDA ) { + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< + RAJA::cuda::MultiReduceTuning< + RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + } + +} + + +void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + runCudaVariantLibrary(vid); + + } + + t += 1; + + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + } + + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + }); + + } + + }); + + }); + + } + + }); + + } else { + + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + + addVariantTuningName(vid, "cub"); + + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); + + }); + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // 
RAJA_ENABLE_CUDA diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp new file mode 100644 index 000000000..5a25bca5c --- /dev/null +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -0,0 +1,426 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#if defined(__HIPCC__) +#define ROCPRIM_HIP_API 1 +#include "rocprim/device/device_histogram.hpp" +#elif defined(__CUDACC__) +#include "cub/device/device_histogram.cuh" +#include "cub/util_allocator.cuh" +#endif + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +constexpr Index_type warp_size = 64; + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) +{ + if (shared_replication > 0) { + + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_counts[t] = HISTOGRAM::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); + RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_counts[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], block_sum); + } + } + + } else { + + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], HISTOGRAM::Data_type(1)); + } + } +} + + +void HISTOGRAM::runHipVariantLibrary(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + HISTOGRAM_DATA_SETUP; + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, 1); + + RAJAPERF_UNUSED_VAR(counts_init); + + if ( vid == Base_HIP ) { + + hipStream_t stream = res.get_stream(); + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) + hipErrchk(::rocprim::histogram_even(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + len, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + stream)); +#elif defined(__CUDACC__) + 
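+      // On an NVIDIA back end, HIP builds fall back to CUB. Note the argument
+      // order: rocprim::histogram_even takes the sample count right after the
+      // input pointer, while cub's HistogramEven takes it after the bin bounds.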
cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); +#endif + + // Allocate temporary storage + unsigned char* temp_storage; + allocData(DataSpace::HipDevice, temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run +#if defined(__HIPCC__) + hipErrchk(::rocprim::histogram_even(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + len, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + stream)); +#elif defined(__CUDACC__) + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); +#endif + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, 1); + HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, 1); + + } + stopTimer(); + + // Free temporary storage + deallocData(DataSpace::HipDevice, temp_storage); + + } else { + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + } + + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + +} + + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > +void HISTOGRAM::runHipVariantAtomicRuntime(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + HISTOGRAM_DATA_SETUP; + + if ( vid == Base_HIP ) { + + auto* func = &histogram_atomic_runtime; + + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; + + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + + const Index_type max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + + RPlaunchHipKernel( func, + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type count_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + count_final += hcounts[offset]; + } + counts_final[bin] = count_final; + } + + } + stopTimer(); + + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + + } else if ( vid == RAJA_HIP ) { + + using exec_policy = 
std::conditional_t, + RAJA::hip_exec_occ_calc>; + + using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< + RAJA::hip::MultiReduceTuning< + RAJA::hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + } + +} + + +void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + runHipVariantLibrary(vid); + + } + + t += 1; + + } + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + } + + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + }); + + } + + }); + + }); + + } + + }); + + } else { + + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void HISTOGRAM::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP ) { + + addVariantTuningName(vid, "rocprim"); + + } + + if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); + + }); + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git 
a/src/algorithm/HISTOGRAM-OMP.cpp b/src/algorithm/HISTOGRAM-OMP.cpp new file mode 100644 index 000000000..87b554b47 --- /dev/null +++ b/src/algorithm/HISTOGRAM-OMP.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void HISTOGRAM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + HISTOGRAM_SETUP_COUNTS; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + HISTOGRAM_BODY; + } + + HISTOGRAM_FINALIZE_COUNTS; + + } + stopTimer(); + + HISTOGRAM_TEARDOWN_COUNTS; + + break; + } + + case Lambda_OpenMP : { + + HISTOGRAM_SETUP_COUNTS; + + auto histogram_base_lam = [=](Index_type i) { + #pragma omp atomic + HISTOGRAM_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + histogram_base_lam(i); + } + + HISTOGRAM_FINALIZE_COUNTS; + + } + stopTimer(); + + HISTOGRAM_TEARDOWN_COUNTS; + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(RAJA::omp_multi_reduce); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(RAJA::omp_multi_reduce); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HISTOGRAM : Unknown variant id = " << vid << std::endl; + } + + } + + HISTOGRAM_DATA_TEARDOWN; + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM-OMPTarget.cpp b/src/algorithm/HISTOGRAM-OMPTarget.cpp new file mode 100644 index 000000000..033f309c9 --- /dev/null +++ b/src/algorithm/HISTOGRAM-OMPTarget.cpp @@ -0,0 +1,68 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
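The OpenMP variants above put #pragma omp atomic on the increment because several threads can land in the same bin at once. A minimal standalone sketch of the same pattern, outside the Suite's macro and timer scaffolding (names here are illustrative):

    #include <vector>
    #include <cstdio>

    int main()
    {
      const int N = 1000000, num_bins = 64;

      std::vector<int> bins(N);
      for (int i = 0; i < N; ++i) bins[i] = i % num_bins;  // a simple bin assignment

      std::vector<unsigned long long> counts(num_bins, 0);
      unsigned long long* c = counts.data();
      const int* b = bins.data();

      #pragma omp parallel for
      for (int i = 0; i < N; ++i) {
        // without the atomic, threads hitting the same bin race and drop counts
        #pragma omp atomic
        c[b[i]] += 1;
      }

      std::printf("counts[0] = %llu (expected %d)\n", c[0], N / num_bins);
      return 0;
    }

Dropping the atomic leaves the loop racing on c[b[i]], and the totals come out short under contention.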
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void HISTOGRAM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(counts, counts_init, num_bins); + + #pragma omp target is_device_ptr(counts, bins) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + HISTOGRAM_BODY; + } + + getOpenMPDeviceData(counts_final, counts, num_bins); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown OMP Target variant id = " << vid << std::endl; + } + + HISTOGRAM_DATA_TEARDOWN; + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/algorithm/HISTOGRAM-Seq.cpp b/src/algorithm/HISTOGRAM-Seq.cpp new file mode 100644 index 000000000..e41ab171e --- /dev/null +++ b/src/algorithm/HISTOGRAM-Seq.cpp @@ -0,0 +1,114 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
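The target variant needs the same atomic on the device; is_device_ptr tells the compiler the pointers are already device-resident. A self-contained sketch that maps the arrays with plain OpenMP map clauses instead of the Suite's initOpenMPDeviceData/getOpenMPDeviceData helpers (a simplification, not the Suite's memory management):

    #include <vector>
    #include <cstdio>

    int main()
    {
      const int N = 100000, num_bins = 16;

      std::vector<int> bins(N);
      for (int i = 0; i < N; ++i) bins[i] = i % num_bins;
      std::vector<unsigned long long> counts(num_bins, 0);

      int* b = bins.data();
      unsigned long long* c = counts.data();

      // map the inputs to the device and the counts back; the Suite instead
      // keeps device copies alive across reps via its own data helpers
      #pragma omp target teams distribute parallel for \
          map(to: b[0:N]) map(tofrom: c[0:num_bins]) thread_limit(256)
      for (int i = 0; i < N; ++i) {
        #pragma omp atomic
        c[b[i]] += 1;
      }

      std::printf("counts[0] = %llu (expected %d)\n", c[0], N / num_bins);
      return 0;
    }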
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void HISTOGRAM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + HISTOGRAM_SETUP_COUNTS; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS; + + for (Index_type i = ibegin; i < iend; ++i ) { + HISTOGRAM_BODY; + } + + HISTOGRAM_FINALIZE_COUNTS; + + } + stopTimer(); + + HISTOGRAM_TEARDOWN_COUNTS; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + HISTOGRAM_SETUP_COUNTS; + + auto histogram_base_lam = [=](Index_type i) { + HISTOGRAM_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS; + + for (Index_type i = ibegin; i < iend; ++i ) { + histogram_base_lam(i); + } + + HISTOGRAM_FINALIZE_COUNTS; + + } + stopTimer(); + + HISTOGRAM_TEARDOWN_COUNTS; + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(RAJA::seq_multi_reduce); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(RAJA::seq_multi_reduce); + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n HISTOGRAM : Unknown variant id = " << vid << std::endl; + } + + } + + HISTOGRAM_DATA_TEARDOWN; + +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp new file mode 100644 index 000000000..60ad2975e --- /dev/null +++ b/src/algorithm/HISTOGRAM.cpp @@ -0,0 +1,151 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
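Across all of these backends, the RAJA variants funnel through RAJA::MultiReduceSum, seeded from counts_init and drained with get_all, per the HISTOGRAM_INIT_COUNTS_RAJA/HISTOGRAM_FINALIZE_COUNTS_RAJA macros defined later in HISTOGRAM.hpp. A sequential sketch, assuming the multi-reduce interface behaves as this diff exercises it:

    #include "RAJA/RAJA.hpp"
    #include <vector>
    #include <cstdio>

    int main()
    {
      const RAJA::Index_type N = 1000;
      const int num_bins = 8;

      std::vector<int> bins(N);
      for (RAJA::Index_type i = 0; i < N; ++i) bins[i] = static_cast<int>(i % num_bins);
      const int* b = bins.data();

      std::vector<unsigned long long> counts_init(num_bins, 0);
      std::vector<unsigned long long> counts_final(num_bins, 0);

      // one reducer object managing num_bins running sums, seeded from a container
      RAJA::MultiReduceSum<RAJA::seq_multi_reduce, unsigned long long> counts(counts_init);

      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
        [=](RAJA::Index_type i) {
          counts[b[i]] += 1;   // the same shape as HISTOGRAM_BODY
        });

      counts.get_all(counts_final);  // drain all bins at once, as the macro does

      for (int k = 0; k < num_bins; ++k) {
        std::printf("bin %d: %llu\n", k, counts_final[k]);
      }
      return 0;
    }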
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +HISTOGRAM::HISTOGRAM(const RunParams& params) + : KernelBase(rajaperf::Algorithm_HISTOGRAM, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + m_num_bins = params.getMultiReduceNumBins(); + m_bin_assignment_algorithm = params.getMultiReduceBinAssignmentAlgorithm(); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins + + 1*sizeof(Index_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(1 * getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); +} + +HISTOGRAM::~HISTOGRAM() +{ +} + +void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocData(m_bins, getActualProblemSize(), vid); + { + auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); + + const bool init_random_per_iterate = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Random); + const bool init_random_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsRandomSizes); + const bool init_even_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsEvenSizes); + const bool init_all_one = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Single); + + if (init_even_sizes || init_random_sizes || init_all_one) { + Real_ptr data = nullptr; + if (init_even_sizes) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(b+1) / m_num_bins; + } + } else if (init_random_sizes) { + allocAndInitDataRandValue(data, m_num_bins, Base_Seq); + std::sort(data, data+m_num_bins); + } else if (init_all_one) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(0); + } + } + + Index_type actual_prob_size = getActualProblemSize(); + Index_type bin = 0; + for (Index_type i = 0; i < actual_prob_size; ++i) { + Real_type pos = static_cast(i) / actual_prob_size; + while (bin+1 < m_num_bins && pos >= data[bin]) { + bin += 1; + } + m_bins[i] = bin; + } + + deallocData(data, Base_Seq); + + } else if (init_random_per_iterate) { + Real_ptr data; + allocAndInitDataRandValue(data, getActualProblemSize(), Base_Seq); + + for (Index_type i = 0; i < getActualProblemSize(); ++i) { + m_bins[i] = static_cast(data[i] * m_num_bins); + if (m_bins[i] >= m_num_bins) { + m_bins[i] = m_num_bins - 1; + } + if (m_bins[i] < 0) { + m_bins[i] = 0; + } + } + + deallocData(data, Base_Seq); + } else { + throw 1; + } + } + + m_counts_init.resize(m_num_bins, 0); + m_counts_final.resize(m_num_bins, 0); +} + +void HISTOGRAM::updateChecksum(VariantID vid, size_t 
tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_counts_final.data(), m_num_bins, vid); +} + +void HISTOGRAM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_bins, vid); + m_counts_init.clear(); m_counts_init.shrink_to_fit(); + m_counts_final.clear(); m_counts_final.shrink_to_fit(); +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp new file mode 100644 index 000000000..2752f2c92 --- /dev/null +++ b/src/algorithm/HISTOGRAM.hpp @@ -0,0 +1,146 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HISTOGRAM kernel reference implementation: +/// +/// Index_type* counts = calloc(num_bins, sizeof(Index_type)); +/// for (Index_type i = 0; i < N; ++i ) { +/// counts[bins[i]] += 1; +/// } +/// + +#ifndef RAJAPerf_Algorithm_HISTOGRAM_HPP +#define RAJAPerf_Algorithm_HISTOGRAM_HPP + +#define HISTOGRAM_DATA_SETUP \ + Index_type num_bins = m_num_bins; \ + Index_ptr bins = m_bins; \ + std::vector& counts_init = m_counts_init; \ + std::vector& counts_final = m_counts_final; + +#define HISTOGRAM_DATA_TEARDOWN + + +#define HISTOGRAM_SETUP_COUNTS \ + Data_ptr counts; \ + allocData(getReductionDataSpace(vid), counts, num_bins); + +#define HISTOGRAM_TEARDOWN_COUNTS \ + deallocData(counts, vid); + +#define HISTOGRAM_INIT_COUNTS \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + counts[b] = counts_init[b]; \ + } + +#define HISTOGRAM_FINALIZE_COUNTS \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + counts_final[b] = counts[b]; \ + } + +#define HISTOGRAM_INIT_COUNTS_RAJA(policy) \ + RAJA::MultiReduceSum counts(counts_init); + +#define HISTOGRAM_FINALIZE_COUNTS_RAJA(policy) \ + counts.get_all(counts_final); + +#define HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, replication) \ + for (Index_type b = 0; b < (num_bins); ++b) { \ + Data_type count_final = 0; \ + for (size_t r = 0; r < (replication); ++r) { \ + count_final += (hcounts)[HISTOGRAM_GPU_BIN_INDEX(b, r, replication)]; \ + } \ + counts_final[b] = count_final; \ + } + + +#define HISTOGRAM_BODY \ + counts[bins[i]] += static_cast(1); + +#define HISTOGRAM_RAJA_BODY(policy) \ + RAJA::atomicAdd(&counts[bins[i]], static_cast(1)); + +#define HISTOGRAM_GPU_BIN_INDEX(bin, offset, replication) \ + ((bin)*(replication) + ((offset)%(replication))) + +#define HISTOGRAM_GPU_RAJA_BODY(policy, counts, index, value) \ + RAJA::atomicAdd(&(counts)[(index)], (value)); + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class HISTOGRAM : public KernelBase +{ +public: + using Data_type = unsigned long long; + using Data_ptr = Data_type*; + + HISTOGRAM(const RunParams& params); + + ~HISTOGRAM(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void 
runKokkosVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + void runCudaVariantLibrary(VariantID vid); + void runHipVariantLibrary(VariantID vid); + + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > + void runCudaVariantAtomicRuntime(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > + void runHipVariantAtomicRuntime(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + static const size_t default_cuda_atomic_global_replication = 2; + static const size_t default_cuda_atomic_shared_replication = 16; + using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + + static const size_t default_hip_atomic_global_replication = 32; + static const size_t default_hip_atomic_shared_replication = 4; + using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + + Index_type m_num_bins; + RunParams::BinAssignmentAlgorithm m_bin_assignment_algorithm; + Index_ptr m_bins; + std::vector m_counts_init; + std::vector m_counts_final; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/algorithm/MEMCPY-Cuda.cpp b/src/algorithm/MEMCPY-Cuda.cpp index fca6848f8..9f0fda034 100644 --- a/src/algorithm/MEMCPY-Cuda.cpp +++ b/src/algorithm/MEMCPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
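HISTOGRAM_GPU_BIN_INDEX lays the counts array out as num_bins groups of `replication` slots, so that concurrent atomics on one logical bin spread over distinct addresses; HISTOGRAM_GPU_FINALIZE_COUNTS then folds each group back into a single count. A host-only demonstration of that layout and the fold (note the atomic-runtime path in HISTOGRAM-Hip.cpp instead folds a replica-major layout, bin + r*num_bins, and uses RAJA::power_of_2_mod rather than plain %):

    #include <vector>
    #include <cstdio>

    // mirrors HISTOGRAM_GPU_BIN_INDEX from HISTOGRAM.hpp:
    // replica r of bin b lives at b*replication + (r % replication)
    inline int bin_index(int bin, int offset, int replication)
    {
      return bin * replication + (offset % replication);
    }

    int main()
    {
      const int num_bins = 4, replication = 8, N = 10000;
      std::vector<unsigned long long> slots(num_bins * replication, 0);

      // simulate "threads" scattering increments across replicas;
      // on the GPU the offset is derived from the thread index
      for (int i = 0; i < N; ++i) {
        int bin = i % num_bins;
        int offset = i;  // stand-in for a per-thread id
        slots[bin_index(bin, offset, replication)] += 1;
      }

      // finalize: fold the replicas of each bin, as in HISTOGRAM_GPU_FINALIZE_COUNTS
      for (int b = 0; b < num_bins; ++b) {
        unsigned long long total = 0;
        for (int r = 0; r < replication; ++r) {
          total += slots[bin_index(b, r, replication)];
        }
        std::printf("bin %d: %llu\n", b, total);
      }
      return 0;
    }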
// @@ -48,7 +48,9 @@ void MEMCPY::runCudaVariantLibrary(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync(MEMCPY_STD_ARGS, cudaMemcpyDefault, res.get_stream()) ); + cudaErrchk( cudaMemcpyAsync(MEMCPY_STD_ARGS, + cudaMemcpyDefault, + res.get_stream()) ); } stopTimer(); @@ -89,9 +91,11 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - memcpy<<>>( - x, y, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (memcpy), + grid_size, block_size, + shmem, res.get_stream(), + x, y, iend ); } stopTimer(); @@ -107,9 +111,12 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, memcpy_lambda ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memcpy_lambda ); } stopTimer(); diff --git a/src/algorithm/MEMCPY-Hip.cpp b/src/algorithm/MEMCPY-Hip.cpp index d0c239a67..0e880c1b4 100644 --- a/src/algorithm/MEMCPY-Hip.cpp +++ b/src/algorithm/MEMCPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -48,7 +48,9 @@ void MEMCPY::runHipVariantLibrary(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync(MEMCPY_STD_ARGS, hipMemcpyDefault, res.get_stream()) ); + hipErrchk( hipMemcpyAsync(MEMCPY_STD_ARGS, + hipMemcpyDefault, + res.get_stream()) ); } stopTimer(); @@ -89,10 +91,11 @@ void MEMCPY::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL( (memcpy), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, y, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (memcpy), + grid_size, block_size, + shmem, res.get_stream(), + x, y, iend ); } stopTimer(); @@ -108,10 +111,12 @@ void MEMCPY::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, memcpy_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memcpy_lambda ); } stopTimer(); diff --git a/src/algorithm/MEMCPY-OMP.cpp b/src/algorithm/MEMCPY-OMP.cpp index 55b63afd6..184f897bf 100644 --- a/src/algorithm/MEMCPY-OMP.cpp +++ b/src/algorithm/MEMCPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
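These MEMCPY hunks swap raw <<<...>>> and hipLaunchKernelGGL launches for RPlaunchCudaKernel/RPlaunchHipKernel, folding the per-call-site cudaGetLastError/hipGetLastError check into one place. The wrapper's definition is not part of this diff; a hypothetical HIP-side equivalent might look like the following (launchHipKernel, hipCheck, and their signatures are assumptions, not the Suite's actual helpers):

    #include <hip/hip_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define hipCheck(call)                                                    \
      do {                                                                    \
        hipError_t err = (call);                                              \
        if (err != hipSuccess) {                                              \
          std::fprintf(stderr, "HIP error: %s\n", hipGetErrorString(err));    \
          std::exit(1);                                                       \
        }                                                                     \
      } while (0)

    // hypothetical wrapper in the spirit of RPlaunchHipKernel: one place to
    // launch and to check for launch errors, instead of a check at every site
    template <typename Kernel, typename... Args>
    void launchHipKernel(Kernel kernel, size_t grid_size, size_t block_size,
                         size_t shmem, hipStream_t stream, Args... args)
    {
      hipLaunchKernelGGL(kernel, dim3(grid_size), dim3(block_size),
                         shmem, stream, args...);
      hipCheck(hipGetLastError());
    }

    __global__ void fill(double* x, double v, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] = v;  // tail guard for the rounded-up grid
    }

    int main()
    {
      const int n = 1 << 20;
      double* x = nullptr;
      hipCheck(hipMalloc(&x, n * sizeof(double)));

      launchHipKernel(fill, (n + 255) / 256, 256, 0, hipStream_t{}, x, 2.0, n);

      hipCheck(hipDeviceSynchronize());
      hipCheck(hipFree(x));
      return 0;
    }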
// diff --git a/src/algorithm/MEMCPY-OMPTarget.cpp b/src/algorithm/MEMCPY-OMPTarget.cpp index 4f4932793..0b3536d42 100644 --- a/src/algorithm/MEMCPY-OMPTarget.cpp +++ b/src/algorithm/MEMCPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMCPY-Seq.cpp b/src/algorithm/MEMCPY-Seq.cpp index 02a24668f..57c3f219f 100644 --- a/src/algorithm/MEMCPY-Seq.cpp +++ b/src/algorithm/MEMCPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index 49446a265..f8ced7ac7 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ MEMCPY::MEMCPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp index 9fa46ae9e..b6cd49038 100644 --- a/src/algorithm/MEMCPY.hpp +++ b/src/algorithm/MEMCPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -71,7 +71,7 @@ class MEMCPY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/algorithm/MEMSET-Cuda.cpp b/src/algorithm/MEMSET-Cuda.cpp index bca349509..d0c60e97d 100644 --- a/src/algorithm/MEMSET-Cuda.cpp +++ b/src/algorithm/MEMSET-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
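Splitting the old setBytesPerRep into read/written/atomic components makes the traffic model explicit: MEMCPY moves each element once in and once out, so a rep touches 2*sizeof(Real_type)*N bytes with no atomic traffic. A back-of-the-envelope check, assuming Real_type is double, a representative N, and a made-up 1 ms rep time:

    #include <cstdio>

    int main()
    {
      const long long N = 1000000;  // a representative problem size

      // MEMCPY per-rep traffic under the new accounting
      const long long bytes_read    = static_cast<long long>(sizeof(double)) * N;  // x read once
      const long long bytes_written = static_cast<long long>(sizeof(double)) * N;  // y written once
      const long long bytes_atomic  = 0;                                           // none

      // the bandwidth this implies for a hypothetical 1 ms rep
      const double seconds = 1.0e-3;
      const double gb_per_s =
          double(bytes_read + bytes_written + bytes_atomic) / seconds / 1.0e9;

      std::printf("traffic/rep = %lld bytes -> %.1f GB/s at 1 ms/rep\n",
                  bytes_read + bytes_written, gb_per_s);
      return 0;
    }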
// @@ -89,11 +89,11 @@ void MEMSET::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - memset<<>>( x, - val, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (memset), + grid_size, block_size, + shmem, res.get_stream(), + x, val, iend ); } stopTimer(); @@ -109,9 +109,12 @@ void MEMSET::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, memset_lambda ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memset_lambda ); } stopTimer(); diff --git a/src/algorithm/MEMSET-Hip.cpp b/src/algorithm/MEMSET-Hip.cpp index d0dacd545..c838aed28 100644 --- a/src/algorithm/MEMSET-Hip.cpp +++ b/src/algorithm/MEMSET-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -89,10 +89,11 @@ void MEMSET::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL( (memset), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, val, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (memset), + grid_size, block_size, + shmem, res.get_stream(), + x, val, iend ); } stopTimer(); @@ -108,10 +109,12 @@ void MEMSET::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, memset_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memset_lambda ); } stopTimer(); diff --git a/src/algorithm/MEMSET-OMP.cpp b/src/algorithm/MEMSET-OMP.cpp index ebd931e4d..66a6e027c 100644 --- a/src/algorithm/MEMSET-OMP.cpp +++ b/src/algorithm/MEMSET-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET-OMPTarget.cpp b/src/algorithm/MEMSET-OMPTarget.cpp index ec6d9c716..cee5a8577 100644 --- a/src/algorithm/MEMSET-OMPTarget.cpp +++ b/src/algorithm/MEMSET-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
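Every launch in these hunks sizes the grid with RAJA_DIVIDE_CEILING_INT(iend, block_size), rounding up so a partial final block still covers the tail, which the kernels then guard with an i < iend test. The macro's definition lives in the Suite's common headers, not in this diff; the idiom itself is just:

    // ceiling division for non-negative sizes, in the spirit of RAJA_DIVIDE_CEILING_INT
    constexpr long long divide_ceiling(long long n, long long d)
    {
      return (n + d - 1) / d;
    }

    static_assert(divide_ceiling(1000000, 256) == 3907,
                  "3906 full blocks plus one partial block");
    static_assert(divide_ceiling(1024, 256) == 4,
                  "an exact multiple adds no extra block");

    int main() { return 0; }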
// diff --git a/src/algorithm/MEMSET-Seq.cpp b/src/algorithm/MEMSET-Seq.cpp index 145fd462e..3064e7cb1 100644 --- a/src/algorithm/MEMSET-Seq.cpp +++ b/src/algorithm/MEMSET-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index 95d3d5321..04ad4f52c 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,8 +28,9 @@ MEMSET::MEMSET(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (0*sizeof(Real_type) + 1*sizeof(Real_type)) + - (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/algorithm/MEMSET.hpp b/src/algorithm/MEMSET.hpp index ebf2f867b..0266c9e1a 100644 --- a/src/algorithm/MEMSET.hpp +++ b/src/algorithm/MEMSET.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -71,7 +71,7 @@ class MEMSET : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_type m_val; diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index d36614f9e..302ab35d6 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,6 +18,10 @@ #include "cub/util_allocator.cuh" #include +#include +#include +#include + namespace rajaperf { @@ -26,7 +30,7 @@ namespace algorithm template < size_t block_size > __launch_bounds__(block_size) -__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, +__global__ void reduce_sum(Real_ptr x, Real_ptr sum, Real_type sum_init, Index_type iend) { extern __shared__ Real_type psum[ ]; @@ -46,15 +50,9 @@ __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, __syncthreads(); } -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dsum, psum[ 0 ] ); - } -#else // this doesn't work due to data races if ( threadIdx.x == 0 ) { - *dsum += psum[ 0 ]; + RAJA::atomicAdd( sum, psum[ 0 ] ); } -#endif } @@ -74,8 +72,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) int len = iend - ibegin; - Real_type* sum_storage; - allocData(DataSpace::CudaPinned, sum_storage, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -83,7 +80,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, len, ::cub::Sum(), m_sum_init, @@ -102,21 +99,21 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, len, ::cub::Sum(), m_sum_init, stream)); - cudaErrchk(cudaStreamSynchronize(stream)); - m_sum = *sum_storage; + RAJAPERF_CUDA_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); // Free temporary storage deallocData(DataSpace::CudaDevice, temp_storage); - deallocData(DataSpace::CudaPinned, sum_storage); + RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum); } else { @@ -126,11 +123,10 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) } -template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlock(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -139,40 +135,68 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - Real_ptr dsum; - allocData(DataSpace::CudaDevice, dsum, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce_sum), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1, 1); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - reduce_sum<<>>( x, - dsum, m_sum_init, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - cudaErrchk( cudaMemcpyAsync( &m_sum, dsum, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + RPlaunchCudaKernel( (reduce_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, 
sum, m_sum_init, iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); - deallocData(DataSpace::CudaDevice, dsum); + RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_SUM::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_SUM_DATA_SETUP; - } else if ( vid == RAJA_CUDA ) { + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_SUM_BODY; }); @@ -190,11 +214,54 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) } +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] __device__ (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) { - if ( vid == Base_CUDA ) { + size_t t = 0; - size_t t = 0; + if ( vid == Base_CUDA ) { if (tune_idx == t) { @@ -204,39 +271,59 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) t += 1; + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - setBlockSize(block_size); - runCudaVariantBlock(vid); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_CUDA ) { - t += 1; + if (tune_idx == t) { - } + setBlockSize(block_size); + runCudaVariantBase(vid); - }); + } - } else if ( vid == RAJA_CUDA ) { + t += 1; - size_t t = 0; + } else if ( vid == RAJA_CUDA ) { - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runCudaVariantRAJA(vid); - runCudaVariantBlock(vid); + } - } + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } - t += 1; + t += 1; + + } + + }); } @@ -256,31 +343,53 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, "cub"); + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { 
+ seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_CUDA ) { - }); + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - } else if ( vid == RAJA_CUDA ) { + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + } else if ( vid == RAJA_CUDA ) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); } }); } + } } // end namespace algorithm diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 88a16f331..831978015 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
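The reduce_sum kernel in both the CUDA and HIP files is a two-phase scheme: a shared-memory tree reduction inside each block, then thread 0 publishes the block's partial with RAJA::atomicAdd. The deleted #else branch shows the motivation, since an unguarded *dsum += psum[0] races across blocks. A host-only model of the two phases, with std::atomic standing in for the device atomic:

    #include <atomic>
    #include <vector>
    #include <cstdio>

    int main()
    {
      const int num_blocks = 64, block_size = 256;
      std::vector<double> x(num_blocks * block_size, 1.0);

      std::atomic<long long> total{0};  // scaled to integers so fetch_add is exact

      for (int b = 0; b < num_blocks; ++b) {
        // phase 1: intra-block tree reduction; active "threads" pairwise-combine,
        // halving the stride each step, as in the shared-memory loop
        std::vector<double> psum(x.begin() + b * block_size,
                                 x.begin() + (b + 1) * block_size);
        for (int stride = block_size / 2; stride > 0; stride /= 2) {
          for (int t = 0; t < stride; ++t) psum[t] += psum[t + stride];
        }
        // phase 2: one "thread" per block publishes atomically,
        // mirroring RAJA::atomicAdd(sum, psum[0])
        total.fetch_add(static_cast<long long>(psum[0]));
      }

      std::printf("sum = %lld (expected %d)\n", total.load(), num_blocks * block_size);
      return 0;
    }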
// @@ -23,6 +23,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -31,7 +35,7 @@ namespace algorithm template < size_t block_size > __launch_bounds__(block_size) -__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, +__global__ void reduce_sum(Real_ptr x, Real_ptr sum, Real_type sum_init, Index_type iend) { HIP_DYNAMIC_SHARED(Real_type, psum); @@ -51,15 +55,9 @@ __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, __syncthreads(); } -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dsum, psum[ 0 ] ); - } -#else // this doesn't work due to data races if ( threadIdx.x == 0 ) { - *dsum += psum[ 0 ]; + RAJA::atomicAdd( sum, psum[ 0 ] ); } -#endif } @@ -79,8 +77,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) int len = iend - ibegin; - Real_type* sum_storage; - allocData(DataSpace::HipPinned, sum_storage, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -89,7 +86,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::rocprim::reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, m_sum_init, len, rocprim::plus(), @@ -98,7 +95,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, len, ::cub::Sum(), m_sum_init, @@ -119,7 +116,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::rocprim::reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, m_sum_init, len, rocprim::plus(), @@ -128,22 +125,22 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, len, ::cub::Sum(), m_sum_init, stream)); #endif - hipErrchk(hipStreamSynchronize(stream)); - m_sum = *sum_storage; + RAJAPERF_HIP_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); // Free temporary storage deallocData(DataSpace::HipDevice, temp_storage); - deallocData(DataSpace::HipPinned, sum_storage); + RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum); } else { @@ -153,11 +150,10 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) } -template < size_t block_size > -void REDUCE_SUM::runHipVariantBlock(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -166,39 +162,68 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - Real_ptr dsum; - allocData(DataSpace::HipDevice, dsum, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce_sum), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - 
constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - x, dsum, m_sum_init, iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (reduce_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, sum, m_sum_init, iend ); - hipErrchk( hipMemcpyAsync( &m_sum, dsum, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); - deallocData(DataSpace::HipDevice, dsum); + RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_SUM::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_SUM_DATA_SETUP; - } else if ( vid == RAJA_HIP ) { + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); - RAJA::forall< RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_SUM_BODY; }); @@ -216,11 +241,54 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) } +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] __device__ (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) { - if ( vid == Base_HIP ) { + size_t t = 0; - size_t t = 0; + if ( vid == Base_HIP ) { if (tune_idx == t) { @@ -230,39 +298,59 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) t += 1; + } + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - setBlockSize(block_size); - runHipVariantBlock(vid); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_HIP ) { - t += 1; + if (tune_idx == t) { - } + setBlockSize(block_size); + runHipVariantBase(vid); - }); + } - } else if ( vid == RAJA_HIP ) { + t += 1; - size_t t = 0; + } else if ( vid == RAJA_HIP ) { - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + if 
(tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runHipVariantRAJA(vid); - runHipVariantBlock(vid); + } - } + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); - t += 1; + } + + t += 1; + + } + + }); } @@ -286,25 +374,45 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) addVariantTuningName(vid, "cub"); #endif + } + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_HIP ) { - }); + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - } else if ( vid == RAJA_HIP ) { + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + } else if ( vid == RAJA_HIP ) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + }); } diff --git a/src/algorithm/REDUCE_SUM-OMP.cpp b/src/algorithm/REDUCE_SUM-OMP.cpp index 49d0d766e..1295887f5 100644 --- a/src/algorithm/REDUCE_SUM-OMP.cpp +++ b/src/algorithm/REDUCE_SUM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
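The tuning machinery in runHipVariant/setHipTuningDefinitions walks compile-time lists (block sizes, mapping helpers, replication factors) with seq_for, registering one tuning name per combination and instantiating the matching template when tune_idx lines up. seq_for itself is a Suite utility; a stand-in for the pattern using std::index_sequence and a C++17 fold:

    #include <cstdio>
    #include <cstddef>
    #include <type_traits>
    #include <utility>

    // minimal stand-in for the Suite's seq_for over a compile-time value list
    template <typename F, std::size_t... Vs>
    void for_each_value(std::index_sequence<Vs...>, F&& f)
    {
      (f(std::integral_constant<std::size_t, Vs>{}), ...);  // invoked in order
    }

    template <std::size_t block_size>
    void run_variant()
    {
      std::printf("running tuning with block_size=%zu\n", block_size);
    }

    int main()
    {
      using block_sizes = std::index_sequence<128, 256, 512>;

      std::size_t t = 0;
      const std::size_t tune_idx = 1;  // pretend the harness asked for tuning 1

      for_each_value(block_sizes{}, [&](auto bs) {
        if (t == tune_idx) {
          run_variant<decltype(bs)::value>();  // instantiate exactly one combination
        }
        t += 1;  // every combination advances the index, chosen or not
      });
      return 0;
    }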
//
@@ -18,7 +18,7 @@
 namespace algorithm
 {
 
-void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t tune_idx)
 {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
@@ -76,21 +76,48 @@ void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune
 
     case RAJA_OpenMP : {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          RAJA::ReduceSum<RAJA::omp_reduce, Real_type> sum(m_sum_init);
+
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            [=](Index_type i) {
+              REDUCE_SUM_BODY;
+          });
+
+          m_sum = sum.get();
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
 
-      RAJA::ReduceSum<RAJA::omp_reduce, Real_type> sum(m_sum_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::forall<RAJA::omp_parallel_for_exec>(
-        RAJA::RangeSegment(ibegin, iend),
-        [=](Index_type i) {
-          REDUCE_SUM_BODY;
-      });
+          Real_type tsum = m_sum_init;
 
-      m_sum = sum.get();
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
+            [=] (Index_type i, Real_type& sum) {
+              REDUCE_SUM_BODY;
+            }
+          );
+          m_sum = static_cast<Real_type>(tsum);
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n REDUCE_SUM : Unknown OpenMP tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();
+
       break;
     }
@@ -103,8 +130,17 @@ void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune
 
 #else
   RAJA_UNUSED_VAR(vid);
+  RAJA_UNUSED_VAR(tune_idx);
 #endif
 }
 
+void REDUCE_SUM::setOpenMPTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_OpenMP) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf
diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp
index a8652099e..1c1be1ab7 100644
--- a/src/algorithm/REDUCE_SUM-OMPTarget.cpp
+++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
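Tuning 0 ("default") is the long-standing RAJA::ReduceSum object; tuning 1 ("new") passes RAJA::expt::Reduce so forall hands the lambda a reference to a private running sum. A sequential side-by-side, assuming the expt interface behaves as this diff uses it:

    #include "RAJA/RAJA.hpp"
    #include <vector>
    #include <cstdio>

    int main()
    {
      const RAJA::Index_type N = 1000;
      std::vector<double> x(N, 1.0);
      const double* xp = x.data();

      // tuning "default": reducer object captured by value, read back with get()
      RAJA::ReduceSum<RAJA::seq_reduce, double> sum(0.0);
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
        [=](RAJA::Index_type i) {
          sum += xp[i];
        });
      std::printf("default: %f\n", sum.get());

      // tuning "new": a plain variable reduced through an extra lambda argument
      double tsum = 0.0;
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
        RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
        [=](RAJA::Index_type i, double& s) {
          s += xp[i];
        });
      std::printf("new: %f\n", tsum);

      return 0;
    }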
// @@ -27,7 +27,7 @@ namespace algorithm const size_t threads_per_team = 256; -void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -56,21 +56,47 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum sum(m_sum_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE_SUM_BODY; - }); + Real_type tsum = m_sum_init; - m_sum = sum.get(); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_SUM : Unknown OMP Target tuning index = " << tune_idx << std::endl; } - stopTimer(); } else { getCout() << "\n REDUCE_SUM : Unknown OMP Target variant id = " << vid << std::endl; @@ -78,6 +104,14 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } +void REDUCE_SUM::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM-Seq.cpp b/src/algorithm/REDUCE_SUM-Seq.cpp index 8c7086057..8d4fdacb2 100644 --- a/src/algorithm/REDUCE_SUM-Seq.cpp +++ b/src/algorithm/REDUCE_SUM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
//
@@ -18,8 +18,11 @@
 namespace algorithm
 {
 
-void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void REDUCE_SUM::runSeqVariant(VariantID vid, size_t tune_idx)
 {
+#if !defined(RUN_RAJA_SEQ)
+  RAJA_UNUSED_VAR(tune_idx);
+#endif
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
@@ -73,23 +76,48 @@ void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id
 
     case RAJA_Seq : {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          RAJA::ReduceSum<RAJA::seq_reduce, Real_type> sum(m_sum_init);
+
+          RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+            [=](Index_type i) {
+              REDUCE_SUM_BODY;
+          });
+
+          m_sum = sum.get();
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
 
-      RAJA::ReduceSum<RAJA::seq_reduce, Real_type> sum(m_sum_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
-        [=](Index_type i) {
-          REDUCE_SUM_BODY;
-      });
+          Real_type tsum = m_sum_init;
 
-      m_sum = sum.get();
+          RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
+            [=] (Index_type i, Real_type& sum) {
+              REDUCE_SUM_BODY;
+            }
+          );
+          m_sum = static_cast<Real_type>(tsum);
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n REDUCE_SUM : Unknown Seq tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();
 
       break;
-    }
+    }
 #endif
 
     default : {
@@ -100,5 +128,13 @@ void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id
 
 }
 
+void REDUCE_SUM::setSeqTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_Seq) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf
diff --git a/src/algorithm/REDUCE_SUM-Sycl.cpp b/src/algorithm/REDUCE_SUM-Sycl.cpp
new file mode 100644
index 000000000..516048863
--- /dev/null
+++ b/src/algorithm/REDUCE_SUM-Sycl.cpp
@@ -0,0 +1,103 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + + +namespace rajaperf +{ +namespace algorithm +{ + +template +void REDUCE_SUM::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + Real_ptr sum; + allocAndInitSyclDeviceData(sum, &m_sum_init, 1, qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(sum, &m_sum_init, 1, qu); + + qu->submit([&] (sycl::handler& h) { + + auto sumReduction = sycl::reduction(sum, sycl::plus()); + + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sumReduction, + [=] (sycl::nd_item<1> item, auto& sum) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + REDUCE_SUM_BODY; + } + + }); + }); + + Real_type lsum; + Real_ptr plsum = &lsum; + getSyclDeviceData(plsum, sum, 1, qu); + m_sum = lsum; + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + std::cout << "\n REDUCE_SUM : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_SUM, Sycl) + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 3712f5ffa..4aebb5b0f 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
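The Base_SYCL path builds a sycl::reduction over the device-visible sum and lets parallel_for combine per-item contributions; the global range is padded up to a work-group multiple, so the kernel guards the tail with an i < iend test. A standalone USM sketch of the same shape (default-selected device, double data; details differ from the Suite's helper-managed version):

    #include <sycl/sycl.hpp>
    #include <cstdio>

    int main()
    {
      sycl::queue q;  // default device
      const size_t N = 1 << 20, wg = 256;
      const size_t global = wg * ((N + wg - 1) / wg);  // pad to a work-group multiple

      double* x   = sycl::malloc_shared<double>(N, q);
      double* sum = sycl::malloc_shared<double>(1, q);
      for (size_t i = 0; i < N; ++i) x[i] = 1.0;
      *sum = 0.0;  // the Suite reinitializes this per rep

      q.submit([&](sycl::handler& h) {
        auto red = sycl::reduction(sum, sycl::plus<double>());
        h.parallel_for(sycl::nd_range<1>(global, wg), red,
                       [=](sycl::nd_item<1> item, auto& s) {
                         size_t i = item.get_global_id(0);
                         if (i < N) s += x[i];  // guard the padded tail
                       });
      }).wait();

      std::printf("sum = %f (expected %zu)\n", *sum, N);
      sycl::free(x, q);
      sycl::free(sum, q);
      return 0;
    }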
diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp
index 3712f5ffa..4aebb5b0f 100644
--- a/src/algorithm/REDUCE_SUM.cpp
+++ b/src/algorithm/REDUCE_SUM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) +
-                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(getActualProblemSize());
 
   setUsesFeature(Forall);
@@ -51,6 +52,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 REDUCE_SUM::~REDUCE_SUM()
diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp
index ba9e9308b..c9f1a3c74 100644
--- a/src/algorithm/REDUCE_SUM.hpp
+++ b/src/algorithm/REDUCE_SUM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -58,19 +58,37 @@ class REDUCE_SUM : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
+  void setSeqTuningDefinitions(VariantID vid);
+  void setOpenMPTuningDefinitions(VariantID vid);
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setOpenMPTargetTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
+  void runCudaVariantCub(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runCudaVariantBase(VariantID vid);
+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runCudaVariantRAJA(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runCudaVariantRAJANewReduce(VariantID vid);
+
+  void runHipVariantRocprim(VariantID vid);
-  template < size_t block_size >
-  void runCudaVariantBlock(VariantID vid);
-  template < size_t block_size >
-  void runHipVariantBlock(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runHipVariantBase(VariantID vid);
+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runHipVariantRAJA(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runHipVariantRAJANewReduce(VariantID vid);
+
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Real_type m_sum_init;
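The REDUCE_SUM hunk above splits the old combined setBytesPerRep estimate into separate read/write/atomic figures. A small standalone sketch of the arithmetic, assuming a hypothetical problem size of one million doubles:

#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t N = 1000000;
  const std::size_t bytes_read    = sizeof(double) * (1 + N); // m_sum_init + x[i]
  const std::size_t bytes_written = sizeof(double) * 1;       // the single m_sum result
  std::printf("read: %zu B, written: %zu B\n", bytes_read, bytes_written);
  return 0;
}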
diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp
index 674e25f5a..977c91e24 100644
--- a/src/algorithm/SCAN-Cuda.cpp
+++ b/src/algorithm/SCAN-Cuda.cpp
@@ -16,6 +16,7 @@
 #include "cub/util_allocator.cuh"
 
 #include "common/CudaDataUtils.hpp"
+#include "common/CudaGridScan.hpp"
 
 #include <iostream>
 
@@ -24,8 +25,52 @@ namespace rajaperf
 namespace algorithm
 {
 
+template < size_t block_size >
+using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type<
+    detail::cuda::grid_scan_max_items_per_thread<Real_type, block_size>::value+1,
+    integer::LessEqual<detail::cuda::grid_scan_default_items_per_thread<Real_type, block_size>::value>>;
 
-void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+
+template < size_t block_size, size_t items_per_thread >
+__launch_bounds__(block_size)
+__global__ void scan(Real_ptr x,
+                     Real_ptr y,
+                     Real_ptr block_counts,
+                     Real_ptr grid_counts,
+                     unsigned* block_readys,
+                     Index_type iend)
+{
+  // blocks do start running in order in cuda, so a block with a higher
+  // index can wait on a block with a lower index without deadlocking
+  // (replace with an atomicInc if this changes)
+  const int block_id = blockIdx.x;
+
+  Real_type vals[items_per_thread];
+
+  for (size_t ti = 0; ti < items_per_thread; ++ti) {
+    Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x;
+    if (i < iend) {
+      vals[ti] = x[i];
+    } else {
+      vals[ti] = 0;
+    }
+  }
+
+  Real_type exclusives[items_per_thread];
+  Real_type inclusives[items_per_thread];
+  detail::cuda::GridScan<Real_type, block_size, items_per_thread>::grid_scan(
+      block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys);
+
+  for (size_t ti = 0; ti < items_per_thread; ++ti) {
+    Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x;
+    if (i < iend) {
+      y[i] = exclusives[ti];
+    }
+  }
+}
+
+
+void SCAN::runCudaVariantLibrary(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -85,16 +130,175 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::exclusive_scan< RAJA::cuda_exec<default_gpu_block_size, true /*async*/> >(res, RAJA_SCAN_ARGS);
+      RAJA::exclusive_scan< RAJA::cuda_exec<0, true /*async*/> >(res, RAJA_SCAN_ARGS);
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  SCAN : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+template < size_t block_size, size_t items_per_thread >
+void SCAN::runCudaVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  SCAN_DATA_SETUP;
+
+  if ( vid == Base_CUDA ) {
+
+    const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread);
+    const size_t shmem_size = 0;
+
+    Real_ptr block_counts;
+    allocData(DataSpace::CudaDevice, block_counts, grid_size);
+    Real_ptr grid_counts;
+    allocData(DataSpace::CudaDevice, grid_counts, grid_size);
+    unsigned* block_readys;
+    allocData(DataSpace::CudaDevice, block_readys, grid_size);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size,
+                                  res.get_stream()) );
+
+      RPlaunchCudaKernel( (scan<block_size, items_per_thread>),
+                          grid_size, block_size,
+                          shmem_size, res.get_stream(),
+                          x+ibegin, y+ibegin,
+                          block_counts, grid_counts, block_readys,
+                          iend-ibegin );
 
     }
     stopTimer();
 
+    deallocData(DataSpace::CudaDevice, block_counts);
+    deallocData(DataSpace::CudaDevice, grid_counts);
+    deallocData(DataSpace::CudaDevice, block_readys);
+
   } else {
     getCout() << "\n  SCAN : Unknown Cuda variant id = " << vid << std::endl;
   }
 }
+
+
+void SCAN::runCudaVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+
+    if (tune_idx == t) {
+
+      runCudaVariantLibrary(vid);
+
+    }
+
+    t += 1;
+
+    if ( vid == Base_CUDA ) {
+
+      seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+        if (run_params.numValidGPUBlockSize() == 0u ||
+            run_params.validGPUBlockSize(block_size)) {
+
+          using cuda_items_per_thread =
+              cuda_items_per_thread_type<block_size>;
+
+          if (camp::size<cuda_items_per_thread>::value == 0) {
+
+            if (tune_idx == t) {
+
+              runCudaVariantImpl<block_size, detail::cuda::grid_scan_default_items_per_thread<Real_type, block_size>::value
+                                >(vid);
+
+            }
+
+            t += 1;
+
+          }
+
+          seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) {
+
+            if (run_params.numValidItemsPerThread() == 0u ||
+                run_params.validItemsPerThread(block_size)) {
+
+              if (tune_idx == t) {
+
+                runCudaVariantImpl<block_size, items_per_thread>(vid);
+
+              }
+
+              t += 1;
+
+            }
+
+          });
+
+        }
+
+      });
+
+    }
+
+  } else {
+
+    getCout() << "\n  SCAN : Unknown Cuda variant id = " << vid << std::endl;
+
+  }
+}
+
+void SCAN::setCudaTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+
+    addVariantTuningName(vid, "cub");
+
+    if ( vid == Base_CUDA ) {
+
+      seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+        if (run_params.numValidGPUBlockSize() == 0u ||
+            run_params.validGPUBlockSize(block_size)) {
+
+          using cuda_items_per_thread = cuda_items_per_thread_type<block_size>;
+
+          if (camp::size<cuda_items_per_thread>::value == 0) {
+
+            addVariantTuningName(vid, "block_"+std::to_string(block_size));
+
+          }
+
+          seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) {
+
+            if (run_params.numValidItemsPerThread() == 0u ||
+                run_params.validItemsPerThread(block_size)) {
+
+              addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_"
+                                        "block_"+std::to_string(block_size));
+
+            }
+
+          });
+
+        }
+
+      });
+
+    }
+
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf
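The CUDA scan kernel above assigns each thread items_per_thread elements spaced block_size apart, so consecutive threads touch consecutive addresses (coalesced), unlike a contiguous per-thread chunk. A standalone sketch of that index mapping with tiny illustrative sizes:

#include <cstdio>

int main()
{
  const int block_size = 4, items_per_thread = 2, block_id = 1;
  for (int tid = 0; tid < block_size; ++tid) {      // stands in for threadIdx.x
    for (int ti = 0; ti < items_per_thread; ++ti) {
      int i = block_id * block_size * items_per_thread + ti * block_size + tid;
      std::printf("thread %d, item %d -> i = %d\n", tid, ti, i);
    }
  }
  return 0;  // thread 0 -> 8,12; thread 1 -> 9,13; thread 2 -> 10,14; ...
}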
diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp
index 6e7135188..22f0bea57 100644
--- a/src/algorithm/SCAN-Hip.cpp
+++ b/src/algorithm/SCAN-Hip.cpp
@@ -21,6 +21,7 @@
 #endif
 
 #include "common/HipDataUtils.hpp"
+#include "common/HipGridScan.hpp"
 
 #include <iostream>
 
@@ -29,8 +30,52 @@ namespace rajaperf
namespace algorithm
 {
 
+template < size_t block_size >
+using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type<
+    detail::hip::grid_scan_max_items_per_thread<Real_type, block_size>::value+1,
+    integer::LessEqual<detail::hip::grid_scan_default_items_per_thread<Real_type, block_size>::value>>;
 
-void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+
+template < size_t block_size, size_t items_per_thread >
+__launch_bounds__(block_size)
+__global__ void scan(Real_ptr x,
+                     Real_ptr y,
+                     Real_ptr block_counts,
+                     Real_ptr grid_counts,
+                     unsigned* block_readys,
+                     Index_type iend)
+{
+  // It looks like blocks do not start running in order in hip, so a block
+  // with a higher index can't wait on a block with a lower index without
+  // deadlocking (have to replace with an atomicInc)
+  const int block_id = blockIdx.x;
+
+  Real_type vals[items_per_thread];
+
+  for (size_t ti = 0; ti < items_per_thread; ++ti) {
+    Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x;
+    if (i < iend) {
+      vals[ti] = x[i];
+    } else {
+      vals[ti] = 0;
+    }
+  }
+
+  Real_type exclusives[items_per_thread];
+  Real_type inclusives[items_per_thread];
+  detail::hip::GridScan<Real_type, block_size, items_per_thread>::grid_scan(
+      block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys);
+
+  for (size_t ti = 0; ti < items_per_thread; ++ti) {
+    Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x;
+    if (i < iend) {
+      y[i] = exclusives[ti];
+    }
+  }
+}
+
+
+void SCAN::runHipVariantLibrary(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -112,7 +157,7 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::exclusive_scan< RAJA::hip_exec<default_gpu_block_size, true /*async*/> >(res, RAJA_SCAN_ARGS);
+      RAJA::exclusive_scan< RAJA::hip_exec<0, true /*async*/> >(res, RAJA_SCAN_ARGS);
 
     }
     stopTimer();
@@ -122,6 +167,164 @@
   }
 }
 
+template < size_t block_size, size_t items_per_thread >
+void SCAN::runHipVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getHipResource()};
+
+  SCAN_DATA_SETUP;
+
+  if ( vid == Base_HIP ) {
+
+    const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread);
+    const size_t shmem_size = 0;
+
+    Real_ptr block_counts;
+    allocData(DataSpace::HipDevice, block_counts, grid_size);
+    Real_ptr grid_counts;
+    allocData(DataSpace::HipDevice, grid_counts, grid_size);
+    unsigned* block_readys;
+    allocData(DataSpace::HipDevice, block_readys, grid_size);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size,
+                                res.get_stream()) );
+
+      RPlaunchHipKernel( (scan<block_size, items_per_thread>),
+                         grid_size, block_size,
+                         shmem_size, res.get_stream(),
+                         x+ibegin, y+ibegin,
+                         block_counts, grid_counts, block_readys,
+                         iend-ibegin );
+
+    }
+    stopTimer();
+
+    deallocData(DataSpace::HipDevice, block_counts);
+    deallocData(DataSpace::HipDevice, grid_counts);
+    deallocData(DataSpace::HipDevice, block_readys);
+
+  } else {
+    getCout() << "\n  SCAN : Unknown Hip variant id = " << vid << std::endl;
+  }
+}
+
+
+void SCAN::runHipVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    if (tune_idx == t) {
+
+      runHipVariantLibrary(vid);
+
+    }
+
+    t += 1;
+
+    if ( vid == Base_HIP ) {
+
+      seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+        if (run_params.numValidGPUBlockSize() == 0u ||
+            run_params.validGPUBlockSize(block_size)) {
+
+          using hip_items_per_thread = hip_items_per_thread_type<block_size>;
+
+          if (camp::size<hip_items_per_thread>::value == 0) {
+
+            if (tune_idx == t) {
+
+              runHipVariantImpl<block_size, detail::hip::grid_scan_default_items_per_thread<Real_type, block_size>::value
+                               >(vid);
+
+            }
+
+            t += 1;
+
+          }
+
+          seq_for(hip_items_per_thread{}, [&](auto items_per_thread) {
+
+            if (run_params.numValidItemsPerThread() == 0u ||
+                run_params.validItemsPerThread(block_size)) {
+
+              if (tune_idx == t) {
+
+                runHipVariantImpl<block_size, items_per_thread>(vid);
+
+              }
+
+              t += 1;
+
+            }
+
+          });
+
+        }
+
+      });
+    }
+
+  } else {
+
+    getCout() << "\n  SCAN : Unknown Hip variant id = " << vid << std::endl;
+
+  }
+}
+
+void SCAN::setHipTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    addVariantTuningName(vid, "rocprim");
+
+    if ( vid == Base_HIP ) {
+
+      seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+        if (run_params.numValidGPUBlockSize() == 0u ||
+            run_params.validGPUBlockSize(block_size)) {
+
+          using hip_items_per_thread = hip_items_per_thread_type<block_size>;
+
+          if (camp::size<hip_items_per_thread>::value == 0) {
+
+            addVariantTuningName(vid, "block_"+std::to_string(block_size));
+
+          }
+
+          seq_for(hip_items_per_thread{}, [&](auto items_per_thread) {
+
+            if (run_params.numValidItemsPerThread() == 0u ||
+                run_params.validItemsPerThread(block_size)) {
+
+              addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_"
+                                        "block_"+std::to_string(block_size));
+
+            }
+
+          });
+
+        }
+
+      });
+
+    }
+
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf
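runHipVariant and setHipTuningDefinitions above (and their CUDA counterparts) must enumerate candidates in exactly the same order so that tune_idx lines up with the registered tuning names. A standalone sketch of that convention; the candidate sizes are illustrative, not the suite's actual lists:

#include <cstdio>
#include <string>
#include <vector>

int main()
{
  std::vector<std::string> names;
  names.push_back("rocprim");            // tune_idx 0: library scan
  for (int block_size : {256, 512}) {    // mirrors the seq_for over block sizes
    for (int ipt : {2, 4}) {             // mirrors the seq_for over items per thread
      names.push_back("itemsPerThread<" + std::to_string(ipt) + ">_block_" +
                      std::to_string(block_size));
    }
  }
  // the runner walks the same loops, bumping t, and fires when t == tune_idx
  for (size_t t = 0; t < names.size(); ++t) {
    std::printf("tune_idx %zu -> %s\n", t, names[t].c_str());
  }
  return 0;
}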
diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp
index 30cb534df..a5c04abc4 100644
--- a/src/algorithm/SCAN.cpp
+++ b/src/algorithm/SCAN.cpp
@@ -28,7 +28,9 @@ SCAN::SCAN(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
   checksum_scale_factor = 1e-2 *
diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp
index 519789a55..f55381d21 100644
--- a/src/algorithm/SCAN.hpp
+++ b/src/algorithm/SCAN.hpp
@@ -10,9 +10,10 @@
 /// SCAN kernel reference implementation:
 ///
 /// // exclusive scan
-/// y[ibegin] = 0;
-/// for (Index_type i = ibegin+1; i < iend; ++i) {
-///   y[i] = y[i-1] + x[i-1];
+/// Real_type scan_var = 0;
+/// for (Index_type i = ibegin; i < iend; ++i) {
+///   y[i] = scan_var;
+///   scan_var += x[i];
 /// }
 ///
@@ -62,8 +63,18 @@ class SCAN : public KernelBase
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
 
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
+
+  void runCudaVariantLibrary(VariantID vid);
+  void runHipVariantLibrary(VariantID vid);
+
+  template < size_t block_size, size_t items_per_thread >
+  void runCudaVariantImpl(VariantID vid);
+  template < size_t block_size, size_t items_per_thread >
+  void runHipVariantImpl(VariantID vid);
+
 private:
-  static const size_t default_gpu_block_size = 0;
+  static const size_t default_gpu_block_size = 256;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Real_ptr m_y;
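The SCAN.hpp reference rewrite above replaces the y[i-1] recurrence with a running variable; both forms produce the same exclusive scan, but the new one reads only x and never re-reads y. A standalone sketch showing the new form on a tiny array:

#include <cstdio>

int main()
{
  const int n = 5;
  double x[n] = {1, 2, 3, 4, 5};
  double y[n];

  double scan_var = 0;
  for (int i = 0; i < n; ++i) {
    y[i] = scan_var;   // exclusive: y[i] excludes x[i] itself
    scan_var += x[i];
  }

  for (int i = 0; i < n; ++i) { std::printf("%g ", y[i]); }  // 0 1 3 6 10
  std::printf("\n");
  return 0;
}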
diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp
index 45cd40d63..4d77667d7 100644
--- a/src/algorithm/SORT-Cuda.cpp
+++ b/src/algorithm/SORT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp
index d87445413..c464bae4e 100644
--- a/src/algorithm/SORT-Hip.cpp
+++ b/src/algorithm/SORT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp
index 05b885d50..133b00a88 100644
--- a/src/algorithm/SORT-OMP.cpp
+++ b/src/algorithm/SORT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index c5e1503af..2d458ff4d 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index b7738f264..bc99df634 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ SORT::SORT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); // touched data size, not actual number of stores and loads + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Sort); diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index b51bf12f9..9df61e411 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index 57176e3db..1f102eb91 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index aece079d4..467a3cbf4 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index 39705af9a..cdf0f044a 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index 91c094ce9..320e307f4 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index a07f1e79b..6315970b2 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ SORTPAIRS::SORTPAIRS(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); // touched data size, not actual number of stores and loads + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Sort); diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index 4cfc3eb36..fa53a15c3 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index bade73b59..facb3d592 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,6 +16,40 @@ namespace rajaperf namespace apps { + +std::ostream& operator<<(std::ostream& stream, const ADomain& domain) +{ + return stream + + << "ADomain" + + << " ndims " << domain.ndims + << " NPNL " << domain.NPNL + << " NPNR " << domain.NPNR + + << " imin " << domain.imin + << " jmin " << domain.jmin + << " kmin " << domain.kmin + << " imax " << domain.imax + << " jmax " << domain.jmax + << " kmax " << domain.kmax + + << " jp " << domain.jp + << " kp " << domain.kp + << " nnalls " << domain.nnalls + + << " fpn " << domain.fpn + << " lpn " << domain.lpn + << " frn " << domain.frn + << " lrn " << domain.lrn + + << " fpz " << domain.fpz + << " lpz " << domain.lpz + + << " n_real_zones " << domain.n_real_zones + << " n_real_nodes " << domain.n_real_nodes ; +} + // // Set zone indices for 2d mesh. // @@ -38,10 +72,10 @@ void setRealZones_2d(Index_type* real_zones, for (Index_type j = jmin; j < jmax; j++) { for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp ; + Index_type iz = i + j*jp ; - Index_type id = (i-imin) + (j-jmin)*j_stride ; - real_zones[id] = ip; + Index_type il = (i-imin) + (j-jmin)*j_stride ; + real_zones[il] = iz; } } } @@ -73,10 +107,10 @@ void setRealZones_3d(Index_type* real_zones, for (Index_type k = kmin; k < kmax; k++) { for (Index_type j = jmin; j < jmax; j++) { for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp + k*kp ; + Index_type iz = i + j*jp + k*kp ; - Index_type id = (i-imin) + (j-jmin)*j_stride + (k-kmin)*k_stride ; - real_zones[id] = ip; + Index_type il = (i-imin) + (j-jmin)*j_stride + (k-kmin)*k_stride ; + real_zones[il] = iz; } } } @@ -104,20 +138,13 @@ void setMeshPositions_2d(Real_ptr x, Real_type dx, Index_type npnl = domain.NPNL; Index_type npnr = domain.NPNR; - Real_ptr x1, x2, x3, x4; - Real_ptr y1, y2, y3, y4; - NDSET2D(domain.jp, x, x1,x2,x3,x4) ; - NDSET2D(domain.jp, y, y1,y2,y3,y4) ; + for (Index_type j = jmin - npnl; j < jmax+1 + npnr; j++) { + for (Index_type i = imin - npnl; i < imax+1 + npnr; i++) { + Index_type in = i + j*jp ; - for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { - for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp ; - - x3[iz] = x4[iz] = i*dx; - x1[iz] = x2[iz] = (i+1)*dx; + x[in] = i*dx; - y1[iz] = y4[iz] = j*dy; - y2[iz] = y3[iz] = (j+1)*dy; + y[in] = j*dy; } } @@ -150,26 +177,16 @@ void setMeshPositions_3d(Real_ptr x, Real_type dx, Index_type npnl = domain.NPNL; Index_type npnr = domain.NPNR; - Real_ptr x0, x1, x2, x3, x4, x5, x6, x7; - Real_ptr y0, y1, y2, y3, y4, y5, y6, y7; - Real_ptr z0, z1, z2, z3, z4, z5, z6, z7; - NDPTRSET(domain.jp, domain.kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - NDPTRSET(domain.jp, domain.kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; - NDPTRSET(domain.jp, domain.kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - - for (Index_type k = kmin - npnl; k < kmax + npnr; k++) { - for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { - for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp + k*kp ; + for (Index_type k = kmin - npnl; k < kmax+1 + npnr; k++) { + for (Index_type j = jmin - npnl; j < jmax+1 + npnr; j++) { + for (Index_type i = imin - npnl; i < imax+1 + npnr; i++) { + Index_type in = i + j*jp + k*kp ; - x0[iz] = x2[iz] = x4[iz] = x6[iz] = i*dx; - x1[iz] = x3[iz] = x5[iz] = x7[iz] = (i+1)*dx; + x[in] = i*dx; - y0[iz] = y1[iz] = y4[iz] = y5[iz] = j*dy; - y2[iz] = y3[iz] = y6[iz] = y7[iz] = (j+1)*dy; + y[in] = j*dy; - z0[iz] = z1[iz] = z2[iz] = z3[iz] = k*dz; - z4[iz] = z5[iz] = z6[iz] = z7[iz] = (k+1)*dz; + z[in] 
= k*dz;
 
       }
     }
diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp
index a4b566c6b..b1908b7a5 100644
--- a/src/apps/AppsData.hpp
+++ b/src/apps/AppsData.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -9,6 +9,8 @@
 #ifndef RAJAPerf_AppsData_HPP
 #define RAJAPerf_AppsData_HPP
 
+#include <iostream>
+
 #include "common/RPTypes.hpp"
 
 namespace rajaperf
@@ -47,40 +49,57 @@ class ADomain
 
   ADomain() = delete;
 
-  ADomain( Index_type rzmax, Index_type ndims )
+  ADomain( Index_type real_nodes_per_dim, Index_type ndims )
     : ndims(ndims), NPNL(2), NPNR(1)
   {
-    imin = NPNL;
-    jmin = NPNL;
-    imax = rzmax + NPNR;
-    jmax = rzmax + NPNR;
-    jp = imax - imin + 1 + NPNL + NPNR;
-    n_real_zones = (imax - imin);
-    n_real_nodes = (imax+1 - imin);
-
-    if ( ndims == 2 ) {
-      kmin = 0;
-      kmax = 0;
-      kp = 0;
-      nnalls = jp * (jmax - jmin + 1 + NPNL + NPNR) ;
+    int NPZL = NPNL - 1;
+    int NPZR = NPNR+1 - 1;
+
+    if ( ndims >= 1 ) {
+      imin = NPNL;
+      imax = NPNL + real_nodes_per_dim-1;
+      nnalls = (imax+1 - imin + NPNL + NPNR);
+      n_real_zones = (imax - imin);
+      n_real_nodes = (imax+1 - imin);
+    } else {
+      imin = 0;
+      imax = 0;
+      nnalls = 0;
+    }
+
+    if ( ndims >= 2 ) {
+      jmin = NPNL;
+      jmax = NPNL + real_nodes_per_dim-1;
+      jp = nnalls;
+      nnalls *= (jmax+1 - jmin + NPNL + NPNR);
       n_real_zones *= (jmax - jmin);
       n_real_nodes *= (jmax+1 - jmin);
-    } else if ( ndims == 3 ) {
+    } else {
+      jmin = 0;
+      jmax = 0;
+      jp = 0;
+    }
+
+    if ( ndims >= 3 ) {
       kmin = NPNL;
-      kmax = rzmax + NPNR;
-      kp = jp * (jmax - jmin + 1 + NPNL + NPNR);
-      nnalls = kp * (kmax - kmin + 1 + NPNL + NPNR) ;
-      n_real_zones *= (jmax - jmin) * (kmax - kmin);
-      n_real_nodes *= (jmax+1 - jmin) * (kmax+1 - kmin);
+      kmax = NPNL + real_nodes_per_dim-1;
+      kp = nnalls;
+      nnalls *= (kmax+1 - kmin + NPNL + NPNR);
+      n_real_zones *= (kmax - kmin);
+      n_real_nodes *= (kmax+1 - kmin);
+    } else {
+      kmin = 0;
+      kmax = 0;
+      kp = 0;
     }
 
-    fpn = 0;
-    lpn = nnalls - 1;
-    frn = fpn + NPNL * (kp + jp) + NPNL;
-    lrn = lpn - NPNR * (kp + jp) - NPNR;
+    frn = kmin*kp + jmin*jp + imin;
+    lrn = kmax*kp + jmax*jp + imax;
+    fpn = (kmin - NPNL)*kp + (jmin - NPNL)*jp + (imin - NPNL);
+    lpn = (kmax + NPNR)*kp + (jmax + NPNR)*jp + (imax + NPNR);
 
-    fpz = frn - jp - kp - 1;
-    lpz = lrn;
+    fpz = (kmin - NPZL)*kp + (jmin - NPZL)*jp + (imin - NPZL);
+    lpz = (kmax-1 + NPZR)*kp + (jmax-1 + NPZR)*jp + (imax-1 + NPZR);
   }
 
   ~ADomain()
@@ -114,6 +133,8 @@ class ADomain
   Index_type n_real_nodes;
 };
 
+std::ostream& operator<<(std::ostream& stream, const ADomain& domain);
+
 //
 // Routines for initializing real zone indices for 2d/3d domains.
 //
diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt
index dc3485354..4a0584e96 100644
--- a/src/apps/CMakeLists.txt
+++ b/src/apps/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
# @@ -15,95 +15,103 @@ blt_add_library( CONVECTION3DPA-Seq.cpp CONVECTION3DPA-OMP.cpp CONVECTION3DPA-OMPTarget.cpp - DEL_DOT_VEC_2D.cpp - DEL_DOT_VEC_2D-Seq.cpp - DEL_DOT_VEC_2D-Hip.cpp - DEL_DOT_VEC_2D-Cuda.cpp - DEL_DOT_VEC_2D-OMP.cpp - DEL_DOT_VEC_2D-OMPTarget.cpp + CONVECTION3DPA-Sycl.cpp + DEL_DOT_VEC_2D.cpp + DEL_DOT_VEC_2D-Seq.cpp + DEL_DOT_VEC_2D-Hip.cpp + DEL_DOT_VEC_2D-Cuda.cpp + DEL_DOT_VEC_2D-OMP.cpp + DEL_DOT_VEC_2D-OMPTarget.cpp + DEL_DOT_VEC_2D-Sycl.cpp DIFFUSION3DPA.cpp DIFFUSION3DPA-Cuda.cpp DIFFUSION3DPA-Hip.cpp DIFFUSION3DPA-Seq.cpp DIFFUSION3DPA-OMP.cpp DIFFUSION3DPA-OMPTarget.cpp + DIFFUSION3DPA-Sycl.cpp EDGE3D.cpp EDGE3D-Cuda.cpp EDGE3D-Hip.cpp EDGE3D-Seq.cpp EDGE3D-OMP.cpp EDGE3D-OMPTarget.cpp + EDGE3D-Sycl.cpp ENERGY.cpp ENERGY-Seq.cpp - ENERGY-Hip.cpp - ENERGY-Cuda.cpp - ENERGY-OMP.cpp - ENERGY-OMPTarget.cpp + ENERGY-Hip.cpp + ENERGY-Cuda.cpp + ENERGY-OMP.cpp + ENERGY-OMPTarget.cpp + ENERGY-Sycl.cpp FIR.cpp FIR-Seq.cpp FIR-Hip.cpp FIR-Cuda.cpp FIR-OMP.cpp FIR-OMPTarget.cpp - HALOEXCHANGE.cpp - HALOEXCHANGE-Seq.cpp - HALOEXCHANGE-Hip.cpp - HALOEXCHANGE-Cuda.cpp - HALOEXCHANGE-OMP.cpp - HALOEXCHANGE-OMPTarget.cpp - HALOEXCHANGE_FUSED.cpp - HALOEXCHANGE_FUSED-Seq.cpp - HALOEXCHANGE_FUSED-Hip.cpp - HALOEXCHANGE_FUSED-Cuda.cpp - HALOEXCHANGE_FUSED-OMP.cpp - HALOEXCHANGE_FUSED-OMPTarget.cpp + FIR-Sycl.cpp LTIMES.cpp LTIMES-Seq.cpp LTIMES-Hip.cpp LTIMES-Cuda.cpp LTIMES-OMP.cpp LTIMES-OMPTarget.cpp + LTIMES-Sycl.cpp LTIMES_NOVIEW.cpp LTIMES_NOVIEW-Seq.cpp LTIMES_NOVIEW-Hip.cpp LTIMES_NOVIEW-Cuda.cpp LTIMES_NOVIEW-OMP.cpp LTIMES_NOVIEW-OMPTarget.cpp + LTIMES_NOVIEW-Sycl.cpp MASS3DEA.cpp MASS3DEA-Cuda.cpp MASS3DEA-Hip.cpp MASS3DEA-Seq.cpp MASS3DEA-OMP.cpp - MASS3DEA-OMPTarget.cpp + MASS3DEA-OMPTarget.cpp + MASS3DEA-Sycl.cpp MASS3DPA.cpp MASS3DPA-Cuda.cpp MASS3DPA-Hip.cpp MASS3DPA-Seq.cpp MASS3DPA-OMP.cpp MASS3DPA-OMPTarget.cpp + MASS3DPA-Sycl.cpp + MATVEC_3D_STENCIL.cpp + MATVEC_3D_STENCIL-Seq.cpp + MATVEC_3D_STENCIL-Hip.cpp + MATVEC_3D_STENCIL-Cuda.cpp + MATVEC_3D_STENCIL-OMP.cpp + MATVEC_3D_STENCIL-OMPTarget.cpp + MATVEC_3D_STENCIL-Sycl.cpp NODAL_ACCUMULATION_3D.cpp NODAL_ACCUMULATION_3D-Seq.cpp NODAL_ACCUMULATION_3D-Hip.cpp NODAL_ACCUMULATION_3D-Cuda.cpp NODAL_ACCUMULATION_3D-OMP.cpp NODAL_ACCUMULATION_3D-OMPTarget.cpp - PRESSURE.cpp - PRESSURE-Seq.cpp - PRESSURE-Hip.cpp - PRESSURE-Cuda.cpp - PRESSURE-OMP.cpp - PRESSURE-OMPTarget.cpp + PRESSURE.cpp + PRESSURE-Seq.cpp + PRESSURE-Hip.cpp + PRESSURE-Cuda.cpp + PRESSURE-OMP.cpp + PRESSURE-OMPTarget.cpp + PRESSURE-Sycl.cpp VOL3D.cpp VOL3D-Seq.cpp - VOL3D-Hip.cpp - VOL3D-Cuda.cpp - VOL3D-OMP.cpp - VOL3D-OMPTarget.cpp + VOL3D-Hip.cpp + VOL3D-Cuda.cpp + VOL3D-OMP.cpp + VOL3D-OMPTarget.cpp + VOL3D-Sycl.cpp ZONAL_ACCUMULATION_3D.cpp ZONAL_ACCUMULATION_3D-Seq.cpp ZONAL_ACCUMULATION_3D-Hip.cpp ZONAL_ACCUMULATION_3D-Cuda.cpp ZONAL_ACCUMULATION_3D-OMP.cpp ZONAL_ACCUMULATION_3D-OMPTarget.cpp + ZONAL_ACCUMULATION_3D-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/apps/CONVECTION3DPA-Cuda.cpp b/src/apps/CONVECTION3DPA-Cuda.cpp index 5b5e5f1f4..6160430c0 100644 --- a/src/apps/CONVECTION3DPA-Cuda.cpp +++ b/src/apps/CONVECTION3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
@@ -138,16 +138,16 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {
 
   case Base_CUDA: {
 
-    dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
-
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
       constexpr size_t shmem = 0;
 
-      Convection3DPA<CPA_D1D, CPA_Q1D><<<NE, nthreads_per_block, shmem, res.get_stream()>>>
-        (Basis, tBasis, dBasis, D, X, Y);
-      cudaErrchk(cudaGetLastError());
+      RPlaunchCudaKernel( (Convection3DPA<CPA_D1D, CPA_Q1D>),
+                          NE, nthreads_per_block,
+                          shmem, res.get_stream(),
+                          Basis, tBasis, dBasis, D, X, Y );
 
     }
     stopTimer();
diff --git a/src/apps/CONVECTION3DPA-Hip.cpp b/src/apps/CONVECTION3DPA-Hip.cpp
index ed0eef3e4..12300f940 100644
--- a/src/apps/CONVECTION3DPA-Hip.cpp
+++ b/src/apps/CONVECTION3DPA-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -138,18 +138,16 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) {
 
   case Base_HIP: {
 
-    dim3 nblocks(NE);
-    dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
-
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((Convection3DPA<CPA_D1D, CPA_Q1D>),
-                         dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         Basis, tBasis, dBasis, D, X, Y);
-
-      hipErrchk(hipGetLastError());
+
+      RPlaunchHipKernel( (Convection3DPA<CPA_D1D, CPA_Q1D>),
+                         NE, nthreads_per_block,
+                         shmem, res.get_stream(),
+                         Basis, tBasis, dBasis, D, X, Y );
 
     }
     stopTimer();
diff --git a/src/apps/CONVECTION3DPA-OMP.cpp b/src/apps/CONVECTION3DPA-OMP.cpp
index b414122cb..2826defd0 100644
--- a/src/apps/CONVECTION3DPA-OMP.cpp
+++ b/src/apps/CONVECTION3DPA-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/CONVECTION3DPA-OMPTarget.cpp b/src/apps/CONVECTION3DPA-OMPTarget.cpp
index e0317c930..6affba0c6 100644
--- a/src/apps/CONVECTION3DPA-OMPTarget.cpp
+++ b/src/apps/CONVECTION3DPA-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/CONVECTION3DPA-Seq.cpp b/src/apps/CONVECTION3DPA-Seq.cpp
index a62a93409..9f18a2da8 100644
--- a/src/apps/CONVECTION3DPA-Seq.cpp
+++ b/src/apps/CONVECTION3DPA-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
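The hunks above swap raw <<<...>>> and hipLaunchKernelGGL launches for the suite's RPlaunch* helpers. A hypothetical sketch of such a wrapper (not RAJAPerf's actual implementation), to show what it centralizes: launch configuration, stream plumbing, and post-launch error checking:

#include <cuda_runtime.h>
#include <cstdio>

template < typename Kernel, typename... Args >
void launchCudaKernel(Kernel kernel, dim3 grid, dim3 block,
                      size_t shmem, cudaStream_t stream, Args... args)
{
  kernel<<<grid, block, shmem, stream>>>(args...);
  cudaError_t err = cudaGetLastError();  // catches launch-configuration errors
  if (err != cudaSuccess) {
    std::fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
  }
}

__global__ void axpy(double a, const double* x, double* y, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { y[i] += a * x[i]; }
}

// usage: launchCudaKernel(axpy, dim3(grid), dim3(256), 0, stream, a, x, y, n);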
diff --git a/src/apps/CONVECTION3DPA-Sycl.cpp b/src/apps/CONVECTION3DPA-Sycl.cpp
new file mode 100644
index 000000000..c01087818
--- /dev/null
+++ b/src/apps/CONVECTION3DPA-Sycl.cpp
@@ -0,0 +1,421 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "CONVECTION3DPA.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf {
+namespace apps {
+
+template < size_t work_group_size >
+void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) {
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  CONVECTION3DPA_DATA_SETUP;
+
+  const ::sycl::range<3> workGroupSize(CPA_Q1D, CPA_Q1D, CPA_Q1D);
+  const ::sycl::range<3> gridSize(CPA_Q1D,CPA_Q1D,CPA_Q1D*NE);
+
+  constexpr size_t shmem = 0;
+
+  switch (vid) {
+
+  case Base_SYCL: {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      qu->submit([&](cl::sycl::handler& h) {
+
+        constexpr int max_D1D = CPA_D1D;
+        constexpr int max_Q1D = CPA_Q1D;
+        constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
+
+        auto sm0_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm1_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm2_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm3_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm4_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm5_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+
+        h.parallel_for
+          (cl::sycl::nd_range<3>(gridSize, workGroupSize),
+           [=] (cl::sycl::nd_item<3> itm) {
+
+             const Index_type e = itm.get_group(2);
+
+             double *sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm1 = sm1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm2 = sm2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm3 = sm3_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm4 = sm4_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm5 = sm5_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+
+             double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0;
+             double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1;
+             double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2;
+             double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3;
+             double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4;
+             double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5;
+             double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0;
+             double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1;
+             double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2;
+             double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3;
+             double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4;
+             double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5;
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+               {
+                 SYCL_FOREACH_THREAD(dx,2,CPA_D1D)
+                 {
+                   CONVECTION3DPA_1;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+               {
+                 SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+                 {
+                   CONVECTION3DPA_2;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+                 {
+                   CONVECTION3DPA_3;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+             {
+               SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(qz,0,CPA_Q1D)
+                 {
+                   CONVECTION3DPA_4;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(qz,0,CPA_Q1D)
+             {
+               SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+                 {
+                   CONVECTION3DPA_5;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+             {
+               SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+                 {
+                   CONVECTION3DPA_6;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+                 {
+                   CONVECTION3DPA_7;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+               {
+                 SYCL_FOREACH_THREAD(dx,2,CPA_D1D)
+                 {
+                   CONVECTION3DPA_8;
+                 }
+               }
+             }
+           });
+
+      });
+
+
+    }
+    stopTimer();
+
+    break;
+  }
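The Base_SYCL case above stages element data through sycl::local_accessor work-group memory, with barriers separating the tensor-contraction phases. A minimal standalone sketch of that pattern, a trivial in-group reversal; sizes and names are illustrative:

#include <sycl/sycl.hpp>
#include <iostream>

int main()
{
  sycl::queue q;
  const size_t n = 256, wg = 64;
  int* data = sycl::malloc_shared<int>(n, q);
  for (size_t i = 0; i < n; ++i) { data[i] = static_cast<int>(i); }

  q.submit([&](sycl::handler& h) {
    sycl::local_accessor<int, 1> tile(sycl::range<1>(wg), h);  // work-group scratch
    h.parallel_for(sycl::nd_range<1>(n, wg), [=](sycl::nd_item<1> itm) {
      size_t l = itm.get_local_id(0);
      size_t g = itm.get_global_id(0);
      tile[l] = data[g];
      itm.barrier(sycl::access::fence_space::local_space);
      data[g] = tile[wg - 1 - l];  // safe to read a neighbor's slot after the barrier
    });
  }).wait();

  std::cout << data[0] << " " << data[63] << std::endl;  // 63 0
  sycl::free(data, q);
  return 0;
}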
+
+  case RAJA_SYCL: {
+
+    constexpr bool async = true;
+
+    using launch_policy =
+      RAJA::LaunchPolicy<RAJA::sycl_launch_t<async>>;
+
+    using outer_x =
+      RAJA::LoopPolicy<RAJA::sycl_group_2_direct>;
+
+    using inner_x =
+      RAJA::LoopPolicy<RAJA::sycl_local_2_direct>;
+
+    using inner_y =
+      RAJA::LoopPolicy<RAJA::sycl_local_1_direct>;
+
+    using inner_z =
+      RAJA::LoopPolicy<RAJA::sycl_local_0_direct>;
+
+    //Calculate amount of shared memory needed
+    size_t shmem = 0;
+    {
+      constexpr int max_D1D = CPA_D1D;
+      constexpr int max_Q1D = CPA_Q1D;
+      constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
+
+      constexpr int no_mats = 6;
+      shmem += max_DQ*max_DQ*max_DQ * no_mats * sizeof(double);
+    }
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::launch<launch_policy>( res,
+        RAJA::LaunchParams(RAJA::Teams(NE),
+                           RAJA::Threads(CPA_Q1D, CPA_Q1D, CPA_Q1D), shmem),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+          RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE),
+            [&](int e) {
+
+              //Redefine inside the lambda to keep consistent with base version
+              constexpr int max_D1D = CPA_D1D;
+              constexpr int max_Q1D = CPA_Q1D;
+              constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
+
+              double * sm0 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm1 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm2 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm3 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm4 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm5 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+
+              double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0;
+              double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1;
+              double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2;
+              double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3;
+              double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4;
+              double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5;
+              double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0;
+              double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1;
+              double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2;
+              double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3;
+              double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4;
+              double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5;
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                    [&](int dy) {
+                      RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                        [&](int dx) {
+
+                          CONVECTION3DPA_1;
+
+                        } // lambda (dx)
+                      ); // RAJA::loop<inner_x>
+                    } // lambda (dy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                    [&](int dy) {
+                      RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                        [&](int qx) {
+
+                          CONVECTION3DPA_2;
+
+                        } // lambda (dx)
+                      ); // RAJA::loop<inner_x>
+                    } // lambda (dy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qx) {
+                      RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                        [&](int qy) {
+
+                          CONVECTION3DPA_3;
+
+                        } // lambda (dy)
+                      ); // RAJA::loop<inner_y>
+                    } // lambda (dx)
+                  ); //RAJA::loop<inner_x>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                [&](int qx) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qy) {
+                      RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                        [&](int qz) {
+
+                          CONVECTION3DPA_4;
+
+                        } // lambda (qz)
+                      ); // RAJA::loop<inner_z>
+                    } // lambda (qy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (qx)
+              ); //RAJA::loop<inner_x>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                [&](int qz) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qy) {
+                      RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                        [&](int qx) {
+
+                          CONVECTION3DPA_5;
+
+                        } // lambda (qx)
+                      ); // RAJA::loop<inner_x>
+                    } // lambda (qy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (qz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                [&](int qx) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qy) {
+                      RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                        [&](int dz) {
+
+                          CONVECTION3DPA_6;
+
+                        } // lambda (dz)
+                      ); // RAJA::loop<inner_z>
+                    } // lambda (qy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (qx)
+              ); //RAJA::loop<inner_x>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qx) {
+                      RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                        [&](int dy) {
+
+                          CONVECTION3DPA_7;
+
+                        } // lambda (dy)
+                      ); // RAJA::loop<inner_y>
+                    } // lambda (qx)
+                  ); //RAJA::loop<inner_x>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                    [&](int dy) {
+                      RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                        [&](int dx) {
+
+                          CONVECTION3DPA_8;
+
+                        } // lambda (dx)
+                      ); // RAJA::loop<inner_x>
+                    } // lambda (dy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+            } // lambda (e)
+          ); // RAJA::loop<outer_x>
+
+        } // outer lambda (ctx)
+      ); // RAJA::launch
+
+    } // loop over kernel reps
+    stopTimer();
+
+    break;
+  }
+
+  default: {
+
+    getCout() << "\n CONVECTION3DPA : Unknown Sycl variant id = " << vid
+              << std::endl;
+    break;
+  }
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(CONVECTION3DPA, Sycl)
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp
index 43ed5d539..8213c2c90 100644
--- a/src/apps/CONVECTION3DPA.cpp
+++ b/src/apps/CONVECTION3DPA.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,17 +28,18 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params)
   setDefaultProblemSize(m_NE_default*CPA_Q1D*CPA_Q1D*CPA_Q1D);
   setDefaultReps(50);
 
-  m_NE = std::max(getTargetProblemSize()/(CPA_Q1D*CPA_Q1D*CPA_Q1D), Index_type(1));
+  m_NE = std::max((getTargetProblemSize() + (CPA_Q1D*CPA_Q1D*CPA_Q1D)/2) / (CPA_Q1D*CPA_Q1D*CPA_Q1D), Index_type(1));
 
   setActualProblemSize( m_NE*CPA_Q1D*CPA_Q1D*CPA_Q1D );
 
   setItsPerRep(getActualProblemSize());
   setKernelsPerRep(1);
 
-  setBytesPerRep( 3*CPA_Q1D*CPA_D1D*sizeof(Real_type) +
-                  CPA_VDIM*CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE*sizeof(Real_type) +
-                  CPA_D1D*CPA_D1D*CPA_D1D*m_NE*sizeof(Real_type) +
-                  CPA_D1D*CPA_D1D*CPA_D1D*m_NE*sizeof(Real_type) );
+  setBytesReadPerRep( 3*sizeof(Real_type) * CPA_Q1D*CPA_D1D + // b, bt, g
+                      2*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x, y
+                      CPA_VDIM*sizeof(Real_type) * CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE ); // d
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y
+  setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (
     4 * CPA_D1D * CPA_Q1D * CPA_D1D * CPA_D1D + //2
@@ -64,6 +65,9 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
 }
 
 CONVECTION3DPA::~CONVECTION3DPA()
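For the RAJA_SYCL launch above, shared memory is sized by hand: six scratch arrays of max_DQ^3 doubles. A standalone sketch of the arithmetic; CPA_D1D and CPA_Q1D are set elsewhere in the suite, so the values here are assumptions for illustration:

#include <cstdio>

int main()
{
  const int max_D1D = 4, max_Q1D = 5;  // illustrative only, not the suite's values
  const int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
  const int no_mats = 6;               // six aliased scratch matrices
  size_t shmem = (size_t)max_DQ*max_DQ*max_DQ * no_mats * sizeof(double);
  std::printf("shmem = %zu bytes\n", shmem);  // 125 * 6 * 8 = 6000
  return 0;
}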
diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp
index 784b2d4cd..38629b28c 100644
--- a/src/apps/CONVECTION3DPA.hpp
+++ b/src/apps/CONVECTION3DPA.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -378,17 +378,22 @@ class CONVECTION3DPA : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = CPA_Q1D * CPA_Q1D * CPA_Q1D;
-  using gpu_block_sizes_type = gpu_block_size::list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::list_type<default_gpu_block_size>;
 
   Real_ptr m_B;
   Real_ptr m_Bt;
diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
index 64094c4ab..3c7edcd40 100644
--- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -52,6 +52,7 @@ template < size_t block_size >
 void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
   const Index_type iend = m_domain->n_real_zones;
 
   auto res{getCudaResource()};
@@ -64,17 +65,19 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-
       constexpr size_t shmem = 0;
 
-      deldotvec2d<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(div,
-                  x1, x2, x3, x4,
-                  y1, y2, y3, y4,
-                  fx1, fx2, fx3, fx4,
-                  fy1, fy2, fy3, fy4,
-                  real_zones,
-                  half, ptiny,
-                  iend);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (deldotvec2d<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          div,
+                          x1, x2, x3, x4,
+                          y1, y2, y3, y4,
+                          fx1, fx2, fx3, fx4,
+                          fy1, fy2, fy3, fy4,
+                          real_zones,
+                          half, ptiny,
+                          iend );
 
     }
     stopTimer();
@@ -84,17 +87,20 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      auto deldotvec2d_lambda = [=] __device__ (Index_type ii) {
+        DEL_DOT_VEC_2D_BODY_INDEX;
+        DEL_DOT_VEC_2D_BODY;
+      };
 
+      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      lambda_cuda_forall<<<grid_size, block_size, shmem, res.get_stream()>>>(
-        0, iend,
-        [=] __device__ (Index_type ii) {
-          DEL_DOT_VEC_2D_BODY_INDEX;
-          DEL_DOT_VEC_2D_BODY;
-        });
-      cudaErrchk( cudaGetLastError() );
+      RPlaunchCudaKernel( (lambda_cuda_forall<block_size, decltype(deldotvec2d_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend,
+                          deldotvec2d_lambda );
 
     }
     stopTimer();
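The lambda launch above spells out the closure type (decltype(deldotvec2d_lambda)) because a __global__ function template cannot deduce it through a launch wrapper. A hypothetical sketch of such a forall kernel (not RAJAPerf's actual helper):

#include <cuda_runtime.h>

template < size_t block_size, typename Lambda >
__launch_bounds__(block_size)
__global__ void forall_kernel(int begin, int end, Lambda body)
{
  int i = begin + static_cast<int>(blockIdx.x * block_size + threadIdx.x);
  if (i < end) { body(i); }
}

// usage (host code compiled with nvcc extended lambdas, --extended-lambda):
//   auto body = [=] __device__ (int i) { y[i] += a * x[i]; };
//   forall_kernel<256, decltype(body)><<<grid, 256>>>(0, n, body);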
diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp
index 590ea31b2..79cef6b09 100644
--- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -52,6 +52,7 @@ template < size_t block_size >
 void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
   const Index_type iend = m_domain->n_real_zones;
 
   auto res{getHipResource()};
@@ -64,17 +65,19 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-
       constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((deldotvec2d<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), div,
-                         x1, x2, x3, x4,
-                         y1, y2, y3, y4,
-                         fx1, fx2, fx3, fx4,
-                         fy1, fy2, fy3, fy4,
-                         real_zones,
-                         half, ptiny,
-                         iend);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (deldotvec2d<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         div,
+                         x1, x2, x3, x4,
+                         y1, y2, y3, y4,
+                         fx1, fx2, fx3, fx4,
+                         fy1, fy2, fy3, fy4,
+                         real_zones,
+                         half, ptiny,
+                         iend );
 
     }
     stopTimer();
@@ -85,18 +88,19 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       auto deldotvec2d_lambda = [=] __device__ (Index_type ii) {
-        DEL_DOT_VEC_2D_BODY_INDEX;
         DEL_DOT_VEC_2D_BODY;
       };
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-
       constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((lambda_hip_forall<block_size, decltype(deldotvec2d_lambda)>),
-                         grid_size, block_size, shmem, res.get_stream(),
-                         0, iend, deldotvec2d_lambda);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (lambda_hip_forall<block_size, decltype(deldotvec2d_lambda)>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend,
+                         deldotvec2d_lambda );
 
     }
     stopTimer();
diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp
index 1fc9b5775..730b49887 100644
--- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp
index 8dfa12e6c..b3527802a 100644
--- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp
index ffb533e3a..76b04a96f 100644
--- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
// diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp new file mode 100644 index 000000000..13f59e29d --- /dev/null +++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp @@ -0,0 +1,87 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DEL_DOT_VEC_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "AppsData.hpp" + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template +void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DEL_DOT_VEC_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type ii = item.get_global_id(0); + if (ii < iend) { + DEL_DOT_VEC_2D_BODY_INDEX + DEL_DOT_VEC_2D_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + zones, [=] (Index_type i) { + DEL_DOT_VEC_2D_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n DEL_DOT_VEC_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DEL_DOT_VEC_2D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index ffe5edeb2..8c72474bc 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
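Note: the Base_SYCL variant in the new file above rounds the global range up to a multiple of the work-group size and guards against the padded tail with `if (ii < iend)`. A self-contained sketch of that launch pattern (sizes illustrative):

```cpp
#include <sycl/sycl.hpp>

int main()
{
  constexpr size_t wg_size = 256;
  const size_t n = 1000;                             // loop extent ("iend")
  const size_t global = wg_size * ((n + wg_size - 1) / wg_size);

  sycl::queue q;
  double* a = sycl::malloc_shared<double>(n, q);

  q.parallel_for(sycl::nd_range<1>(global, wg_size),
                 [=](sycl::nd_item<1> item) {
    const size_t i = item.get_global_id(0);
    if (i < n) {            // padded work-items past n do nothing
      a[i] = 2.0 * i;
    }
  }).wait();

  sycl::free(a, q);
  return 0;
}
```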
// @@ -28,7 +28,7 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setDefaultProblemSize(1000*1000); // See rzmax in ADomain struct setDefaultReps(100); - Index_type rzmax = std::sqrt(getTargetProblemSize())+1; + Index_type rzmax = std::sqrt(getTargetProblemSize()) + 1 + std::sqrt(2)-1; m_domain = new ADomain(rzmax, /* ndims = */ 2); m_array_length = m_domain->nnalls; @@ -37,9 +37,10 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + - (1*sizeof(Real_type) + 0*sizeof(Real_type) ) * getItsPerRep() + - (0*sizeof(Real_type) + 4*sizeof(Real_type) ) * m_domain->n_real_nodes ) ; // touched data size, not actual number of stores and loads + setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + + 4*sizeof(Real_type) * m_domain->n_real_nodes ); // 4 variables with 2d nodal stencil pattern: 4 touches per iterate + setBytesWrittenPerRep( 1*sizeof(Index_type) * getItsPerRep() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(54 * m_domain->n_real_zones); setUsesFeature(Forall); @@ -62,6 +63,9 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index d82efc12f..d7c0d20f6 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -118,17 +118,22 @@ class DEL_DOT_VEC_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 863f83854..90a55905b 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
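Note: the DEL_DOT_VEC_2D.cpp hunk above splits the old combined `setBytesPerRep` into separate read/write/atomic counts. A worked example of the new accounting with illustrative sizes (actual counts depend on the domain and on the widths of `Index_type` and `Real_type`):

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
  // Illustrative mesh: ~1000^2 zones, nodal arrays one larger per dimension.
  const std::size_t n_zones = 1000 * 1000;   // stands in for getItsPerRep()
  const std::size_t n_nodes = 1001 * 1001;   // stands in for n_real_nodes

  // Reads: one zone index per iterate, plus 4 nodal variables each touched
  // once per node (the "2d nodal stencil" comment in the hunk above).
  const std::size_t bytes_read = 1 * sizeof(long)   * n_zones
                               + 4 * sizeof(double) * n_nodes;

  // Writes: one result value (div) per zone.
  const std::size_t bytes_written = 1 * sizeof(double) * n_zones;

  std::printf("read %zu B, wrote %zu B per rep\n", bytes_read, bytes_written);
  return 0;
}
```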
// @@ -117,16 +117,16 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); constexpr size_t shmem = 0; - Diffusion3DPA<<>>( - Basis, dBasis, D, X, Y, symmetric); - cudaErrchk(cudaGetLastError()); + RPlaunchCudaKernel( (Diffusion3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + Basis, dBasis, D, X, Y, symmetric ); } stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 7f0dd77b1..15e27ed78 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -117,18 +117,16 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); constexpr size_t shmem = 0; - hipLaunchKernelGGL((Diffusion3DPA), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - Basis, dBasis, D, X, Y, symmetric); - hipErrchk(hipGetLastError()); + RPlaunchHipKernel( (Diffusion3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + Basis, dBasis, D, X, Y, symmetric ); } stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index a1dcdbe04..04f27ec63 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 03a1811a3..be5bf5ecf 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 9e2818de1..c384b0695 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
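Note: the Base_CUDA/Base_HIP hunks above pass the scalar element count `NE` as the grid and a three-dimensional `dim3` as the block. That works because `dim3` converts implicitly from an integer, so one wrapper signature covers both forms; a minimal check (DPA_Q1D = 5 is an assumed value for illustration):

```cpp
#include <cuda_runtime.h>
#include <cassert>

void describe_launch(dim3 grid, dim3 block)
{
  assert(grid.x == 4096 && grid.y == 1 && grid.z == 1);  // NE x 1 x 1
  assert(block.x == 5 && block.y == 5 && block.z == 5);
}

int main()
{
  const int NE = 4096;                      // one block per element
  dim3 nthreads_per_block(5, 5, 5);         // e.g. DPA_Q1D = 5 (assumed)
  describe_launch(NE, nthreads_per_block);  // int converts to dim3(NE, 1, 1)
  return 0;
}
```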
// diff --git a/src/apps/DIFFUSION3DPA-Sycl.cpp b/src/apps/DIFFUSION3DPA-Sycl.cpp new file mode 100644 index 000000000..fccc14260 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Sycl.cpp @@ -0,0 +1,440 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t work_group_size > +void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_SYCL: { + + const ::sycl::range<3> workGroupSize(DPA_Q1D, DPA_Q1D, DPA_Q1D); + const ::sycl::range<3> gridSize(DPA_Q1D,DPA_Q1D,DPA_Q1D*NE); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&](cl::sycl::handler& h) { + + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + auto sBG_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1*MD1), h); + + auto sm0_0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm0_1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm0_2_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_2_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (cl::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(2); + + double *sBG = sBG_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double *sm0_0 = sm0_0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm0_1 = sm0_1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm0_2 = sm0_2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_0 = sm1_0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_1 = sm1_1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_2 = sm1_2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double (*B)[MD1] = (double (*)[MD1]) sBG; + double (*G)[MD1] = (double (*)[MD1]) sBG; + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; + + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_0); + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_1); + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_0); + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_1); + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_2); + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_0); + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_1); + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_2); + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_0); + 
double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_1); + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_2); + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_0); + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_1); + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_1; + } + } + } + + if (itm.get_local_id(0) == 0) + { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + if (itm.get_local_id(0) == 0) + { + SYCL_FOREACH_THREAD(d, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(q, 2, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_9; + } + } + } + + }); + }); + + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = + RAJA::LaunchPolicy>; + + using outer_x = + RAJA::LoopPolicy; + + using inner_x = + RAJA::LoopPolicy; + + using inner_y = + RAJA::LoopPolicy; + + using inner_z = + RAJA::LoopPolicy; + + size_t shmem = 0; + { + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + const size_t local_mats = 6; + shmem += MQ1*MD1*sizeof(double) + local_mats*MDQ*MDQ*MDQ*sizeof(double); + } + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + const bool symmetric = true; + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + //Redefine inside the lambda to keep consistent with base version + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; + + double *sBG = ctx.getSharedMemory(MQ1*MD1); + double *sm0_0 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm0_1 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm0_2 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_0 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_1 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_2 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + + double (*B)[MD1] = (double (*)[MD1]) sBG; + double (*G)[MD1] = (double (*)[MD1]) sBG; + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; + + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_0); + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_1); + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_0); + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_1); + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_2); + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_0); + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_1); + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_2); + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_0); + double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_1); + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_2); + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_0); + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_1); + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_1; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); //RAJA::loop + } // lambda 
(dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n DIFFUSION3DPA : Unknown Sycl variant id = " << vid + << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DIFFUSION3DPA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index e0cd0f6d0..16cf307b5 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
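Note on the Base_SYCL variant of DIFFUSION3DPA above: `sycl::local_accessor` provides the work-group-local scratch storage that plays the role of CUDA `__shared__` arrays, and `get_multi_ptr().get()` yields a raw pointer the kernel reinterprets as fixed-shape arrays. A self-contained sketch of that pattern (sizes and names illustrative, not the kernel above):

```cpp
#include <sycl/sycl.hpp>

int main()
{
  constexpr int D = 5;
  sycl::queue q;
  double* out = sycl::malloc_shared<double>(D * D, q);

  q.submit([&](sycl::handler& h) {
    sycl::local_accessor<double, 1> tile_mem(sycl::range<1>(D * D), h);

    h.parallel_for(sycl::nd_range<2>(sycl::range<2>(D, D),
                                     sycl::range<2>(D, D)),
                   [=](sycl::nd_item<2> itm) {
      // Raw pointer into work-group-local memory, then a 2D view of it.
      double* raw =
        tile_mem.get_multi_ptr<sycl::access::decorated::no>().get();
      double (*tile)[D] = (double (*)[D]) raw;

      const int r = itm.get_local_id(0);
      const int c = itm.get_local_id(1);
      tile[r][c] = r * D + c;

      itm.barrier(sycl::access::fence_space::local_space);

      out[r * D + c] = tile[c][r];   // safe transposed read after barrier
    });
  }).wait();

  sycl::free(out, q);
  return 0;
}
```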
// @@ -28,17 +28,18 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (DPA_Q1D*DPA_Q1D*DPA_Q1D)/2) / (DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); setActualProblemSize( m_NE*DPA_Q1D*DPA_Q1D*DPA_Q1D ); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( 2*DPA_Q1D*DPA_D1D*sizeof(Real_type) + - DPA_Q1D*DPA_Q1D*DPA_Q1D*SYM*m_NE*sizeof(Real_type) + - DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) + - DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * DPA_Q1D*DPA_D1D + // b, g + 2*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x, y + SYM*sizeof(Real_type) * DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE ); // d + setBytesWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D + 5 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_D1D + @@ -65,6 +66,9 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } DIFFUSION3DPA::~DIFFUSION3DPA() diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 62967d5c0..5b587279c 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -481,17 +481,22 @@ class DIFFUSION3DPA : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/EDGE3D-Cuda.cpp b/src/apps/EDGE3D-Cuda.cpp index 9136dc961..5f212fb9b 100644 --- a/src/apps/EDGE3D-Cuda.cpp +++ b/src/apps/EDGE3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
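Note: the `m_NE` hunk above changes truncating division into round-to-nearest by adding half the divisor before dividing. With DPA_Q1D = 5 (an assumed value for illustration) the divisor is 5*5*5 = 125:

```cpp
#include <cassert>

// Round-to-nearest integer division: add half the divisor, then truncate.
constexpr long round_nearest_div(long x, long d) { return (x + d / 2) / d; }

int main()
{
  const long q3 = 5 * 5 * 5;                 // DPA_Q1D^3 with DPA_Q1D = 5
  assert(190 / q3 == 1);                     // old code: floor, 1.52 -> 1
  assert(round_nearest_div(190, q3) == 2);   // new code: 1.52 -> 2
  assert(round_nearest_div(180, q3) == 1);   // 1.44 still rounds down
  return 0;
}
```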
// @@ -66,12 +66,14 @@ void EDGE3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - edge3d<<>>(sum, - x0, x1, x2, x3, x4, x5, x6, x7, - y0, y1, y2, y3, y4, y5, y6, y7, - z0, z1, z2, z3, z4, z5, z6, z7, - ibegin, iend); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (edge3d), + grid_size, block_size, + shmem, res.get_stream(), + sum, + x0, x1, x2, x3, x4, x5, x6, x7, + y0, y1, y2, y3, y4, y5, y6, y7, + z0, z1, z2, z3, z4, z5, z6, z7, + ibegin, iend ); } stopTimer(); @@ -81,14 +83,17 @@ void EDGE3D::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto edge3d_lambda = [=] __device__ (Index_type i) { EDGE3D_BODY; }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - auto edge3d_lam = [=] __device__ (Index_type i) { EDGE3D_BODY; }; - - lambda_cuda_forall<<>>( - ibegin, iend, edge3d_lam); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, + edge3d_lambda ); } stopTimer(); diff --git a/src/apps/EDGE3D-Hip.cpp b/src/apps/EDGE3D-Hip.cpp index 5da3606c4..56ff054d8 100644 --- a/src/apps/EDGE3D-Hip.cpp +++ b/src/apps/EDGE3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -65,13 +65,15 @@ void EDGE3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - - hipLaunchKernelGGL((edge3d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), sum, - x0, x1, x2, x3, x4, x5, x6, x7, - y0, y1, y2, y3, y4, y5, y6, y7, - z0, z1, z2, z3, z4, z5, z6, z7, - ibegin, iend); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (edge3d), + grid_size, block_size, + shmem, res.get_stream(), + sum, + x0, x1, x2, x3, x4, x5, x6, x7, + y0, y1, y2, y3, y4, y5, y6, y7, + z0, z1, z2, z3, z4, z5, z6, z7, + ibegin, iend ); } stopTimer(); @@ -81,16 +83,17 @@ void EDGE3D::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto edge3d_lambda = [=] __device__ (Index_type i) { EDGE3D_BODY; }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - auto edge3d_lam = [=] __device__ (Index_type i) { EDGE3D_BODY; }; - - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, edge3d_lam); - - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, + edge3d_lambda ); } stopTimer(); diff --git a/src/apps/EDGE3D-OMP.cpp b/src/apps/EDGE3D-OMP.cpp index bb79de639..1671872b4 100644 --- a/src/apps/EDGE3D-OMP.cpp +++ b/src/apps/EDGE3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/EDGE3D-OMPTarget.cpp b/src/apps/EDGE3D-OMPTarget.cpp index bf86d856c..2a348ec28 100644 --- a/src/apps/EDGE3D-OMPTarget.cpp +++ b/src/apps/EDGE3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -37,11 +37,6 @@ void EDGE3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu EDGE3D_DATA_SETUP; - auto edge3d_lam = - [=](Index_type i) { - EDGE3D_BODY; - }; - if ( vid == Base_OpenMPTarget ) { startTimer(); @@ -61,8 +56,6 @@ void EDGE3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } else if ( vid == RAJA_OpenMPTarget ) { - EDGE3D_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/apps/EDGE3D-Seq.cpp b/src/apps/EDGE3D-Seq.cpp index 6658650b1..5f7114127 100644 --- a/src/apps/EDGE3D-Seq.cpp +++ b/src/apps/EDGE3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,9 +28,11 @@ void EDGE3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) EDGE3D_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto edge3d_lam = [=](Index_type i) { EDGE3D_BODY; }; +#endif switch ( vid ) { @@ -70,7 +72,7 @@ void EDGE3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( + RAJA::forall( RAJA::RangeSegment(ibegin, iend), edge3d_lam); } diff --git a/src/apps/EDGE3D-Sycl.cpp b/src/apps/EDGE3D-Sycl.cpp new file mode 100644 index 000000000..6b60bbc3c --- /dev/null +++ b/src/apps/EDGE3D-Sycl.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
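Note: guarding `edge3d_lam` with `#if defined(RUN_RAJA_SEQ)` in EDGE3D-Seq.cpp presumably silences set-but-unused warnings when the Lambda_Seq/RAJA_Seq variants are compiled out and only Base_Seq remains. A minimal illustration of the pattern (names hypothetical):

```cpp
#include <cstdio>

void run(int vid)
{
#if defined(RUN_RAJA_SEQ)
  auto body = [=](int i) { std::printf("%d\n", i); };
#endif

  switch ( vid ) {
    case 0: {                          // Base variant: lambda not used
      for (int i = 0; i < 4; ++i) { std::printf("%d\n", i); }
      break;
    }
#if defined(RUN_RAJA_SEQ)
    case 1: {                          // Lambda variant, guarded with it
      for (int i = 0; i < 4; ++i) { body(i); }
      break;
    }
#endif
    default: break;
  }
}

int main() { run(0); return 0; }
```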
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EDGE3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +template < size_t work_group_size > +void EDGE3D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + EDGE3D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0) + ibegin; + if (i < iend) { + EDGE3D_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + EDGE3D_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n EDGE3D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(EDGE3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index 3b93d281b..d917bb321 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
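Note: unlike DEL_DOT_VEC_2D, the EDGE3D Base_SYCL kernel above iterates a subrange `[fpz, lpz+1)` and folds the offset into the global id with `i = item.get_global_id(0) + ibegin`. A sketch of that offset-range pattern (bounds are stand-ins):

```cpp
#include <sycl/sycl.hpp>

int main()
{
  constexpr size_t wg = 256;
  const size_t ibegin = 37, iend = 5000;   // stand-ins for fpz and lpz+1

  // Round iend up to whole work-groups, as the hunk above does; the
  // in-kernel guard discards the padded tail.
  const size_t global = wg * ((iend + wg - 1) / wg);

  sycl::queue q;
  double* sum = sycl::malloc_shared<double>(iend, q);
  for (size_t i = 0; i < iend; ++i) { sum[i] = 0.0; }

  q.parallel_for(sycl::nd_range<1>(global, wg), [=](sycl::nd_item<1> item) {
    const size_t i = item.get_global_id(0) + ibegin;  // fold in the offset
    if (i < iend) { sum[i] += 1.0; }
  }).wait();

  sycl::free(sum, q);
  return 0;
}
```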
// @@ -27,23 +27,22 @@ EDGE3D::EDGE3D(const RunParams& params) { setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(10); - Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_array_length = m_domain->nnalls; size_t number_of_elements = m_domain->lpz+1 - m_domain->fpz; - setActualProblemSize( number_of_elements ); + setActualProblemSize( m_domain->n_real_zones ); setItsPerRep( number_of_elements ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads // see VOL3D.cpp - size_t reads_per_node = 3*sizeof(Real_type); - size_t writes_per_zone = 1*sizeof(Real_type); - setBytesPerRep( writes_per_zone * getItsPerRep() + - reads_per_node * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); + setBytesAtomicModifyWrittenPerRep( 0 ); constexpr size_t flops_k_loop = 15 + 6*flops_Jxx() @@ -83,6 +82,9 @@ EDGE3D::EDGE3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } EDGE3D::~EDGE3D() diff --git a/src/apps/EDGE3D.hpp b/src/apps/EDGE3D.hpp index 82e07c6a5..a5ae54cfa 100644 --- a/src/apps/EDGE3D.hpp +++ b/src/apps/EDGE3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -417,17 +417,22 @@ class EDGE3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index a62974a15..c33321ea7 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -123,51 +123,63 @@ void ENERGY::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - energycalc1<<>>( e_new, e_old, delvc, - p_old, q_old, work, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc2<<>>( delvc, q_new, - compHalfStep, pHalfStep, - e_new, bvc, pbvc, - ql_old, qq_old, - rho0, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc3<<>>( e_new, delvc, - p_old, q_old, - pHalfStep, q_new, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc4<<>>( e_new, work, - e_cut, emin, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc5<<>>( delvc, - pbvc, e_new, vnewc, - bvc, p_new, - ql_old, qq_old, - p_old, q_old, - pHalfStep, q_new, - rho0, e_cut, emin, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc6<<>>( delvc, - pbvc, e_new, vnewc, - bvc, p_new, - q_new, - ql_old, qq_old, - rho0, q_cut, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (energycalc1), + grid_size, block_size, + shmem, res.get_stream(), + e_new, e_old, delvc, + p_old, q_old, work, + iend ); + + RPlaunchCudaKernel( (energycalc2), + grid_size, block_size, + shmem, res.get_stream(), + delvc, q_new, + compHalfStep, pHalfStep, + e_new, bvc, pbvc, + ql_old, qq_old, + rho0, + iend ); + + RPlaunchCudaKernel( (energycalc3), + grid_size, block_size, + shmem, res.get_stream(), + e_new, delvc, + p_old, q_old, + pHalfStep, q_new, + iend ); + + RPlaunchCudaKernel( (energycalc4), + grid_size, block_size, + shmem, res.get_stream(), + e_new, work, + e_cut, emin, + iend ); + + RPlaunchCudaKernel( (energycalc5), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + ql_old, qq_old, + p_old, q_old, + pHalfStep, q_new, + rho0, e_cut, emin, + iend ); + + RPlaunchCudaKernel( (energycalc6), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + q_new, + ql_old, qq_old, + rho0, q_cut, + iend ); } stopTimer(); diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index c7064e591..e0424d55a 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -123,51 +123,63 @@ void ENERGY::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - hipLaunchKernelGGL((energycalc1), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, e_old, delvc, - p_old, q_old, work, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc2), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, q_new, - compHalfStep, pHalfStep, - e_new, bvc, pbvc, - ql_old, qq_old, - rho0, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc3), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, delvc, - p_old, q_old, - pHalfStep, q_new, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc4), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, work, - e_cut, emin, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc5), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, - pbvc, e_new, vnewc, - bvc, p_new, - ql_old, qq_old, - p_old, q_old, - pHalfStep, q_new, - rho0, e_cut, emin, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc6), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, - pbvc, e_new, vnewc, - bvc, p_new, - q_new, - ql_old, qq_old, - rho0, q_cut, - iend ); - hipErrchk( hipGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (energycalc1), + grid_size, block_size, + shmem, res.get_stream(), + e_new, e_old, delvc, + p_old, q_old, work, + iend ); + + RPlaunchHipKernel( (energycalc2), + grid_size, block_size, + shmem, res.get_stream(), + delvc, q_new, + compHalfStep, pHalfStep, + e_new, bvc, pbvc, + ql_old, qq_old, + rho0, + iend ); + + RPlaunchHipKernel( (energycalc3), + grid_size, block_size, + shmem, res.get_stream(), + e_new, delvc, + p_old, q_old, + pHalfStep, q_new, + iend ); + + RPlaunchHipKernel( (energycalc4), + grid_size, block_size, + shmem, res.get_stream(), + e_new, work, + e_cut, emin, + iend ); + + RPlaunchHipKernel( (energycalc5), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + ql_old, qq_old, + p_old, q_old, + pHalfStep, q_new, + rho0, e_cut, emin, + iend ); + + RPlaunchHipKernel( (energycalc6), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + q_new, + ql_old, qq_old, + rho0, q_cut, + iend ); } stopTimer(); diff --git a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index 235386ff0..687f69d25 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index 83ce48357..786623a8f 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index c7e3ffdf2..bbf9d73c0 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp new file mode 100644 index 000000000..7ebc7f3c7 --- /dev/null +++ b/src/apps/ENERGY-Sycl.cpp @@ -0,0 +1,174 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ENERGY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template +void ENERGY::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + ENERGY_DATA_SETUP; + + using sycl::sqrt; + using sycl::fabs; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY1 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY2 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY3 + } + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY4 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY5 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) 
{ + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY6 + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + const bool async = true; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::region( [=]() { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY1; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY2; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY3; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY4; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY5; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY6; + }); + + }); // end sequential region (for single-source code) + + } + stopTimer(); + + } else { + std::cout << "\n ENERGY : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ENERGY, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index fd1988300..7f480d00f 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
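Note: the RAJA_SYCL path above wraps six dependent `forall` calls in `RAJA::region<RAJA::seq_region>` so the single-source structure matches the other backends, while `sycl_exec<work_group_size, true /*async*/>` avoids a host wait between kernels. The kernels still execute in order provided the underlying queue is in-order, which is assumed here for the Suite's resource; a minimal illustration of why that suffices:

```cpp
#include <sycl/sycl.hpp>
#include <cassert>

int main()
{
  sycl::queue q{sycl::property::queue::in_order()};
  const size_t n = 1024;
  double* e_new = sycl::malloc_shared<double>(n, q);

  // Two dependent kernels, no host wait in between: an in-order queue
  // runs them in submission order, like the six ENERGY kernels per rep.
  q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) { e_new[i] = 1.0; });
  q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) { e_new[i] += 2.0; });

  q.wait();  // one synchronization at the end, as after the rep loop
  assert(e_new[0] == 3.0);

  sycl::free(e_new, q);
  return 0;
}
```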
// @@ -29,13 +29,22 @@ ENERGY::ENERGY(const RunParams& params) setItsPerRep( 6 * getActualProblemSize() ); setKernelsPerRep(6); // some branches are never taken due to the nature of the initialization of delvc - // the additional reads and writes that would be done if those branches were taken are noted in the comments - setBytesPerRep( (1*sizeof(Real_type) + 5*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + /* 1 + 8 */ - (1*sizeof(Real_type) + 6*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 7*sizeof(Real_type)) * getActualProblemSize() + /* 1 + 12 */ - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); /* 1 + 8 */ + // the additional reads that would be done if those branches were taken are noted in the comments + setBytesReadPerRep((5*sizeof(Real_type) + + 1*sizeof(Real_type) + // 8 + 6*sizeof(Real_type) + + 2*sizeof(Real_type) + + 7*sizeof(Real_type) + // 12 + 1*sizeof(Real_type) // 8 + ) * getActualProblemSize() ); + setBytesWrittenPerRep((1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 0*sizeof(Real_type) + ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((6 + 11 + // 1 sqrt 8 + @@ -62,6 +71,9 @@ ENERGY::ENERGY(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } ENERGY::~ENERGY() diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 22af34867..ba5b69949 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -203,17 +203,22 @@ class ENERGY : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_e_new; Real_ptr m_e_old; diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index f88e7b55d..fd4386324 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -24,6 +24,11 @@ for (int i = threadIdx.k; i < N; i += blockDim.k) #endif +#if defined(RAJA_ENABLE_SYCL) +#define SYCL_FOREACH_THREAD(i, k, N) \ + for (int i = itm.get_local_id(k); i < N; i += itm.get_local_range(k)) +#endif + #define CPU_FOREACH(i, k, N) for (int i = 0; i < N; i++) #endif // closing endif for header file include guard diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 01266c7d6..9605d85b1 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -83,7 +83,7 @@ void FIR::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -98,20 +98,24 @@ void FIR::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; #if defined(USE_CUDA_CONSTANT_MEMORY) - fir<<>>( out, in, - coefflen, - iend ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (fir), + grid_size, block_size, + shmem, res.get_stream(), + out, in, + coefflen, + iend ); #else - fir<<>>( out, in, - coeff, - coefflen, - iend ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (fir), + grid_size, block_size, + shmem, res.get_stream(), + out, in, + coeff, + coefflen, + iend ); #endif } diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 47dc40efb..a3272cb23 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
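Note: the new `SYCL_FOREACH_THREAD` macro in FEM_MACROS.hpp strides a loop index over `[0, N)` along work-group dimension `k`, the SYCL analog of the CUDA/HIP thread-loop macros beside it. It expects an `nd_item` named `itm` in scope at the expansion site. A runnable usage sketch:

```cpp
#include <sycl/sycl.hpp>

// Same definition as the hunk above; requires an nd_item named `itm`.
#define SYCL_FOREACH_THREAD(i, k, N) \
  for (int i = itm.get_local_id(k); i < N; i += itm.get_local_range(k))

int main()
{
  constexpr int N = 7;                      // extent larger than local range
  sycl::queue q;
  int* hits = sycl::malloc_shared<int>(N * N, q);
  for (int i = 0; i < N * N; ++i) { hits[i] = 0; }

  q.parallel_for(sycl::nd_range<2>(sycl::range<2>(4, 4),
                                   sycl::range<2>(4, 4)),
                 [=](sycl::nd_item<2> itm) {
    SYCL_FOREACH_THREAD(y, 0, N) {
      SYCL_FOREACH_THREAD(x, 1, N) {
        hits[y * N + x] = 1;                // stride-4 loops cover all of [0,7)
      }
    }
  }).wait();

  sycl::free(hits, q);
  return 0;
}
```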
// @@ -81,7 +81,7 @@ void FIR::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -96,20 +96,24 @@ void FIR::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; #if defined(USE_HIP_CONSTANT_MEMORY) - hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out, in, - coefflen, - iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (fir), + grid_size, block_size, + shmem, res.get_stream(), + out, in, + coefflen, + iend ); #else - hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out, in, - coeff, - coefflen, - iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (fir), + grid_size, block_size, + shmem, res.get_stream(), + out, in, + coeff, + coefflen, + iend ); #endif } diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 5b3cc2a35..5fcad1616 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,7 +25,7 @@ void FIR::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); FIR_COEFF; diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 5715f884a..3ba913846 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -43,7 +43,7 @@ void FIR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); FIR_DATA_SETUP; diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index b13d30818..001ffd194 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -23,7 +23,7 @@ void FIR::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); FIR_COEFF; diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp new file mode 100644 index 000000000..eee240a5f --- /dev/null +++ b/src/apps/FIR-Sycl.cpp @@ -0,0 +1,109 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIR.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +#define FIR_DATA_SETUP_SYCL \ + Real_ptr coeff; \ +\ + allocAndInitSyclDeviceData(in, m_in, getActualProblemSize(), qu); \ + allocAndInitSyclDeviceData(out, m_out, getActualProblemSize(), qu); \ + Real_ptr tcoeff = &coeff_array[0]; \ + allocAndInitSyclDeviceData(coeff, tcoeff, FIR_COEFFLEN, qu); + +#define FIR_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_out, out, getActualProblemSize(), qu); \ + deallocSyclDeviceData(in, qu); \ + deallocSyclDeviceData(out, qu); \ + deallocSyclDeviceData(coeff, qu); + + +template +void FIR::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + FIR_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + FIR_COEFF; + + FIR_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + FIR_BODY + } + + }); + }); + + } + stopTimer(); + + FIR_DATA_TEARDOWN_SYCL; + + } else if ( vid == RAJA_SYCL ) { + + FIR_COEFF; + + FIR_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + FIR_BODY; + }); + + } + stopTimer(); + + FIR_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n FIR : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIR, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 7b51aaebc..f4a2de7e8 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
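Note: FIR-Sycl.cpp above defines its own `FIR_DATA_SETUP_SYCL` / `FIR_DATA_TEARDOWN_SYCL` macros on top of the Suite's SYCL data helpers. In plain SYCL USM terms that pattern amounts to the following sketch; the helpers' actual signatures live in common/SyclDataUtils.hpp and are not shown in this diff:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main()
{
  sycl::queue q;
  const size_t len = 1024;
  std::vector<double> host(len, 1.0);

  // allocAndInitSyclDeviceData: device allocation + host-to-device copy.
  double* in = sycl::malloc_device<double>(len, q);
  q.memcpy(in, host.data(), len * sizeof(double)).wait();

  // ... kernels operate on `in` ...

  // getSyclDeviceData then deallocSyclDeviceData: copy back + free.
  q.memcpy(host.data(), in, len * sizeof(double)).wait();
  sycl::free(in, q);
  return 0;
}
```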
diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp
index 7b51aaebc..f4a2de7e8 100644
--- a/src/apps/FIR.cpp
+++ b/src/apps/FIR.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,11 +28,13 @@ FIR::FIR(const RunParams& params)
 
   setActualProblemSize( getTargetProblemSize() );
 
-  setItsPerRep( getActualProblemSize() - m_coefflen );
+  setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() +
-                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() );
-  setFLOPsPerRep((2 * m_coefflen) * (getActualProblemSize() - m_coefflen));
+  setBytesReadPerRep( m_coefflen*sizeof(Real_type) +
+                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
+  setFLOPsPerRep((2 * m_coefflen) * getActualProblemSize());
 
   checksum_scale_factor = 0.0001 *
     ( static_cast<Checksum_type>(getDefaultProblemSize()) /
@@ -56,6 +58,9 @@ FIR::FIR(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 FIR::~FIR()
@@ -64,7 +69,7 @@ FIR::~FIR()
 
 void FIR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
-  allocAndInitData(m_in, getActualProblemSize(), vid);
+  allocAndInitData(m_in, getActualProblemSize() + m_coefflen-1, vid);
   allocAndInitDataConst(m_out, getActualProblemSize(), 0.0, vid);
 }
 
diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp
index 3ca8a1cef..72968045f 100644
--- a/src/apps/FIR.hpp
+++ b/src/apps/FIR.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -78,17 +78,22 @@ class FIR : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_in;
   Real_ptr m_out;
diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp
deleted file mode 100644
index 58534da21..000000000
--- a/src/apps/HALOEXCHANGE.cpp
+++ /dev/null
@@ -1,449 +0,0 @@
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
-// and RAJA Performance Suite project contributors.
-// See the RAJAPerf/LICENSE file for details.
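The FIR.cpp changes above are linked: with iend now equal to the full problem
size N, output element N-1 reads input element (N-1) + (coefflen-1), so
setUp() allocates the input with m_coefflen-1 extra elements and the
bytes-read count becomes coefflen coefficient values plus N + coefflen-1
input values. A plain reference loop showing the access pattern (a sketch,
not the suite's FIR_BODY macro):

    // All N outputs are computed; the highest input index touched is
    // (N-1) + (coefflen-1), hence the padded allocation above.
    void fir_ref(double* out, const double* in, const double* coeff,
                 int coefflen, int N)
    {
      for (int i = 0; i < N; ++i) {
        double sum = 0.0;
        for (int j = 0; j < coefflen; ++j) {
          sum += coeff[j] * in[i + j];
        }
        out[i] = sum;
      }
    }

Each output performs coefflen multiplies and coefflen adds, which matches the
(2 * m_coefflen) * getActualProblemSize() FLOP count above.
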
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "HALOEXCHANGE.hpp" - -#include "RAJA/RAJA.hpp" - -#include "common/DataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace apps -{ - -HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) - : KernelBase(rajaperf::Apps_HALOEXCHANGE, params) -{ - m_grid_dims_default[0] = 100; - m_grid_dims_default[1] = 100; - m_grid_dims_default[2] = 100; - m_halo_width_default = 1; - m_num_vars_default = 3; - - setDefaultProblemSize( m_grid_dims_default[0] * - m_grid_dims_default[1] * - m_grid_dims_default[2] ); - setDefaultReps(50); - - double cbrt_run_size = std::cbrt(getTargetProblemSize()); - - m_grid_dims[0] = cbrt_run_size; - m_grid_dims[1] = cbrt_run_size; - m_grid_dims[2] = cbrt_run_size; - m_halo_width = m_halo_width_default; - m_num_vars = m_num_vars_default; - - m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; - m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; - m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; - m_var_size = m_grid_plus_halo_dims[0] * - m_grid_plus_halo_dims[1] * - m_grid_plus_halo_dims[2] ; - - setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); - - setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); - setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); - setFLOPsPerRep(0); - - setUsesFeature(Forall); - - setVariantDefined( Base_Seq ); - setVariantDefined( Lambda_Seq ); - setVariantDefined( RAJA_Seq ); - - setVariantDefined( Base_OpenMP ); - setVariantDefined( Lambda_OpenMP ); - setVariantDefined( RAJA_OpenMP ); - - setVariantDefined( Base_OpenMPTarget ); - setVariantDefined( RAJA_OpenMPTarget ); - - setVariantDefined( Base_CUDA ); - setVariantDefined( RAJA_CUDA ); - - setVariantDefined( Base_HIP ); - setVariantDefined( RAJA_HIP ); -} - -HALOEXCHANGE::~HALOEXCHANGE() -{ -} - -void HALOEXCHANGE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ - m_vars.resize(m_num_vars, nullptr); - for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); - auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); - - Real_ptr var = m_vars[v]; - - for (Index_type i = 0; i < m_var_size; i++) { - var[i] = i + v; - } - } - - m_pack_index_lists.resize(s_num_neighbors, nullptr); - m_pack_index_list_lengths.resize(s_num_neighbors, 0); - create_pack_lists(m_pack_index_lists, m_pack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_unpack_index_lists.resize(s_num_neighbors, nullptr); - m_unpack_index_list_lengths.resize(s_num_neighbors, 0); - create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_buffers.resize(s_num_neighbors, nullptr); - for (Index_type l = 0; l < s_num_neighbors; ++l) { - Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); - } -} - -void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) -{ - for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); - } -} - -void HALOEXCHANGE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ 
- for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); - } - m_buffers.clear(); - - destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid); - m_unpack_index_list_lengths.clear(); - m_unpack_index_lists.clear(); - - destroy_pack_lists(m_pack_index_lists, s_num_neighbors, vid); - m_pack_index_list_lengths.clear(); - m_pack_index_lists.clear(); - - for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v], vid); - } - m_vars.clear(); -} - -namespace { - -struct Extent -{ - Index_type i_min; - Index_type i_max; - Index_type j_min; - Index_type j_max; - Index_type k_min; - Index_type k_max; -}; - -} - -// -// Function to generate index lists for packing. -// -void HALOEXCHANGE::create_pack_lists( - std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector pack_index_list_extents(num_neighbors); - - // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + 
halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = pack_index_list_extents[l]; - - pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); - - Int_ptr pack_list = pack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type pack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - pack_list[list_idx] = pack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy packing index lists. -// -void HALOEXCHANGE::destroy_pack_lists( - std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(pack_index_lists[l], vid); - } -} - -// -// Function to generate index lists for unpacking. 
-// -void HALOEXCHANGE::create_unpack_lists( - std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector unpack_index_list_extents(num_neighbors); - - // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - 
unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = unpack_index_list_extents[l]; - - unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - - Int_ptr unpack_list = unpack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type unpack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - unpack_list[list_idx] = unpack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy unpacking index lists. -// -void HALOEXCHANGE::destroy_unpack_lists( - std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(unpack_index_lists[l], vid); - } -} - -} // end namespace apps -} // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp deleted file mode 100644 index 1f21d9616..000000000 --- a/src/apps/HALOEXCHANGE.hpp +++ /dev/null @@ -1,151 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/LICENSE file for details. 
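All of the removed list builders above flatten an (ii, jj, kk) triple into a
1D offset using unit stride in i and strides padded by 2*halo_width in j and
k. Isolated, the computation is:

    // 3D -> 1D offset on a grid padded by halo_width on every side; this is
    // the pack_idx/unpack_idx expression from the removed loops (standalone,
    // illustrative types).
    inline long flatten_index(long ii, long jj, long kk,
                              const long* grid_dims, long halo_width)
    {
      const long j_stride = grid_dims[0] + 2*halo_width;
      const long k_stride = j_stride * (grid_dims[1] + 2*halo_width);
      return ii + jj * j_stride + kk * k_stride;
    }
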
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -/// -/// HALOEXCHANGE kernel reference implementation: -/// -/// // pack message for each neighbor -/// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; -/// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; -/// // pack part of each variable -/// for (Index_type v = 0; v < num_vars; ++v) { -/// Real_ptr var = vars[v]; -/// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_PACK_BODY; -/// } -/// buffer += len; -/// } -/// // send message to neighbor -/// } -/// -/// // unpack messages for each neighbor -/// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor -/// Real_ptr buffer = buffers[l]; -/// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; -/// // unpack part of each variable -/// for (Index_type v = 0; v < num_vars; ++v) { -/// Real_ptr var = vars[v]; -/// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_UNPACK_BODY; -/// } -/// buffer += len; -/// } -/// } -/// - -#ifndef RAJAPerf_Apps_HALOEXCHANGE_HPP -#define RAJAPerf_Apps_HALOEXCHANGE_HPP - -#define HALOEXCHANGE_DATA_SETUP \ - std::vector vars = m_vars; \ - std::vector buffers = m_buffers; \ -\ - Index_type num_neighbors = s_num_neighbors; \ - Index_type num_vars = m_num_vars; \ - std::vector pack_index_lists = m_pack_index_lists; \ - std::vector pack_index_list_lengths = m_pack_index_list_lengths; \ - std::vector unpack_index_lists = m_unpack_index_lists; \ - std::vector unpack_index_list_lengths = m_unpack_index_list_lengths; - -#define HALOEXCHANGE_PACK_BODY \ - buffer[i] = var[list[i]]; - -#define HALOEXCHANGE_UNPACK_BODY \ - var[list[i]] = buffer[i]; - - -#include "common/KernelBase.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -class RunParams; - -namespace apps -{ - -class HALOEXCHANGE : public KernelBase -{ -public: - - HALOEXCHANGE(const RunParams& params); - - ~HALOEXCHANGE(); - - void setUp(VariantID vid, size_t tune_idx); - void updateChecksum(VariantID vid, size_t tune_idx); - void tearDown(VariantID vid, size_t tune_idx); - - void runSeqVariant(VariantID vid, size_t tune_idx); - void runOpenMPVariant(VariantID vid, size_t tune_idx); - void runCudaVariant(VariantID vid, size_t tune_idx); - void runHipVariant(VariantID vid, size_t tune_idx); - void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - - void setCudaTuningDefinitions(VariantID vid); - void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); - -private: - static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; - - static const int s_num_neighbors = 26; - - Index_type m_grid_dims[3]; - Index_type m_halo_width; - Index_type m_num_vars; - - Index_type m_grid_dims_default[3]; - Index_type m_halo_width_default; - Index_type m_num_vars_default; - - Index_type m_grid_plus_halo_dims[3]; - Index_type m_var_size; - Index_type m_var_halo_size; - - std::vector m_vars; - std::vector m_buffers; - - std::vector m_pack_index_lists; - std::vector m_pack_index_list_lengths; - std::vector m_unpack_index_lists; - std::vector m_unpack_index_list_lengths; - - void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const 
Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_pack_lists(std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid); - void create_unpack_lists(std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_unpack_lists(std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid); -}; - -} // end namespace apps -} // end namespace rajaperf - -#endif // closing endif for header file include guard diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp deleted file mode 100644 index 6be241d43..000000000 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ /dev/null @@ -1,256 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "HALOEXCHANGE_FUSED.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace apps -{ - -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ - Real_ptr* pack_buffer_ptrs; \ - Int_ptr* pack_list_ptrs; \ - Real_ptr* pack_var_ptrs; \ - Index_type* pack_len_ptrs; \ - allocData(DataSpace::HipPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_len_ptrs, num_neighbors * num_vars); \ - Real_ptr* unpack_buffer_ptrs; \ - Int_ptr* unpack_list_ptrs; \ - Real_ptr* unpack_var_ptrs; \ - Index_type* unpack_len_ptrs; \ - allocData(DataSpace::HipPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocData(DataSpace::HipPinned, pack_buffer_ptrs); \ - deallocData(DataSpace::HipPinned, pack_list_ptrs); \ - deallocData(DataSpace::HipPinned, pack_var_ptrs); \ - deallocData(DataSpace::HipPinned, pack_len_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_buffer_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_list_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_var_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_len_ptrs); - -template < size_t block_size > -__launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, - Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) -{ - Index_type j = blockIdx.y; - - Real_ptr buffer = pack_buffer_ptrs[j]; - Int_ptr list = pack_list_ptrs[j]; - Real_ptr var = pack_var_ptrs[j]; - Index_type len = pack_len_ptrs[j]; - - for (Index_type i = threadIdx.x + blockIdx.x * block_size; - i < len; - i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_PACK_BODY; - } -} - -template < size_t block_size > -__launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* 
unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, - Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) -{ - Index_type j = blockIdx.y; - - Real_ptr buffer = unpack_buffer_ptrs[j]; - Int_ptr list = unpack_list_ptrs[j]; - Real_ptr var = unpack_var_ptrs[j]; - Index_type len = unpack_len_ptrs[j]; - - for (Index_type i = threadIdx.x + blockIdx.x * block_size; - i < len; - i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - } -} - - -template < size_t block_size > -void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - - auto res{getHipResource()}; - - HALOEXCHANGE_FUSED_DATA_SETUP; - - if ( vid == Base_HIP ) { - - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - constexpr size_t shmem = 0; - - Index_type pack_index = 0; - Index_type pack_len_sum = 0; - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pack_buffer_ptrs[pack_index] = buffer; - pack_list_ptrs[pack_index] = list; - pack_var_ptrs[pack_index] = var; - pack_len_ptrs[pack_index] = len; - pack_len_sum += len; - pack_index += 1; - buffer += len; - } - } - Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; - dim3 pack_nthreads_per_block(block_size); - dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), - pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); - hipErrchk( hipGetLastError() ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - - Index_type unpack_index = 0; - Index_type unpack_len_sum = 0; - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - unpack_buffer_ptrs[unpack_index] = buffer; - unpack_list_ptrs[unpack_index] = list; - unpack_var_ptrs[unpack_index] = var; - unpack_len_ptrs[unpack_index] = len; - unpack_len_sum += len; - unpack_index += 1; - buffer += len; - } - } - Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; - dim3 unpack_nthreads_per_block(block_size); - dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), - unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); - hipErrchk( hipGetLastError() ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - - } - stopTimer(); - - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - using AllocatorHolder = RAJAPoolAllocatorHolder; - using Allocator = AllocatorHolder::Allocator; - - AllocatorHolder allocatorHolder; - - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::hip_work_async, -#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) - RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, -#else - RAJA::ordered, -#endif - RAJA::constant_stride_array_of_objects >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - 
RAJA::xargs<>, - Allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(res); - res.wait(); - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(res); - res.wait(); - - } - stopTimer(); - - } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; - } -} - -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Hip) - -} // end namespace apps -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp deleted file mode 100644 index 74dd5b0d5..000000000 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ /dev/null @@ -1,449 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/LICENSE file for details. 
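The removed RAJA_HIP variant above shows the fused-kernel flow: one range per
(neighbor, variable) pair is enqueued into a work pool, the pool is
instantiated into a work group, and the group runs as a single device launch.
Condensed sketch, where allocator, len, and body_lambda stand in for the
deleted code's specifics:

    // Many small pack/unpack loops, one launch.
    workpool pool(allocator);
    pool.reserve(num_neighbors * num_vars, 1024ull*1024ull);
    for (Index_type l = 0; l < num_neighbors; ++l) {
      for (Index_type v = 0; v < num_vars; ++v) {
        pool.enqueue(RAJA::TypedRangeSegment<Index_type>(0, len), body_lambda);
      }
    }
    workgroup group = pool.instantiate();  // fuse the enqueued ranges
    worksite site = group.run(res);        // single launch for the whole batch
    res.wait();
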
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "HALOEXCHANGE_FUSED.hpp" - -#include "RAJA/RAJA.hpp" - -#include "common/DataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace apps -{ - -HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) - : KernelBase(rajaperf::Apps_HALOEXCHANGE_FUSED, params) -{ - m_grid_dims_default[0] = 100; - m_grid_dims_default[1] = 100; - m_grid_dims_default[2] = 100; - m_halo_width_default = 1; - m_num_vars_default = 3; - - setDefaultProblemSize( m_grid_dims_default[0] * - m_grid_dims_default[1] * - m_grid_dims_default[2] ); - setDefaultReps(50); - - double cbrt_run_size = std::cbrt(getTargetProblemSize()); - - m_grid_dims[0] = cbrt_run_size; - m_grid_dims[1] = cbrt_run_size; - m_grid_dims[2] = cbrt_run_size; - m_halo_width = m_halo_width_default; - m_num_vars = m_num_vars_default; - - m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; - m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; - m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; - m_var_size = m_grid_plus_halo_dims[0] * - m_grid_plus_halo_dims[1] * - m_grid_plus_halo_dims[2] ; - - setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); - - setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); - setKernelsPerRep( 2 ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); - setFLOPsPerRep(0); - - setUsesFeature(Workgroup); - - setVariantDefined( Base_Seq ); - setVariantDefined( Lambda_Seq ); - setVariantDefined( RAJA_Seq ); - - setVariantDefined( Base_OpenMP ); - setVariantDefined( Lambda_OpenMP ); - setVariantDefined( RAJA_OpenMP ); - - setVariantDefined( Base_OpenMPTarget ); - setVariantDefined( RAJA_OpenMPTarget ); - - setVariantDefined( Base_CUDA ); - setVariantDefined( RAJA_CUDA ); - - setVariantDefined( Base_HIP ); - setVariantDefined( RAJA_HIP ); -} - -HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() -{ -} - -void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ - m_vars.resize(m_num_vars, nullptr); - for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); - auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); - - Real_ptr var = m_vars[v]; - - for (Index_type i = 0; i < m_var_size; i++) { - var[i] = i + v; - } - } - - m_pack_index_lists.resize(s_num_neighbors, nullptr); - m_pack_index_list_lengths.resize(s_num_neighbors, 0); - create_pack_lists(m_pack_index_lists, m_pack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_unpack_index_lists.resize(s_num_neighbors, nullptr); - m_unpack_index_list_lengths.resize(s_num_neighbors, 0); - create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_buffers.resize(s_num_neighbors, nullptr); - for (Index_type l = 0; l < s_num_neighbors; ++l) { - Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); - } -} - -void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) -{ - for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); - } -} - -void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx)) -{ - for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); - } - m_buffers.clear(); - - destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid); - m_unpack_index_list_lengths.clear(); - m_unpack_index_lists.clear(); - - destroy_pack_lists(m_pack_index_lists, s_num_neighbors, vid); - m_pack_index_list_lengths.clear(); - m_pack_index_lists.clear(); - - for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v], vid); - } - m_vars.clear(); -} - -namespace { - -struct Extent -{ - Index_type i_min; - Index_type i_max; - Index_type j_min; - Index_type j_max; - Index_type k_min; - Index_type k_max; -}; - -} - -// -// Function to generate index lists for packing. -// -void HALOEXCHANGE_FUSED::create_pack_lists( - std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector pack_index_list_extents(num_neighbors); - - // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + 
halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = pack_index_list_extents[l]; - - pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); - - Int_ptr pack_list = pack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type pack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - pack_list[list_idx] = pack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy packing index lists. -// -void HALOEXCHANGE_FUSED::destroy_pack_lists( - std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(pack_index_lists[l], vid); - } -} - -// -// Function to generate index lists for unpacking. 
-// -void HALOEXCHANGE_FUSED::create_unpack_lists( - std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector unpack_index_list_extents(num_neighbors); - - // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , 
halo_width}; - unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = unpack_index_list_extents[l]; - - unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - - Int_ptr unpack_list = unpack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type unpack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - unpack_list[list_idx] = unpack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy unpacking index lists. -// -void HALOEXCHANGE_FUSED::destroy_unpack_lists( - std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(unpack_index_lists[l], vid); - } -} - -} // end namespace apps -} // end namespace rajaperf diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 8fe91fbf7..a0142d1aa 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
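Both removed HALOEXCHANGE kernels ultimately reduce to a gather into a
contiguous message buffer and a scatter back out through an index list; the
PACK/UNPACK bodies, written out as standalone loops (illustrative types):

    void pack(double* buffer, const double* var, const int* list, int len)
    {
      for (int i = 0; i < len; ++i) { buffer[i] = var[list[i]]; }  // gather
    }

    void unpack(double* var, const double* buffer, const int* list, int len)
    {
      for (int i = 0; i < len; ++i) { var[list[i]] = buffer[i]; }  // scatter
    }
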
// @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for CUDA execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz @@ -91,11 +91,12 @@ void LTIMES::runCudaVariantImpl(VariantID vid) LTIMES_NBLOCKS_CUDA; constexpr size_t shmem = 0; - ltimes - <<>>(phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (ltimes), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); } stopTimer(); @@ -105,19 +106,24 @@ void LTIMES::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g, + Index_type m) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + }; + LTIMES_THREADS_PER_BLOCK_CUDA; LTIMES_NBLOCKS_CUDA; constexpr size_t shmem = 0; - ltimes_lam - <<>>(num_m, num_g, num_z, - [=] __device__ (Index_type z, Index_type g, Index_type m) { - for (Index_type d = 0; d < num_d; ++d ) { - LTIMES_BODY; - } - } - ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (ltimes_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_lambda ); } stopTimer(); @@ -144,14 +150,16 @@ void LTIMES::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(IDRange(0, num_d), - IZRange(0, num_z), - IGRange(0, num_g), - IMRange(0, num_m)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + res, [=] __device__ (ID d, IZ z, IG g, IM m) { - LTIMES_BODY_RAJA; - }); + LTIMES_BODY_RAJA; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 035bbc12d..949694d10 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
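The g_block_sz/z_block_sz macros above split the 1D tuning size into a 3D
thread block: m is pinned to 32 threads and the remaining block_size/32
threads are factored into the "squarest" pair, so block_size = 256 yields
256/32 = 8 -> (4, 2), i.e. a 32 x 4 x 2 block. A constexpr check with an
illustrative re-implementation of the factor-pair helper (the suite's
integer:: utilities are assumed to behave this way):

    constexpr size_t greater_of_squarest_pair(size_t n)
    {
      size_t lesser = 1;
      for (size_t d = 1; d * d <= n; ++d) {
        if (n % d == 0) { lesser = d; }
      }
      return n / lesser;                    // e.g. 8 -> 4 (pair is 2 x 4)
    }
    static_assert(greater_of_squarest_pair(256/32) == 4, "g_block_sz for 256");
    static_assert((256/32) / greater_of_squarest_pair(256/32) == 2, "z_block_sz for 256");
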
// @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for Hip execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz @@ -90,12 +90,12 @@ void LTIMES::runHipVariantImpl(VariantID vid) LTIMES_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (ltimes), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); } stopTimer(); @@ -105,21 +105,24 @@ void LTIMES::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g, + Index_type m) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + }; + LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto ltimes_lambda = - [=] __device__ (Index_type z, Index_type g, Index_type m) { - for (Index_type d = 0; d < num_d; ++d ) { - LTIMES_BODY; - } - }; - - hipLaunchKernelGGL((ltimes_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - num_m, num_g, num_z, ltimes_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (ltimes_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_lambda ); } stopTimer(); @@ -146,14 +149,16 @@ void LTIMES::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(IDRange(0, num_d), - IZRange(0, num_z), - IGRange(0, num_g), - IMRange(0, num_m)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + res, [=] __device__ (ID d, IZ z, IG g, IM m) { - LTIMES_BODY_RAJA; - }); + LTIMES_BODY_RAJA; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 93ce138ef..80c4a4a0e 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index 7ae4ee1e2..da7047d20 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp
index 33fd4b666..66503ed26 100644
--- a/src/apps/LTIMES-Seq.cpp
+++ b/src/apps/LTIMES-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp
new file mode 100644
index 000000000..541a132f7
--- /dev/null
+++ b/src/apps/LTIMES-Sycl.cpp
@@ -0,0 +1,116 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "LTIMES.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace apps
+{
+
+//
+// Define work-group shape for SYCL execution
+//
+#define m_wg_sz (32)
+#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz))
+#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz))
+
+template <size_t work_group_size>
+void LTIMES::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  LTIMES_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz),
+                              g_wg_sz * RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz),
+                              m_wg_sz * RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz));
+    sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type m = item.get_global_id(2);
+          Index_type g = item.get_global_id(1);
+          Index_type z = item.get_global_id(0);
+
+          if (m < num_m && g < num_g && z < num_z) {
+            for (Index_type d = 0; d < num_d; ++d) {
+              LTIMES_BODY;
+            }
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    LTIMES_VIEWS_RANGES_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<1, RAJA::sycl_global_2<z_wg_sz>,      //z
+            RAJA::statement::For<2, RAJA::sycl_global_1<g_wg_sz>,    //g
+              RAJA::statement::For<3, RAJA::sycl_global_0<m_wg_sz>,  //m
+                RAJA::statement::For<0, RAJA::seq_exec,              //d
+                  RAJA::statement::Lambda<0>
+                >
+              >
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_resource<EXEC_POL>(
+        RAJA::make_tuple(IDRange(0, num_d),
+                         IZRange(0, num_z),
+                         IGRange(0, num_g),
+                         IMRange(0, num_m)),
+        res,
+        [=] (ID d, IZ z, IG g, IM m) {
+          LTIMES_BODY_RAJA;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+     std::cout << "\n LTIMES : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(LTIMES, Sycl)
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp
index 0abb82d35..798d44715 100644
--- a/src/apps/LTIMES.cpp
+++ b/src/apps/LTIMES.cpp
@@ -1,5 +1,5 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -23,20 +23,15 @@ namespace apps LTIMES::LTIMES(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES, params) { - m_num_d_default = 64; - m_num_z_default = 488; - m_num_g_default = 32; - m_num_m_default = 25; + m_num_d = params.getLtimesNumD(); + m_num_g = params.getLtimesNumG(); + m_num_m = params.getLtimesNumM(); + Index_type num_z_default = std::max((Index_type{1000000} + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); - setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); + setDefaultProblemSize(m_num_d * m_num_g * num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / - (m_num_d_default * m_num_g_default), - Index_type(1) ); - m_num_g = m_num_g_default; - m_num_m = m_num_m_default; - m_num_d = m_num_d_default; + m_num_z = std::max((getTargetProblemSize() + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); m_philen = m_num_m * m_num_g * m_num_z; m_elllen = m_num_d * m_num_m; @@ -47,9 +42,11 @@ LTIMES::LTIMES(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // using total data size instead of writes and reads - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_philen + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_elllen + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_psilen ); + setBytesReadPerRep( 1*sizeof(Real_type) * m_philen + + 1*sizeof(Real_type) * m_elllen + + 1*sizeof(Real_type) * m_psilen ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d); checksum_scale_factor = 0.001 * @@ -77,6 +74,9 @@ LTIMES::LTIMES(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } LTIMES::~LTIMES() diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 2f3f0ca6d..0e74f187f 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
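The constructor change above swaps truncating division for a round-to-nearest form when sizing m_num_z: adding half the divisor before dividing rounds the quotient to the nearest integer (for non-negative operands) instead of always flooring it, so the actual problem size tracks the target size more closely. A standalone illustration:

    #include <cassert>

    // (a + b/2) / b rounds a/b to the nearest integer for a >= 0, b > 0.
    long long div_round_nearest(long long a, long long b)
    {
      return (a + b / 2) / b;
    }

    int main()
    {
      assert(div_round_nearest(10, 4) == 3); // 2.5  rounds up to 3
      assert(div_round_nearest( 9, 4) == 2); // 2.25 rounds down to 2
      assert(10 / 4 == 2);                   // plain division always floors
      return 0;
    }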
// @@ -116,28 +116,28 @@ class LTIMES : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_phidat; Real_ptr m_elldat; Real_ptr m_psidat; - Index_type m_num_d_default; - Index_type m_num_z_default; - Index_type m_num_g_default; - Index_type m_num_m_default; - Index_type m_num_d; Index_type m_num_z; Index_type m_num_g; diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index f12e5d131..9486f20e2 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for CUDA execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz @@ -90,11 +90,12 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) LTIMES_NOVIEW_NBLOCKS_CUDA; constexpr size_t shmem = 0; - ltimes_noview - <<>>(phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (ltimes_noview), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); } stopTimer(); @@ -104,19 +105,24 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; - LTIMES_NOVIEW_NBLOCKS_CUDA; - constexpr size_t shmem = 0; - - ltimes_noview_lam - <<>>(num_m, num_g, num_z, + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } - } - ); - cudaErrchk( cudaGetLastError() ); + }; + + LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; + LTIMES_NOVIEW_NBLOCKS_CUDA; + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( + (ltimes_noview_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_noview_lambda ); } stopTimer(); @@ -141,14 +147,17 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( 
RAJA::make_tuple(RAJA::RangeSegment(0, num_d), - RAJA::RangeSegment(0, num_z), - RAJA::RangeSegment(0, num_g), - RAJA::RangeSegment(0, num_m)), - res, - [=] __device__ (Index_type d, Index_type z, Index_type g, Index_type m) { - LTIMES_NOVIEW_BODY; - }); + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, + [=] __device__ (Index_type d, Index_type z, + Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 7252a5402..be6e2d756 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for Hip execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz @@ -90,12 +90,12 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) LTIMES_NOVIEW_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes_noview), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (ltimes_noview), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); } stopTimer(); @@ -105,22 +105,24 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; - LTIMES_NOVIEW_NBLOCKS_HIP; - constexpr size_t shmem = 0; - - auto ltimes_noview_lambda = + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } - }; + }; + + LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; + LTIMES_NOVIEW_NBLOCKS_HIP; + constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes_noview_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - num_m, num_g, num_z, - ltimes_noview_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (ltimes_noview_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_noview_lambda ); } stopTimer(); @@ -145,14 +147,17 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, num_d), - RAJA::RangeSegment(0, num_z), - RAJA::RangeSegment(0, num_g), - RAJA::RangeSegment(0, num_m)), - res, - [=] __device__ (Index_type d, Index_type z, Index_type g, Index_type m) { + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + 
RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, + [=] __device__ (Index_type d, Index_type z, + Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; - }); + } + ); } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index e9df87b83..900606076 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index 9a1f0bf06..1ffddaeaa 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index cd202004d..d4c6e4f41 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp new file mode 100644 index 000000000..d9b5cfaf6 --- /dev/null +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -0,0 +1,114 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
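The Lambda_CUDA and Lambda_HIP rewrites above bundle the loop body into a device lambda and pass it to the ltimes_noview_lam kernel. That kernel's definition sits outside these hunks; a plausible shape, with assumed names and an assumed m/g/z to x/y/z index mapping, is:

    #include <hip/hip_runtime.h>

    // Hypothetical sketch of a lambda-carrying kernel like ltimes_noview_lam:
    // a templated __global__ function applies the captured body at each
    // bounds-checked (z, g, m) index.
    template < typename Body >
    __global__ void ltimes_lam_sketch(int num_m, int num_g, int num_z,
                                      Body body)
    {
      int m = blockIdx.x * blockDim.x + threadIdx.x;
      int g = blockIdx.y * blockDim.y + threadIdx.y;
      int z = blockIdx.z * blockDim.z + threadIdx.z;
      if (m < num_m && g < num_g && z < num_z) {
        body(z, g, m); // body captures phidat/elldat/psidat by value
      }
    }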
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES_NOVIEW.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +// +// Define work-group shape for SYCL execution +// +#define m_wg_sz (32) +#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz)) +#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz)) + +template +void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + LTIMES_NOVIEW_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), + g_wg_sz * RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), + m_wg_sz * RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); + sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type m = item.get_global_id(2); + Index_type g = item.get_global_id(1); + Index_type z = item.get_global_id(0); + + if (m < num_m && g < num_g && z < num_z) { + for (Index_type d = 0; d < num_d; ++d) { + LTIMES_NOVIEW_BODY; + } + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<1, RAJA::sycl_global_2, //z + RAJA::statement::For<2, RAJA::sycl_global_1, //g + RAJA::statement::For<3, RAJA::sycl_global_0, //m + RAJA::statement::For<0, RAJA::seq_exec, //d + RAJA::statement::Lambda<0> + > + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, + [=] (Index_type d, Index_type z, Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n LTIMES_NOVIEW : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(LTIMES_NOVIEW, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index a106d5418..0e675d487 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
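The Base_SYCL setup above pads each global extent up to a multiple of its work-group extent, because a SYCL nd_range requires the global range to divide evenly by the local range; the in-kernel bounds check then masks the padding work-items. The padding arithmetic, stated standalone:

    #include <cstddef>

    constexpr size_t divide_ceiling(size_t n, size_t d)
    {
      return (n + d - 1) / d; // ceiling division for positive integers
    }

    constexpr size_t padded_extent(size_t n, size_t wg)
    {
      return wg * divide_ceiling(n, wg); // smallest multiple of wg >= n
    }

    static_assert(padded_extent(25, 32) == 32,  "num_m = 25, m_wg_sz = 32");
    static_assert(padded_extent(100, 8) == 104, "13 groups of 8");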
// @@ -23,20 +23,15 @@ namespace apps LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES_NOVIEW, params) { - m_num_d_default = 64; - m_num_z_default = 488; - m_num_g_default = 32; - m_num_m_default = 25; + m_num_d = params.getLtimesNumD(); + m_num_g = params.getLtimesNumG(); + m_num_m = params.getLtimesNumM(); + Index_type num_z_default = std::max((Index_type{1000000} + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); - setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); + setDefaultProblemSize(m_num_d * m_num_g * num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / - (m_num_d_default * m_num_g_default), - Index_type(1) ); - m_num_g = m_num_g_default; - m_num_m = m_num_m_default; - m_num_d = m_num_d_default; + m_num_z = std::max((getTargetProblemSize() + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); m_philen = m_num_m * m_num_g * m_num_z; m_elllen = m_num_d * m_num_m; @@ -47,9 +42,11 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // using total data size instead of writes and reads - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_philen + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_elllen + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_psilen ); + setBytesReadPerRep( 1*sizeof(Real_type) * m_philen + + 1*sizeof(Real_type) * m_elllen + + 1*sizeof(Real_type) * m_psilen ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d); checksum_scale_factor = 0.001 * @@ -76,6 +73,9 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } LTIMES_NOVIEW::~LTIMES_NOVIEW() diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 96a296366..4829b8171 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
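The accounting change above splits the old combined setBytesPerRep into explicit read and write tallies: phi accumulates into itself, so it counts as both read and written, while ell and psi are read-only and nothing is written atomically. Restated as a small sketch, with double standing in for Real_type:

    #include <cstddef>

    struct BytesPerRep { size_t read; size_t written; };

    // Traffic model for one LTIMES / LTIMES_NOVIEW rep: phi is
    // read-modify-write, ell and psi are read-only.
    BytesPerRep ltimes_bytes(size_t philen, size_t elllen, size_t psilen)
    {
      const size_t real = sizeof(double);
      return { (philen + elllen + psilen) * real,  // bytes read
               philen * real };                    // bytes written
    }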
// @@ -66,28 +66,28 @@ class LTIMES_NOVIEW : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_phidat; Real_ptr m_elldat; Real_ptr m_psidat; - Index_type m_num_d_default; - Index_type m_num_z_default; - Index_type m_num_g_default; - Index_type m_num_m_default; - Index_type m_num_d; Index_type m_num_z; Index_type m_num_g; diff --git a/src/apps/MASS3DEA-Cuda.cpp b/src/apps/MASS3DEA-Cuda.cpp index 87d918b11..649fd5b01 100644 --- a/src/apps/MASS3DEA-Cuda.cpp +++ b/src/apps/MASS3DEA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -69,15 +69,16 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Mass3DEA<<>>(B, D, M); + dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + constexpr size_t shmem = 0; - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (Mass3DEA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, M ); } stopTimer(); diff --git a/src/apps/MASS3DEA-Hip.cpp b/src/apps/MASS3DEA-Hip.cpp index 7184694e3..2eeabadeb 100644 --- a/src/apps/MASS3DEA-Hip.cpp +++ b/src/apps/MASS3DEA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -69,17 +69,16 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DEA), dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - B, D, M); + dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + constexpr size_t shmem = 0; - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (Mass3DEA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, M ); } stopTimer(); diff --git a/src/apps/MASS3DEA-OMP.cpp b/src/apps/MASS3DEA-OMP.cpp index 7dd1ab122..2b77eeb6e 100644 --- a/src/apps/MASS3DEA-OMP.cpp +++ b/src/apps/MASS3DEA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-OMPTarget.cpp b/src/apps/MASS3DEA-OMPTarget.cpp index b3d8aa75f..6f41914ab 100644 --- a/src/apps/MASS3DEA-OMPTarget.cpp +++ b/src/apps/MASS3DEA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-Seq.cpp b/src/apps/MASS3DEA-Seq.cpp index bc906f0f6..f3b0cfc99 100644 --- a/src/apps/MASS3DEA-Seq.cpp +++ b/src/apps/MASS3DEA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-Sycl.cpp b/src/apps/MASS3DEA-Sycl.cpp new file mode 100644 index 000000000..a2dfd87f3 --- /dev/null +++ b/src/apps/MASS3DEA-Sycl.cpp @@ -0,0 +1,196 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
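In both MASS3DEA launch rewrites above, the explicit dim3 nblocks(NE) disappears and NE is passed directly as the grid argument. That works because dim3 has a non-explicit constructor with defaulted y and z components, so a scalar converts implicitly to a 1D grid:

    #include <hip/hip_runtime.h>
    #include <cassert>

    int main()
    {
      dim3 grid = 512; // scalar -> dim3(512, 1, 1)
      assert(grid.x == 512 && grid.y == 1 && grid.z == 1);
      return 0;
    }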
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MASS3DEA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include <iostream> + +namespace rajaperf { +namespace apps { + +template < size_t work_group_size > +void MASS3DEA::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MASS3DEA_DATA_SETUP; + + switch (vid) { + + case Base_SYCL: { + + const ::sycl::range<3> workGroupSize(MEA_Q1D, MEA_Q1D, MEA_Q1D); + const ::sycl::range<3> gridSize(MEA_Q1D,MEA_Q1D,MEA_Q1D*NE); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + constexpr size_t shmem = 0; + qu->submit([&](cl::sycl::handler& h) { + + ::sycl::local_accessor<double, 2> s_B(::sycl::range<2>(MEA_Q1D,MEA_D1D),h); + ::sycl::local_accessor<double, 3> s_D(::sycl::range<3>(MEA_Q1D,MEA_Q1D,MEA_Q1D),h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (cl::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(2); + + SYCL_FOREACH_THREAD(iz, 0, 1) { + SYCL_FOREACH_THREAD(d, 2, MEA_D1D) { + SYCL_FOREACH_THREAD(q, 1, MEA_Q1D) { + MASS3DEA_1 + } + } + } + + //not needed as we dynamically allocate shared memory in sycl + //MASS3DEA_2 + + SYCL_FOREACH_THREAD(k1, 2, MEA_Q1D) { + SYCL_FOREACH_THREAD(k2, 1, MEA_Q1D) { + SYCL_FOREACH_THREAD(k3, 0, MEA_Q1D) { + MASS3DEA_3 + } + } + } + + itm.barrier(::sycl::access::fence_space::local_space); + + SYCL_FOREACH_THREAD(i1, 2, MEA_D1D) { + SYCL_FOREACH_THREAD(i2, 1, MEA_D1D) { + SYCL_FOREACH_THREAD(i3, 0, MEA_D1D) { + MASS3DEA_4 + } + } + } + + }); + }); + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy<RAJA::sycl_launch_t<async>>; + + using outer_x = RAJA::LoopPolicy<RAJA::sycl_group_2_direct>; + + using inner_x = RAJA::LoopPolicy<RAJA::sycl_local_2_direct>; + + using inner_y = RAJA::LoopPolicy<RAJA::sycl_local_1_direct>; + + using inner_z = RAJA::LoopPolicy<RAJA::sycl_local_0_direct>; + + constexpr size_t shmem = (MEA_Q1D*MEA_D1D + MEA_Q1D*MEA_Q1D*MEA_Q1D)*sizeof(double); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch<launch_policy>( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + double * s_B_ptr = ctx.getSharedMemory<double>(MEA_Q1D*MEA_D1D); + double * s_D_ptr = ctx.getSharedMemory<double>(MEA_Q1D*MEA_Q1D*MEA_Q1D); + + double (*s_B)[MEA_D1D] = (double (*)[MEA_D1D]) s_B_ptr; + double (*s_D)[MEA_Q1D][MEA_Q1D] = (double (*)[MEA_Q1D][MEA_Q1D]) s_D_ptr; + + RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1), + [&](int ) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int d) { + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int q) { + MASS3DEA_1 + } + ); // RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + //not needed as we dynamically allocate shared memory in sycl + //MASS3DEA_2 + + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k1) { + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k2) { + RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k3) { + MASS3DEA_3 + } + ); // RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i1) { + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i2) { + RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i3) { + MASS3DEA_4 + } + ); // 
RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DEA : Unknown Sycl variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DEA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index d6239222f..9689c35ae 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,20 +29,21 @@ MASS3DEA::MASS3DEA(const RunParams& params) setDefaultReps(1); const int ea_mat_entries = MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D; - - m_NE = std::max(getTargetProblemSize()/(ea_mat_entries), Index_type(1)); + + m_NE = std::max((getTargetProblemSize() + (ea_mat_entries)/2) / (ea_mat_entries), Index_type(1)); setActualProblemSize( m_NE*ea_mat_entries); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( MEA_Q1D*MEA_D1D*sizeof(Real_type) + // B - MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE*sizeof(Real_type) + // D - ea_mat_entries*m_NE*sizeof(Real_type) ); // M_e + setBytesReadPerRep( 1*sizeof(Real_type) * MEA_Q1D*MEA_D1D + // B + 1*sizeof(Real_type) * MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE ); // D + setBytesWrittenPerRep( 1*sizeof(Real_type) * ea_mat_entries*m_NE ); // M_e + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * 7 * ea_mat_entries); - + setUsesFeature(Launch); setVariantDefined( Base_Seq ); @@ -57,6 +58,9 @@ MASS3DEA::MASS3DEA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } MASS3DEA::~MASS3DEA() diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index df029299e..7c0ea6e02 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
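The RAJA_SYCL variant above sizes one dynamic shared-memory block at launch (MEA_Q1D*MEA_D1D + MEA_Q1D*MEA_Q1D*MEA_Q1D doubles) and carves it with successive ctx.getSharedMemory calls, then views each flat slice as a fixed-shape array. A host-side sketch of that carving pattern, with a plain static buffer standing in for team shared memory and illustrative sizes:

    #include <cstddef>

    constexpr int Q1D = 4, D1D = 3;

    // Hands out successive slices of one flat allocation, in the spirit
    // of RAJA::LaunchContext::getSharedMemory (an assumption, not RAJA's
    // implementation).
    struct Arena {
      double* base;
      std::size_t used;
      double* get(std::size_t n) { double* p = base + used; used += n; return p; }
    };

    int main()
    {
      static double shmem[Q1D*D1D + Q1D*Q1D*Q1D];
      Arena arena{shmem, 0};

      double* s_B_ptr = arena.get(Q1D * D1D);
      double* s_D_ptr = arena.get(Q1D * Q1D * Q1D);

      // Each fixed-shape view must be built from its own slice:
      // s_B from s_B_ptr and s_D from s_D_ptr, or the arrays alias.
      double (*s_B)[D1D]      = (double (*)[D1D]) s_B_ptr;
      double (*s_D)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) s_D_ptr;

      s_B[0][0] = 1.0;
      s_D[Q1D-1][Q1D-1][Q1D-1] = 2.0;
      return 0;
    }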
// @@ -104,11 +104,9 @@ #define MASS3DEA_1 s_B[q][d] = B_MEA_(q, d); #define MASS3DEA_2 \ - double(*l_B)[MEA_D1D] = (double(*)[MEA_D1D])s_B; \ RAJA_TEAM_SHARED double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; #define MASS3DEA_2_CPU \ - double(*l_B)[MEA_D1D] = (double(*)[MEA_D1D])s_B; \ double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; #define MASS3DEA_3 s_D[k1][k2][k3] = D_MEA_(k1, k2, k3, e); @@ -123,9 +121,9 @@ for (int k2 = 0; k2 < MEA_Q1D; ++k2) { \ for (int k3 = 0; k3 < MEA_Q1D; ++k3) { \ \ - val += l_B[k1][i1] * l_B[k1][j1] * l_B[k2][i2] \ - * l_B[k2][j2] * \ - l_B[k3][i3] * l_B[k3][j3] * s_D[k1][k2][k3]; \ + val += s_B[k1][i1] * s_B[k1][j1] * s_B[k2][i2] \ + * s_B[k2][j2] * \ + s_B[k3][i3] * s_B[k3][j3] * s_D[k1][k2][k3]; \ } \ } \ } \ @@ -154,16 +152,23 @@ class MASS3DEA : public KernelBase { void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template void runCudaVariantImpl(VariantID vid); - template void runHipVariantImpl(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template + void runCudaVariantImpl(VariantID vid); + template + void runHipVariantImpl(VariantID vid); + template + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = MEA_D1D * MEA_D1D * MEA_D1D; using gpu_block_sizes_type = - gpu_block_size::list_type; + integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index a8f769d6a..60092ef7f 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -99,15 +99,16 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Mass3DPA<<>>(B, Bt, D, X, Y); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); + constexpr size_t shmem = 0; - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (Mass3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, Bt, D, X, Y ); } stopTimer(); diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index c9d600136..1fbd0dea9 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -99,17 +99,16 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DPA), dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - B, Bt, D, X, Y); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); + constexpr size_t shmem = 0; - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (Mass3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, Bt, D, X, Y ); } stopTimer(); diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index f2e122fed..4c6b2867c 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index e4cc02a4f..d74c14641 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 39087834d..e22068169 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Sycl.cpp b/src/apps/MASS3DPA-Sycl.cpp new file mode 100644 index 000000000..7d65034a7 --- /dev/null +++ b/src/apps/MASS3DPA-Sycl.cpp @@ -0,0 +1,316 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t work_group_size > +void MASS3DPA::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MASS3DPA_DATA_SETUP; + + const ::sycl::range<3> workGroupSize(1, MPA_Q1D, MPA_Q1D); + const ::sycl::range<3> gridSize(1, MPA_Q1D, MPA_Q1D*NE); + + switch (vid) { + + case Base_SYCL: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&](cl::sycl::handler& h) { + + constexpr int MQ1 = MPA_Q1D; + constexpr int MD1 = MPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + auto sDQ_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1 * MD1), h); + auto sm0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + auto sm1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (cl::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(2); + + double *sDQ = sDQ_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1 = sm1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; + double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; + + double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; + double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; + double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; + double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; + double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; + double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + + SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D){ + MASS3DPA_1 + } + SYCL_FOREACH_THREAD(dx, 2, MPA_Q1D) { + MASS3DPA_2 + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + MASS3DPA_3 + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + MASS3DPA_4 + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + MASS3DPA_5 + } + } + + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(d, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(q, 2, MPA_Q1D) { + MASS3DPA_6 + } + } + + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + MASS3DPA_7 + } + } + itm.barrier(::sycl::access::fence_space::local_space); + + SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + MASS3DPA_8 + } + } + + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + MASS3DPA_9 + } + } + + }); + }); + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy; + + 
using inner_y = RAJA::LoopPolicy<RAJA::sycl_local_1_direct>; + + //Calculate the amount of shared memory needed + size_t shmem = 0; + { + constexpr int MQ1 = MPA_Q1D; + constexpr int MD1 = MPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + constexpr int no_mats = 2; + shmem += (MQ1 * MD1 + no_mats * MDQ * MDQ * MDQ) * sizeof(double); + } + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch<launch_policy>( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MPA_Q1D, MPA_Q1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + //Redefine inside the lambda to keep consistent with the base version + constexpr int MQ1 = MPA_Q1D; + constexpr int MD1 = MPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + double *sDQ = ctx.getSharedMemory<double>(MQ1 * MD1); + double *sm0 = ctx.getSharedMemory<double>(MDQ * MDQ * MDQ); + double *sm1 = ctx.getSharedMemory<double>(MDQ * MDQ * MDQ); + + double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; + double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; + + double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; + double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; + double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; + double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; + double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; + double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dx) { + MASS3DPA_1 + } + ); // RAJA::loop + + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int dx) { + MASS3DPA_2 + } + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qx) { + MASS3DPA_3 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qx) { + MASS3DPA_4 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qx) { + MASS3DPA_5 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int d) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int q) { + MASS3DPA_6 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dx) { + MASS3DPA_7 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dx) { + MASS3DPA_8 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dx) { + MASS3DPA_9 + } + ); // RAJA::loop + } + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA : Unknown Sycl variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + 
+#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 1c99e9f73..f60e64c3e 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,18 +28,18 @@ MASS3DPA::MASS3DPA(const RunParams& params) setDefaultProblemSize(m_NE_default*MPA_Q1D*MPA_Q1D*MPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (MPA_Q1D*MPA_Q1D*MPA_Q1D)/2) / (MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); setActualProblemSize( m_NE*MPA_Q1D*MPA_Q1D*MPA_Q1D ); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( MPA_Q1D*MPA_D1D*sizeof(Real_type) + - MPA_Q1D*MPA_D1D*sizeof(Real_type) + - MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE*sizeof(Real_type) + - MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) + - MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * MPA_Q1D*MPA_D1D + // B, Bt + 2*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X, Y + 1*sizeof(Real_type) * MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE ); // D + setBytesWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D + 2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D + @@ -61,6 +61,9 @@ MASS3DPA::MASS3DPA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } MASS3DPA::~MASS3DPA() diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 7365fa011..0e11e234b 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -363,17 +363,22 @@ class MASS3DPA : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/MATVEC_3D_STENCIL-Cuda.cpp b/src/apps/MATVEC_3D_STENCIL-Cuda.cpp new file mode 100644 index 000000000..e5ef75f08 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-Cuda.cpp @@ -0,0 +1,204 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void matvec_3d(Real_ptr b, + Real_ptr xdbl, + Real_ptr xdbc, + Real_ptr xdbr, + Real_ptr xdcl, + Real_ptr xdcc, + Real_ptr xdcr, + Real_ptr xdfl, + Real_ptr xdfc, + Real_ptr xdfr, + Real_ptr xcbl, + Real_ptr xcbc, + Real_ptr xcbr, + Real_ptr xccl, + Real_ptr xccc, + Real_ptr xccr, + Real_ptr xcfl, + Real_ptr xcfc, + Real_ptr xcfr, + Real_ptr xubl, + Real_ptr xubc, + Real_ptr xubr, + Real_ptr xucl, + Real_ptr xucc, + Real_ptr xucr, + Real_ptr xufl, + Real_ptr xufc, + Real_ptr xufr, + Real_ptr dbl, + Real_ptr dbc, + Real_ptr dbr, + Real_ptr dcl, + Real_ptr dcc, + Real_ptr dcr, + Real_ptr dfl, + Real_ptr dfc, + Real_ptr dfr, + Real_ptr cbl, + Real_ptr cbc, + Real_ptr cbr, + Real_ptr ccl, + Real_ptr ccc, + Real_ptr ccr, + Real_ptr cfl, + Real_ptr cfc, + Real_ptr cfr, + Real_ptr ubl, + Real_ptr ubc, + Real_ptr ubr, + Real_ptr ucl, + Real_ptr ucc, + Real_ptr ucr, + Real_ptr ufl, + Real_ptr ufc, + Real_ptr ufr, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + } +} + + +template < size_t block_size > +void MATVEC_3D_STENCIL::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getCudaResource()}; + + MATVEC_3D_STENCIL_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (matvec_3d), + grid_size, block_size, + shmem, res.get_stream(), + b, + xdbl, + xdbc, + xdbr, + xdcl, + xdcc, + xdcr, + xdfl, + xdfc, + xdfr, 
+ xcbl, + xcbc, + xcbr, + xccl, + xccc, + xccr, + xcfl, + xcfc, + xcfr, + xubl, + xubc, + xubr, + xucl, + xucc, + xucr, + xufl, + xufc, + xufr, + dbl, + dbc, + dbr, + dcl, + dcc, + dcr, + dfl, + dfc, + dfr, + cbl, + cbc, + cbr, + ccl, + ccc, + ccr, + cfl, + cfc, + cfr, + ubl, + ubc, + ubr, + ucl, + ucc, + ucr, + ufl, + ufc, + ufr, + real_zones, + ibegin, iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( res, + zones, [=] __device__ (Index_type i) { + MATVEC_3D_STENCIL_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D_STENCIL : Unknown Cuda variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Cuda) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/MATVEC_3D_STENCIL-Hip.cpp b/src/apps/MATVEC_3D_STENCIL-Hip.cpp new file mode 100644 index 000000000..a24757cb8 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-Hip.cpp @@ -0,0 +1,204 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void matvec_3d(Real_ptr b, + Real_ptr xdbl, + Real_ptr xdbc, + Real_ptr xdbr, + Real_ptr xdcl, + Real_ptr xdcc, + Real_ptr xdcr, + Real_ptr xdfl, + Real_ptr xdfc, + Real_ptr xdfr, + Real_ptr xcbl, + Real_ptr xcbc, + Real_ptr xcbr, + Real_ptr xccl, + Real_ptr xccc, + Real_ptr xccr, + Real_ptr xcfl, + Real_ptr xcfc, + Real_ptr xcfr, + Real_ptr xubl, + Real_ptr xubc, + Real_ptr xubr, + Real_ptr xucl, + Real_ptr xucc, + Real_ptr xucr, + Real_ptr xufl, + Real_ptr xufc, + Real_ptr xufr, + Real_ptr dbl, + Real_ptr dbc, + Real_ptr dbr, + Real_ptr dcl, + Real_ptr dcc, + Real_ptr dcr, + Real_ptr dfl, + Real_ptr dfc, + Real_ptr dfr, + Real_ptr cbl, + Real_ptr cbc, + Real_ptr cbr, + Real_ptr ccl, + Real_ptr ccc, + Real_ptr ccr, + Real_ptr cfl, + Real_ptr cfc, + Real_ptr cfr, + Real_ptr ubl, + Real_ptr ubc, + Real_ptr ubr, + Real_ptr ucl, + Real_ptr ucc, + Real_ptr ucr, + Real_ptr ufl, + Real_ptr ufc, + Real_ptr ufr, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + } +} + + +template < size_t block_size > +void MATVEC_3D_STENCIL::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getHipResource()}; + + MATVEC_3D_STENCIL_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (matvec_3d), + 
grid_size, block_size, + shmem, res.get_stream(), + b, + xdbl, + xdbc, + xdbr, + xdcl, + xdcc, + xdcr, + xdfl, + xdfc, + xdfr, + xcbl, + xcbc, + xcbr, + xccl, + xccc, + xccr, + xcfl, + xcfc, + xcfr, + xubl, + xubc, + xubr, + xucl, + xucc, + xucr, + xufl, + xufc, + xufr, + dbl, + dbc, + dbr, + dcl, + dcc, + dcr, + dfl, + dfc, + dfr, + cbl, + cbc, + cbr, + ccl, + ccc, + ccr, + cfl, + cfc, + cfr, + ubl, + ubc, + ubr, + ucl, + ucc, + ucr, + ufl, + ufc, + ufr, + real_zones, + ibegin, iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( res, + zones, [=] __device__ (Index_type i) { + MATVEC_3D_STENCIL_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D_STENCIL : Unknown Hip variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Hip) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/MATVEC_3D_STENCIL-OMP.cpp b/src/apps/MATVEC_3D_STENCIL-OMP.cpp new file mode 100644 index 000000000..6365aed70 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-OMP.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void MATVEC_3D_STENCIL::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_STENCIL_DATA_SETUP; + + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto matvec_3d_lam = [=](Index_type ii) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + matvec_3d_lam(ii); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + camp::resources::Resource working_res{camp::resources::Host::get_default()}; + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); + + auto matvec_3d_lam = [=](Index_type i) { + MATVEC_3D_STENCIL_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + zones, matvec_3d_lam); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MATVEC_3D_STENCIL : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp b/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp new file mode 
100644 index 000000000..09a3093d4 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp @@ -0,0 +1,87 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void MATVEC_3D_STENCIL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_STENCIL_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(b, \ + dbl, dbc, dbr, dcl, dcc, dcr, dfl, dfc, dfr, \ + xdbl, xdbc, xdbr, xdcl, xdcc, xdcr, xdfl, xdfc, xdfr, \ + cbl, cbc, cbr, ccl, ccc, ccr, cfl, cfc, cfr, \ + xcbl, xcbc, xcbr, xccl, xccc, xccr, xcfl, xcfc, xcfr, \ + ubl, ubc, ubr, ucl, ucc, ucr, ufl, ufc, ufr, \ + xubl, xubc, xubr, xucl, xucc, xucr, xufl, xufc, xufr, \ + real_zones) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + } + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + camp::resources::Resource working_res{camp::resources::Omp::get_default()}; + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + zones, [=](Index_type i) { + MATVEC_3D_STENCIL_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D_STENCIL : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/MATVEC_3D_STENCIL-Seq.cpp b/src/apps/MATVEC_3D_STENCIL-Seq.cpp new file mode 100644 index 000000000..795a01e19 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-Seq.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace apps +{ + + +void MATVEC_3D_STENCIL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = 0; +  const Index_type iend = m_domain->n_real_zones; + +  MATVEC_3D_STENCIL_DATA_SETUP; + +  switch ( vid ) { + +    case Base_Seq : { + +      startTimer(); +      for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +        for (Index_type ii = ibegin ; ii < iend ; ++ii ) { +          MATVEC_3D_STENCIL_BODY_INDEX; +          MATVEC_3D_STENCIL_BODY; +        } + +      } +      stopTimer(); + +      break; +    } + +#if defined(RUN_RAJA_SEQ) +    case Lambda_Seq : { + +      auto matvec_3d_lam = [=](Index_type ii) { +                             MATVEC_3D_STENCIL_BODY_INDEX; +                             MATVEC_3D_STENCIL_BODY; +                           }; + +      startTimer(); +      for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +        for (Index_type ii = ibegin ; ii < iend ; ++ii ) { +          matvec_3d_lam(ii); +        } + +      } +      stopTimer(); + +      break; +    } + +    case RAJA_Seq : { + +      camp::resources::Resource working_res{camp::resources::Host::get_default()}; +      RAJA::TypedListSegment<Index_type> zones(real_zones, iend, +                                               working_res, RAJA::Unowned); + +      auto matvec_3d_lam = [=](Index_type i) { +                             MATVEC_3D_STENCIL_BODY; +                           }; + +      startTimer(); +      for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +        RAJA::forall<RAJA::seq_exec>(zones, matvec_3d_lam); + +      } +      stopTimer(); + +      break; +    } +#endif // RUN_RAJA_SEQ + +    default : { +      getCout() << "\n  MATVEC_3D_STENCIL : Unknown variant id = " << vid << std::endl; +    } + +  } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL-Sycl.cpp b/src/apps/MATVEC_3D_STENCIL-Sycl.cpp new file mode 100644 index 000000000..c6110f2d8 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-Sycl.cpp @@ -0,0 +1,89 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
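All RAJA variants of this kernel iterate a `TypedListSegment` built over `real_zones`, so the lambda receives the gathered zone index directly and the explicit `MATVEC_3D_STENCIL_BODY_INDEX` lookup disappears. A minimal sketch of that idiom with a host-resident index list (illustrative names, not the suite's setup):

  #include "RAJA/RAJA.hpp"

  // idx holds the zone indices to visit; RAJA::Unowned means the segment
  // aliases idx, so the caller keeps ownership and must keep it alive.
  void forall_over_list(double* b, const int* idx, int n)
  {
    camp::resources::Resource res{camp::resources::Host::get_default()};
    RAJA::TypedListSegment<int> zones(idx, n, res, RAJA::Unowned);
    RAJA::forall<RAJA::seq_exec>(zones, [=](int i) {
      b[i] += 1.0;   // the loop body sees the real zone index i
    });
  }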
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "AppsData.hpp" + +#include <iostream> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template <size_t work_group_size> +void MATVEC_3D_STENCIL::runSyclVariantImpl(VariantID vid) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = 0; +  const Index_type iend = m_domain->n_real_zones; + +  auto res{getSyclResource()}; +  auto qu = res.get_queue(); + +  MATVEC_3D_STENCIL_DATA_SETUP; + +  if ( vid == Base_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), +                       [=] (sycl::nd_item<1> item) { + +          Index_type ii = item.get_global_id(0); +          Index_type i = ii + ibegin; +          if (i < iend) { +            MATVEC_3D_STENCIL_BODY_INDEX; +            MATVEC_3D_STENCIL_BODY; +          } + +        }); +      }); + +    } +    stopTimer(); + +  } else if ( vid == RAJA_SYCL ) { + +    RAJA::TypedListSegment<Index_type> zones(real_zones, iend, +                                             res, RAJA::Unowned); + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(res, +        zones, [=](Index_type i) { +        MATVEC_3D_STENCIL_BODY; +      }); + +    } +    stopTimer(); + +  } else { +     std::cout << "\n  MATVEC_3D_STENCIL : Unknown Sycl variant id = " << vid << std::endl; +  } + +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif  // RAJA_ENABLE_SYCL diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp new file mode 100644 index 000000000..2b8cb6978 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL.cpp @@ -0,0 +1,189 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
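The Base_SYCL variant pads the launch to a multiple of the work-group size and guards the tail, since `sycl::nd_range` requires the global size to divide evenly by the local size. The arithmetic in isolation: with iend = 1000 and work_group_size = 256, global_size = 256 * 4 = 1024, and work-items 1000..1023 fail the `i < iend` test and do nothing. A reduced sketch (illustrative kernel body, not the suite's):

  #include <sycl/sycl.hpp>

  void launch(sycl::queue& q, double* b, size_t iend, size_t wg)
  {
    const size_t global_size = wg * ((iend + wg - 1) / wg);  // round up
    q.submit([&](sycl::handler& h) {
      h.parallel_for(sycl::nd_range<1>(global_size, wg),
                     [=](sycl::nd_item<1> item) {
        const size_t i = item.get_global_id(0);
        if (i < iend) { b[i] += 1.0; }   // mask the padded tail
      });
    });
  }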
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" +#include "common/DataUtils.hpp" + +#include <cmath> + + +namespace rajaperf +{ +namespace apps +{ + + +MATVEC_3D_STENCIL::MATVEC_3D_STENCIL(const RunParams& params) +  : KernelBase(rajaperf::Apps_MATVEC_3D_STENCIL, params) +{ +  setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct +  setDefaultReps(100); + +  Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; +  m_domain = new ADomain(rzmax, /* ndims = */ 3); + +  m_zonal_array_length = m_domain->lpz+1; + +  setActualProblemSize( m_domain->n_real_zones ); + +  setItsPerRep( getActualProblemSize() ); +  setKernelsPerRep(1); + +  // touched data size, not actual number of stores and loads +  const size_t ilen = m_domain->imax - m_domain->imin; +  const size_t jlen = m_domain->jmax - m_domain->jmin; +  const size_t klen = m_domain->kmax - m_domain->kmin; +  auto get_size_extra = [&](size_t iextra, size_t jextra, size_t kextra) { +    return (ilen + iextra) * (jlen + jextra) * (klen + kextra); +  }; +  auto get_size_matrix = [&](size_t ishift, size_t jshift, size_t kshift) { +    // get the used size of matrix coefficient allocations +    return get_size_extra(0,0,0) +            // real zones +           (get_size_extra(0,0,0) - (ilen - ishift) * // plus some extra from the +                                    (jlen - jshift) * // edges based on the shift +                                    (klen - kshift)); +  }; + +  const size_t b_accessed = get_size_extra(0, 0, 0); +  const size_t x_accessed = get_size_extra(2, 2, 2) ; +  const size_t m_accessed = get_size_matrix(0, 0, 0) + +                            get_size_matrix(1, 0, 0) + +                            get_size_matrix(1, 1, 0) + +                            get_size_matrix(0, 1, 0) + +                            get_size_matrix(1, 1, 0) + +                            get_size_matrix(1, 1, 1) + +                            get_size_matrix(0, 1, 1) + +                            get_size_matrix(1, 1, 1) + +                            get_size_matrix(1, 0, 1) + +                            get_size_matrix(0, 0, 1) + +                            get_size_matrix(1, 0, 1) + +                            get_size_matrix(1, 1, 1) + +                            get_size_matrix(0, 1, 1) + +                            get_size_matrix(1, 1, 1) ; +  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + +                      1*sizeof(Real_type) * x_accessed + +                      1*sizeof(Real_type) * m_accessed ); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * b_accessed ); +  setBytesAtomicModifyWrittenPerRep( 0 ); + +  const size_t multiplies = 27; +  const size_t adds = 26; +  setFLOPsPerRep((multiplies + adds) * getItsPerRep()); + +  checksum_scale_factor = 1.0 * +              ( static_cast<Checksum_type>(getDefaultProblemSize()) / +                getActualProblemSize() ); + +  setUsesFeature(Forall); + +  setVariantDefined( Base_Seq ); +  setVariantDefined( Lambda_Seq ); +  setVariantDefined( RAJA_Seq ); + +  setVariantDefined( Base_OpenMP ); +  setVariantDefined( Lambda_OpenMP ); +  setVariantDefined( RAJA_OpenMP ); + +  setVariantDefined( Base_OpenMPTarget ); +  setVariantDefined( RAJA_OpenMPTarget ); + +  setVariantDefined( Base_CUDA ); +  setVariantDefined( RAJA_CUDA ); + +  setVariantDefined( Base_HIP ); +  setVariantDefined( RAJA_HIP ); + +  setVariantDefined( Base_SYCL ); +  setVariantDefined( RAJA_SYCL ); +} + +MATVEC_3D_STENCIL::~MATVEC_3D_STENCIL() +{ +  delete m_domain; +} + +void MATVEC_3D_STENCIL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +  allocAndInitDataConst(m_b, m_zonal_array_length, 0.0, vid); +  allocAndInitData(m_x, m_zonal_array_length, vid); + +  allocAndInitData(m_matrix.dbl, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dbc, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dbr, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dcl,
m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dcc, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dcr, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dfl, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dfc, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dfr, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.cbl, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.cbc, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.cbr, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.ccl, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.ccc, m_zonal_array_length, vid); +  m_matrix.ccr = m_matrix.ccl + 1                               ; +  m_matrix.cfl = m_matrix.cbr - 1              + m_domain->jp   ; +  m_matrix.cfc = m_matrix.cbc                  + m_domain->jp   ; +  m_matrix.cfr = m_matrix.cbl + 1              + m_domain->jp   ; +  m_matrix.ubl = m_matrix.dfr - 1 - m_domain->jp + m_domain->kp ; +  m_matrix.ubc = m_matrix.dfc    - m_domain->jp + m_domain->kp  ; +  m_matrix.ubr = m_matrix.dfl + 1 - m_domain->jp + m_domain->kp ; +  m_matrix.ucl = m_matrix.dcr - 1              + m_domain->kp   ; +  m_matrix.ucc = m_matrix.dcc                  + m_domain->kp   ; +  m_matrix.ucr = m_matrix.dcl + 1              + m_domain->kp   ; +  m_matrix.ufl = m_matrix.dbr - 1 + m_domain->jp + m_domain->kp ; +  m_matrix.ufc = m_matrix.dbc    + m_domain->jp + m_domain->kp  ; +  m_matrix.ufr = m_matrix.dbl + 1 + m_domain->jp + m_domain->kp ; + +  allocAndInitDataConst(m_real_zones, m_domain->n_real_zones, +                        static_cast<Index_type>(-1), vid); + +  { +    auto reset_rz = scopedMoveData(m_real_zones, m_domain->n_real_zones, vid); + +    setRealZones_3d(m_real_zones, *m_domain); +  } + +} + +void MATVEC_3D_STENCIL::updateChecksum(VariantID vid, size_t tune_idx) +{ +  checksum[vid].at(tune_idx) += calcChecksum(m_b, m_zonal_array_length, checksum_scale_factor , vid); +} + +void MATVEC_3D_STENCIL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +  (void) vid; + +  deallocData(m_b, vid); +  deallocData(m_x, vid); + +  deallocData(m_matrix.dbl, vid); +  deallocData(m_matrix.dbc, vid); +  deallocData(m_matrix.dbr, vid); +  deallocData(m_matrix.dcl, vid); +  deallocData(m_matrix.dcc, vid); +  deallocData(m_matrix.dcr, vid); +  deallocData(m_matrix.dfl, vid); +  deallocData(m_matrix.dfc, vid); +  deallocData(m_matrix.dfr, vid); +  deallocData(m_matrix.cbl, vid); +  deallocData(m_matrix.cbc, vid); +  deallocData(m_matrix.cbr, vid); +  deallocData(m_matrix.ccl, vid); +  deallocData(m_matrix.ccc, vid); + +  deallocData(m_real_zones, vid); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL.hpp b/src/apps/MATVEC_3D_STENCIL.hpp new file mode 100644 index 000000000..a537e7149 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL.hpp @@ -0,0 +1,199 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
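setUp allocates only the 14 "lower" coefficient arrays and derives the other 13 by pointer offset (e.g. `ccr = ccl + 1`, `ufr = dbl + 1 + jp + kp`), exploiting the symmetry A(z, z') == A(z', z) of the stencil matrix so each coefficient is stored once; this is also why tearDown frees nothing past `m_matrix.ccc`. A 1-D analogue of the trick (illustrative; in the suite the offsets stay inside the padded allocation, so no out-of-range pointer is formed):

  // One stored coefficient per symmetric zone pair: c_right[i] multiplies
  // x[i+1] as seen from zone i; the mirrored coefficient seen from zone
  // i+1 is the same number, so alias it instead of storing it twice.
  double* c_right = new double[n];
  double* c_left  = c_right - 1;   // c_left[i+1] aliases c_right[i]
  // ... matvec reads both views; only c_right is ever freed.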
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// MATVEC_3D_STENCIL kernel reference implementation: +/// +/// for (Index_type ii = ibegin; ii < iend; ++ii ) { +/// Index_type i = real_zones[ii]; +/// +/// b[i] = dbl[i] * xdbl[i] + dbc[i] * xdbc[i] + dbr[i] * xdbr[i] + +/// dcl[i] * xdcl[i] + dcc[i] * xdcc[i] + dcr[i] * xdcr[i] + +/// dfl[i] * xdfl[i] + dfc[i] * xdfc[i] + dfr[i] * xdfr[i] + +/// +/// cbl[i] * xcbl[i] + cbc[i] * xcbc[i] + cbr[i] * xcbr[i] + +/// ccl[i] * xccl[i] + ccc[i] * xccc[i] + ccr[i] * xccr[i] + +/// cfl[i] * xcfl[i] + cfc[i] * xcfc[i] + cfr[i] * xcfr[i] + +/// +/// ubl[i] * xubl[i] + ubc[i] * xubc[i] + ubr[i] * xubr[i] + +/// ucl[i] * xucl[i] + ucc[i] * xucc[i] + ucr[i] * xucr[i] + +/// ufl[i] * xufl[i] + ufc[i] * xufc[i] + ufr[i] * xufr[i] ; +/// +/// } +/// + +#ifndef RAJAPerf_Apps_MATVEC_3D_STENCIL_HPP +#define RAJAPerf_Apps_MATVEC_3D_STENCIL_HPP + +#define MATVEC_3D_STENCIL_DATA_SETUP \ + Real_ptr b = m_b; \ + \ + Real_ptr xdbl = m_x - m_domain->kp - m_domain->jp - 1 ; \ + Real_ptr xdbc = m_x - m_domain->kp - m_domain->jp ; \ + Real_ptr xdbr = m_x - m_domain->kp - m_domain->jp + 1 ; \ + Real_ptr xdcl = m_x - m_domain->kp - 1 ; \ + Real_ptr xdcc = m_x - m_domain->kp ; \ + Real_ptr xdcr = m_x - m_domain->kp + 1 ; \ + Real_ptr xdfl = m_x - m_domain->kp + m_domain->jp - 1 ; \ + Real_ptr xdfc = m_x - m_domain->kp + m_domain->jp ; \ + Real_ptr xdfr = m_x - m_domain->kp + m_domain->jp + 1 ; \ + Real_ptr xcbl = m_x - m_domain->jp - 1 ; \ + Real_ptr xcbc = m_x - m_domain->jp ; \ + Real_ptr xcbr = m_x - m_domain->jp + 1 ; \ + Real_ptr xccl = m_x - 1 ; \ + Real_ptr xccc = m_x ; \ + Real_ptr xccr = m_x + 1 ; \ + Real_ptr xcfl = m_x + m_domain->jp - 1 ; \ + Real_ptr xcfc = m_x + m_domain->jp ; \ + Real_ptr xcfr = m_x + m_domain->jp + 1 ; \ + Real_ptr xubl = m_x + m_domain->kp - m_domain->jp - 1 ; \ + Real_ptr xubc = m_x + m_domain->kp - m_domain->jp ; \ + Real_ptr xubr = m_x + m_domain->kp - m_domain->jp + 1 ; \ + Real_ptr xucl = m_x + m_domain->kp - 1 ; \ + Real_ptr xucc = m_x + m_domain->kp ; \ + Real_ptr xucr = m_x + m_domain->kp + 1 ; \ + Real_ptr xufl = m_x + m_domain->kp + m_domain->jp - 1 ; \ + Real_ptr xufc = m_x + m_domain->kp + m_domain->jp ; \ + Real_ptr xufr = m_x + m_domain->kp + m_domain->jp + 1 ; \ + \ + Real_ptr dbl = m_matrix.dbl; \ + Real_ptr dbc = m_matrix.dbc; \ + Real_ptr dbr = m_matrix.dbr; \ + Real_ptr dcl = m_matrix.dcl; \ + Real_ptr dcc = m_matrix.dcc; \ + Real_ptr dcr = m_matrix.dcr; \ + Real_ptr dfl = m_matrix.dfl; \ + Real_ptr dfc = m_matrix.dfc; \ + Real_ptr dfr = m_matrix.dfr; \ + Real_ptr cbl = m_matrix.cbl; \ + Real_ptr cbc = m_matrix.cbc; \ + Real_ptr cbr = m_matrix.cbr; \ + Real_ptr ccl = m_matrix.ccl; \ + Real_ptr ccc = m_matrix.ccc; \ + Real_ptr ccr = m_matrix.ccr; \ + Real_ptr cfl = m_matrix.cfl; \ + Real_ptr cfc = m_matrix.cfc; \ + Real_ptr cfr = m_matrix.cfr; \ + Real_ptr ubl = m_matrix.ubl; \ + Real_ptr ubc = m_matrix.ubc; \ + Real_ptr ubr = m_matrix.ubr; \ + Real_ptr ucl = m_matrix.ucl; \ + Real_ptr ucc = m_matrix.ucc; \ + Real_ptr ucr = m_matrix.ucr; \ + Real_ptr ufl = m_matrix.ufl; \ + Real_ptr ufc = m_matrix.ufc; \ + Real_ptr ufr = m_matrix.ufr; \ + \ + Index_ptr real_zones = m_real_zones; + +#define MATVEC_3D_STENCIL_BODY_INDEX \ + Index_type i = real_zones[ii]; + +#define MATVEC_3D_STENCIL_BODY \ + b[i] = dbl[i] * xdbl[i] + dbc[i] * xdbc[i] + dbr[i] * xdbr[i] + \ + dcl[i] * xdcl[i] + dcc[i] * xdcc[i] + dcr[i] * xdcr[i] + \ + dfl[i] * xdfl[i] 
+          dfc[i] * xdfc[i] + dfr[i] * xdfr[i] + \ +         \ +         cbl[i] * xcbl[i] + cbc[i] * xcbc[i] + cbr[i] * xcbr[i] + \ +         ccl[i] * xccl[i] + ccc[i] * xccc[i] + ccr[i] * xccr[i] + \ +         cfl[i] * xcfl[i] + cfc[i] * xcfc[i] + cfr[i] * xcfr[i] + \ +         \ +         ubl[i] * xubl[i] + ubc[i] * xubc[i] + ubr[i] * xubr[i] + \ +         ucl[i] * xucl[i] + ucc[i] * xucc[i] + ucr[i] * xucr[i] + \ +         ufl[i] * xufl[i] + ufc[i] * xufc[i] + ufr[i] * xufr[i] ; \ + + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ +class ADomain; + +class MATVEC_3D_STENCIL : public KernelBase +{ +public: + +  MATVEC_3D_STENCIL(const RunParams& params); + +  ~MATVEC_3D_STENCIL(); + +  void setUp(VariantID vid, size_t tune_idx); +  void updateChecksum(VariantID vid, size_t tune_idx); +  void tearDown(VariantID vid, size_t tune_idx); + +  void runSeqVariant(VariantID vid, size_t tune_idx); +  void runOpenMPVariant(VariantID vid, size_t tune_idx); +  void runCudaVariant(VariantID vid, size_t tune_idx); +  void runHipVariant(VariantID vid, size_t tune_idx); +  void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); +  void runSyclVariant(VariantID vid, size_t tune_idx); + +  void setCudaTuningDefinitions(VariantID vid); +  void setHipTuningDefinitions(VariantID vid); +  void setSyclTuningDefinitions(VariantID vid); + +  template < size_t block_size > +  void runCudaVariantImpl(VariantID vid); +  template < size_t block_size > +  void runHipVariantImpl(VariantID vid); +  template < size_t work_group_size > +  void runSyclVariantImpl(VariantID vid); + +private: +  static const size_t default_gpu_block_size = 256; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; + +  struct Matrix +  { +    Real_ptr dbl; +    Real_ptr dbc; +    Real_ptr dbr; +    Real_ptr dcl; +    Real_ptr dcc; +    Real_ptr dcr; +    Real_ptr dfl; +    Real_ptr dfc; +    Real_ptr dfr; +    Real_ptr cbl; +    Real_ptr cbc; +    Real_ptr cbr; +    Real_ptr ccl; +    Real_ptr ccc; +    Real_ptr ccr; +    Real_ptr cfl; +    Real_ptr cfc; +    Real_ptr cfr; +    Real_ptr ubl; +    Real_ptr ubc; +    Real_ptr ubr; +    Real_ptr ucl; +    Real_ptr ucc; +    Real_ptr ucr; +    Real_ptr ufl; +    Real_ptr ufc; +    Real_ptr ufr; +  }; + +  Real_ptr m_b; +  Real_ptr m_x; +  Matrix m_matrix; + +  ADomain* m_domain; +  Index_type* m_real_zones; +  Index_type m_zonal_array_length; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index 0b5d3b078..494d70bbb 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
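Both the old `gpu_block_size::make_list_type` and the new `integer::make_gpu_block_size_list_type` seen in these headers expand to a compile-time list of block sizes, and `RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE` instantiates `run*VariantImpl<block_size>` once per entry; only the namespace and default handling changed across versions. A stripped-down analogue of that dispatch (hypothetical helper, not the suite's macro):

  #include <cstddef>
  #include <iostream>
  #include <utility>

  template <std::size_t block_size>
  void run_impl() { std::cout << "tuned for block_size " << block_size << "\n"; }

  // Each listed size becomes one tuning; tune_idx selects the instantiation.
  template <std::size_t... Bs>
  void run_tuning(std::size_t tune_idx, std::index_sequence<Bs...>)
  {
    std::size_t t = 0;
    ((t++ == tune_idx ? run_impl<Bs>() : void()), ...);
  }

  int main() { run_tuning(1, std::index_sequence<128, 256, 512>{}); } // -> run_impl<256>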
// @@ -61,11 +61,13 @@ void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      nodal_accumulation_3d<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(vol, -                                     x0, x1, x2, x3, x4, x5, x6, x7, -                                     real_zones, -                                     ibegin, iend); -      cudaErrchk( cudaGetLastError() ); +      RPlaunchCudaKernel( (nodal_accumulation_3d<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          vol, +                          x0, x1, x2, x3, x4, x5, x6, x7, +                          real_zones, +                          ibegin, iend ); } stopTimer(); diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index 476ab5da8..da8f5dd12 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -61,11 +61,13 @@ void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      hipLaunchKernelGGL((nodal_accumulation_3d<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol, -                                     x0, x1, x2, x3, x4, x5, x6, x7, -                                     real_zones, -                                     ibegin, iend); -      hipErrchk( hipGetLastError() ); +      RPlaunchHipKernel( (nodal_accumulation_3d<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         vol, +                         x0, x1, x2, x3, x4, x5, x6, x7, +                         real_zones, +                         ibegin, iend ); } stopTimer(); diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp index d62b5527a..a17576c98 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp index a12a91efd..7d5c59614 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp index 9fbc2effa..cf176d4c1 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
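The mechanical change in these hunks replaces raw chevron launches plus a separate `cudaGetLastError`/`hipGetLastError` check with the `RPlaunchCudaKernel`/`RPlaunchHipKernel` helpers, which take the kernel, launch shape, shared-memory size, and stream, then forward the kernel arguments. A plausible shape for such a wrapper (a sketch under that assumption, not the suite's definition; `cudaErrchk` is the suite's existing error-check macro):

  template <typename Kernel, typename... Args>
  void launch_cuda_kernel(Kernel kernel, size_t grid_size, size_t block_size,
                          size_t shmem, cudaStream_t stream, Args... args)
  {
    kernel<<<grid_size, block_size, shmem, stream>>>(args...);
    cudaErrchk( cudaGetLastError() );   // one centralized error check
  }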
// diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index ed1bd2078..5bbbbb986 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,7 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct setDefaultReps(100); -  Index_type rzmax = std::cbrt(getTargetProblemSize())+1; +  Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_nodal_array_length = m_domain->nnalls; @@ -39,9 +39,11 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads -  setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + -                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + -                  (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_domain->n_real_nodes); +  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + +                      1*sizeof(Real_type) * getItsPerRep() + +                      1*sizeof(Real_type) * m_domain->n_real_nodes); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_domain->n_real_nodes ); +  setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(9 * getItsPerRep()); checksum_scale_factor = 0.001 * diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index a8d194387..085c0099a 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -100,6 +100,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > @@ -107,7 +108,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase private: static const size_t default_gpu_block_size = 256; -  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_vol; diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 16b395259..c98e3a48a 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// @@ -64,19 +64,22 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); -       constexpr size_t shmem = 0; - -       pressurecalc1<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( bvc, compression, -                                              cls, -                                              iend ); -       cudaErrchk( cudaGetLastError() ); - -       pressurecalc2<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( p_new, bvc, e_old, -                                              vnewc, -                                              p_cut, eosvmax, pmin, -                                              iend ); -       cudaErrchk( cudaGetLastError() ); +      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); +      constexpr size_t shmem = 0; + +      RPlaunchCudaKernel( (pressurecalc1<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          bvc, compression, cls, +                          iend ); + +      RPlaunchCudaKernel( (pressurecalc2<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          p_new, bvc, e_old, +                          vnewc, +                          p_cut, eosvmax, pmin, +                          iend ); } stopTimer(); diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 1d590e6f0..18d4a1c88 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -64,19 +64,22 @@ void PRESSURE::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); -       constexpr size_t shmem = 0; - -       hipLaunchKernelGGL((pressurecalc1<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), bvc, compression, -                                          cls, -                                          iend ); -       hipErrchk( hipGetLastError() ); - -       hipLaunchKernelGGL((pressurecalc2<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), p_new, bvc, e_old, -                                          vnewc, -                                          p_cut, eosvmax, pmin, -                                          iend ); -       hipErrchk( hipGetLastError() ); +      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); +      constexpr size_t shmem = 0; + +      RPlaunchHipKernel( (pressurecalc1<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         bvc, compression, cls, +                         iend ); + +      RPlaunchHipKernel( (pressurecalc2<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         p_new, bvc, e_old, +                         vnewc, +                         p_cut, eosvmax, pmin, +                         iend ); } stopTimer(); diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp index 0f0dd2e4e..ceab0286c 100644 --- a/src/apps/PRESSURE-OMP.cpp +++ b/src/apps/PRESSURE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp index 830859f07..4d0b61f58 100644 --- a/src/apps/PRESSURE-OMPTarget.cpp +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 4506eb7a2..18fc0929e 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-Sycl.cpp b/src/apps/PRESSURE-Sycl.cpp new file mode 100644 index 000000000..2b0e3b4dd --- /dev/null +++ b/src/apps/PRESSURE-Sycl.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PRESSURE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include <iostream> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template <size_t work_group_size> +void PRESSURE::runSyclVariantImpl(VariantID vid) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = 0; +  const Index_type iend = getActualProblemSize(); + +  auto res{getSyclResource()}; +  auto qu = res.get_queue(); + +  PRESSURE_DATA_SETUP; + +  using sycl::fabs; + +  if ( vid == Base_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), +                       [=] (sycl::nd_item<1> item) { + +          Index_type i = item.get_global_id(0); +          if (i < iend) { +            PRESSURE_BODY1 +          } + +        }); +      }); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), +                       [=] (sycl::nd_item<1> item) { + +          Index_type i = item.get_global_id(0); +          if (i < iend) { +            PRESSURE_BODY2 +          } + +        }); +      }); + +    } +    stopTimer(); + +  } else if ( vid == RAJA_SYCL ) { + +    const bool async = true; + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      RAJA::region<RAJA::seq_region>( [=]() { + +        RAJA::forall< RAJA::sycl_exec<work_group_size, async> >( res, +          RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { +          PRESSURE_BODY1; +        }); + +        RAJA::forall< RAJA::sycl_exec<work_group_size, async> >( res, +          RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { +          PRESSURE_BODY2; +        }); + +      }); // end sequential region (for single-source code) + +    } +    stopTimer(); + +  } else { +     std::cout << "\n  PRESSURE : Unknown Sycl variant id = " << vid << std::endl; +  } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PRESSURE, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif  // RAJA_ENABLE_SYCL diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index c772a0989..ed1e00306 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
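PRESSURE issues two dependent kernels per rep, so the RAJA_SYCL variant wraps both foralls in `RAJA::region<RAJA::seq_region>` to keep them ordered on one resource while each launch stays asynchronous. A reduced sketch of that shape (the bodies are illustrative stand-ins for PRESSURE_BODY1/2, with `res`, `work_group_size`, and the ranges as in the surrounding file):

  RAJA::region<RAJA::seq_region>([=]() {

    RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res,
      RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
        bvc[i] = cls * (compression[i] + 1.0);   // stand-in for PRESSURE_BODY1
    });

    RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res,
      RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
        p_new[i] = bvc[i] * e_old[i];            // stand-in for PRESSURE_BODY2
    });

  });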
// @@ -28,8 +28,11 @@ PRESSURE::PRESSURE(const RunParams& params) setItsPerRep( 2 * getActualProblemSize() ); setKernelsPerRep(2); -  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + -                  (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); +  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + +                      3*sizeof(Real_type) * getActualProblemSize() ); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() + +                         1*sizeof(Real_type) * getActualProblemSize() ); +  setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((2 + 1 ) * getActualProblemSize()); @@ -52,6 +55,9 @@ PRESSURE::PRESSURE(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + +  setVariantDefined( Base_SYCL ); +  setVariantDefined( RAJA_SYCL ); } PRESSURE::~PRESSURE() diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index c0568a8e0..d1cffe62a 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -72,17 +72,23 @@ class PRESSURE : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); + private: static const size_t default_gpu_block_size = 256; -  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_compression; Real_ptr m_bvc; diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 18bb89c4c..378aa36bc 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// @@ -68,13 +68,15 @@ void VOL3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      vol3d<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(vol, -                  x0, x1, x2, x3, x4, x5, x6, x7, -                  y0, y1, y2, y3, y4, y5, y6, y7, -                  z0, z1, z2, z3, z4, z5, z6, z7, -                  vnormq, -                  ibegin, iend); -      cudaErrchk( cudaGetLastError() ); +      RPlaunchCudaKernel( (vol3d<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          vol, +                          x0, x1, x2, x3, x4, x5, x6, x7, +                          y0, y1, y2, y3, y4, y5, y6, y7, +                          z0, z1, z2, z3, z4, z5, z6, z7, +                          vnormq, +                          ibegin, iend ); } stopTimer(); diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 9a0a2323b..704008006 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -68,13 +68,15 @@ void VOL3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      hipLaunchKernelGGL((vol3d<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol, -                  x0, x1, x2, x3, x4, x5, x6, x7, -                  y0, y1, y2, y3, y4, y5, y6, y7, -                  z0, z1, z2, z3, z4, z5, z6, z7, -                  vnormq, -                  ibegin, iend); -      hipErrchk( hipGetLastError() ); +      RPlaunchHipKernel( (vol3d<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         vol, +                         x0, x1, x2, x3, x4, x5, x6, x7, +                         y0, y1, y2, y3, y4, y5, y6, y7, +                         z0, z1, z2, z3, z4, z5, z6, z7, +                         vnormq, +                         ibegin, iend ); } stopTimer(); diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp index 44e3cdcf4..7c6562c47 100644 --- a/src/apps/VOL3D-OMP.cpp +++ b/src/apps/VOL3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp index 538fb47c7..d97a5889a 100644 --- a/src/apps/VOL3D-OMPTarget.cpp +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index 631c545bc..d174ad43c 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp new file mode 100644 index 000000000..37c7bc90f --- /dev/null +++ b/src/apps/VOL3D-Sycl.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "VOL3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "AppsData.hpp" + +#include <iostream> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template <size_t work_group_size> +void VOL3D::runSyclVariantImpl(VariantID vid) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = m_domain->fpz; +  const Index_type iend = m_domain->lpz+1; + +  auto res{getSyclResource()}; +  auto qu = res.get_queue(); + +  VOL3D_DATA_SETUP; + +  if ( vid == Base_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), +                       [=] (sycl::nd_item<1> item) { + +          Index_type ii = item.get_global_id(0); +          Index_type i = ii + ibegin; +          if (i < iend) { +            VOL3D_BODY +          } + +        }); +      }); + +    } +    stopTimer(); + +  } else if ( vid == RAJA_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, +        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { +        VOL3D_BODY; +      }); + +    } +    stopTimer(); + +  } else { +     std::cout << "\n  VOL3D : Unknown Sycl variant id = " << vid << std::endl; +  } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(VOL3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif  // RAJA_ENABLE_SYCL diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 11051adc5..16951253d 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
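The VOL3D.cpp hunk below splits the old combined `setBytesPerRep` into reads of the three coordinate arrays over the zone range plus a one-node halo in each direction (`1 + jp + kp` extra entries) and writes of `vol` alone. Worked through with illustrative numbers (a 100^3-zone mesh with hypothetical node pitches jp = 102 and kp = 102*102):

  constexpr size_t zones = 100*100*100;
  constexpr size_t halo  = 1 + 102 + 102*102;                   // +1, +jp, +kp
  constexpr size_t read  = 3 * sizeof(double) * (zones + halo); // x, y, z corners
  constexpr size_t write = 1 * sizeof(double) * zones;          // vol
  // read is roughly 24.25 MB and write 8 MB per rep under these assumptions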
// @@ -28,18 +28,19 @@ VOL3D::VOL3D(const RunParams& params) setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct setDefaultReps(100); -  Index_type rzmax = std::cbrt(getTargetProblemSize())+1; +  Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_array_length = m_domain->nnalls; -  setActualProblemSize( m_domain->lpz+1 - m_domain->fpz ); +  setActualProblemSize( m_domain->n_real_zones ); setItsPerRep( m_domain->lpz+1 - m_domain->fpz ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads -  setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + -                  (0*sizeof(Real_type) + 3*sizeof(Real_type)) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); +  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); +  setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(72 * (m_domain->lpz+1 - m_domain->fpz)); checksum_scale_factor = 0.001 * @@ -64,6 +65,9 @@ VOL3D::VOL3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + +  setVariantDefined( Base_SYCL ); +  setVariantDefined( RAJA_SYCL ); } VOL3D::~VOL3D() diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index f3d296440..6847ce13f 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -173,17 +173,22 @@ class VOL3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; -  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp index 1dc6216d7..f33c8656d 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// @@ -61,11 +61,13 @@ void ZONAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      zonal_accumulation_3d<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(vol, -                                     x0, x1, x2, x3, x4, x5, x6, x7, -                                     real_zones, -                                     ibegin, iend); -      cudaErrchk( cudaGetLastError() ); +      RPlaunchCudaKernel( (zonal_accumulation_3d<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          vol, +                          x0, x1, x2, x3, x4, x5, x6, x7, +                          real_zones, +                          ibegin, iend ); } stopTimer(); diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp index d861128b9..c92d7b6db 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -61,11 +61,13 @@ void ZONAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      hipLaunchKernelGGL((zonal_accumulation_3d<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol, -                                     x0, x1, x2, x3, x4, x5, x6, x7, -                                     real_zones, -                                     ibegin, iend); -      hipErrchk( hipGetLastError() ); +      RPlaunchHipKernel( (zonal_accumulation_3d<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         vol, +                         x0, x1, x2, x3, x4, x5, x6, x7, +                         real_zones, +                         ibegin, iend ); } stopTimer(); diff --git a/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp b/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp index eea4614cf..9b0890ef4 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp index 573765e30..0a4f96119 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp index 1bd904088..ea9be8c17 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
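ZONAL_ACCUMULATION_3D is the gather dual of NODAL_ACCUMULATION_3D: each zone reads its eight surrounding node values and writes a single zonal value, which is why the accounting in the .cpp hunk below books one Real_type write per iteration, node reads sized by n_real_nodes, and no atomics. The body reduces to something like (an illustrative stand-in consistent with the kernel's 8 FLOPs per zone, not the verbatim macro):

  // x0..x7 are the zone's eight corner-node pointers (base + pitch offsets)
  vol[i] = 0.125 * (x0[i] + x1[i] + x2[i] + x3[i] +
                    x4[i] + x5[i] + x6[i] + x7[i]);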
// diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp new file mode 100644 index 000000000..67b25086e --- /dev/null +++ b/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp @@ -0,0 +1,89 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ZONAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include "AppsData.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace apps +{ + + +template < size_t work_group_size > +void ZONAL_ACCUMULATION_3D::runSyclVariantImpl(VariantID vid) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = 0; +  const Index_type iend = m_domain->n_real_zones; + +  auto res{getSyclResource()}; +  auto qu = res.get_queue(); + +  ZONAL_ACCUMULATION_3D_DATA_SETUP; + +  if ( vid == Base_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), +                       [=] (sycl::nd_item<1> item ) { + +          Index_type ii = item.get_global_id(0); +          Index_type i = ii + ibegin; +          if (i < iend) { +            ZONAL_ACCUMULATION_3D_BODY_INDEX; +            ZONAL_ACCUMULATION_3D_BODY; +          } + +        }); +      }); + +    } +    stopTimer(); + +  } else if ( vid == RAJA_SYCL ) { + +    RAJA::TypedListSegment<Index_type> zones(real_zones, iend, +                                             res, RAJA::Unowned); + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, +        zones, [=] (Index_type i) { +        ZONAL_ACCUMULATION_3D_BODY; +      }); + +    } +    stopTimer(); + +  } else { +     getCout() << "\n  ZONAL_ACCUMULATION_3D : Unknown Sycl variant id = " << vid << std::endl; +  } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ZONAL_ACCUMULATION_3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif  // RAJA_ENABLE_SYCL diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index 267190132..993f65a07 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// @@ -28,7 +28,7 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct setDefaultReps(100); -  Index_type rzmax = std::cbrt(getTargetProblemSize())+1; +  Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_nodal_array_length = m_domain->nnalls; @@ -39,9 +39,10 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads -  setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + -                  (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + -                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_domain->n_real_nodes); +  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + +                      1*sizeof(Real_type) * m_domain->n_real_nodes ); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); +  setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(8 * getItsPerRep()); checksum_scale_factor = 0.001 * @@ -66,6 +67,9 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + +  setVariantDefined( Base_SYCL ); +  setVariantDefined( RAJA_SYCL ); } ZONAL_ACCUMULATION_3D::~ZONAL_ACCUMULATION_3D() diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 1c823ea2a..572490caa 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -81,17 +81,22 @@ class ZONAL_ACCUMULATION_3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; -  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_vol; diff --git a/src/apps/mixed_fem_helper.hpp b/src/apps/mixed_fem_helper.hpp index 88f7d3b64..6ee3b1a06 100644 --- a/src/apps/mixed_fem_helper.hpp +++ b/src/apps/mixed_fem_helper.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 4b47c7b48..25969c207 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index b8ab91cd1..eb2cc1e83 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp b/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp index 9e74c4e0c..aadcf9401 100644 --- a/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index 19e916dac..a67b041c7 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp index 661180c7b..b07c6f881 100644 --- a/src/basic-kokkos/INIT3-Kokkos.cpp +++ b/src/basic-kokkos/INIT3-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp index 8c775a3b0..efd1ed118 100644 --- a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp index 9df018264..996ec6225 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/MULADDSUB-Kokkos.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp index 49e890315..fb2f59ac2 100644 --- a/src/basic-kokkos/MULADDSUB-Kokkos.cpp +++ b/src/basic-kokkos/MULADDSUB-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp index 36929cead..fb5b5ba98 100644 --- a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp +++ b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp index 233ca71af..6247970b1 100644 --- a/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp +++ b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp index 23c0ab6f4..e461c00a1 100644 --- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp +++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index 5cdb9060f..43b629cad 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/ARRAY_OF_PTRS-Cuda.cpp b/src/basic/ARRAY_OF_PTRS-Cuda.cpp index b6b53e249..e9cdd2349 100644 --- a/src/basic/ARRAY_OF_PTRS-Cuda.cpp +++ b/src/basic/ARRAY_OF_PTRS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -54,9 +54,11 @@ void ARRAY_OF_PTRS::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - array_of_ptrs<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( - y, x_array, array_size, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (array_of_ptrs<block_size>), + grid_size, block_size, + shmem, res.get_stream(), + y, x_array, array_size, iend ); } stopTimer(); @@ -66,13 +68,18 @@ void ARRAY_OF_PTRS::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto array_of_ptrs_lambda = [=] __device__ (Index_type i) { + ARRAY_OF_PTRS_BODY(x); + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( - ibegin, iend, [=] __device__ (Index_type i) { - ARRAY_OF_PTRS_BODY(x); - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall<block_size, decltype(array_of_ptrs_lambda)>), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, array_of_ptrs_lambda ); } stopTimer(); diff --git a/src/basic/ARRAY_OF_PTRS-Hip.cpp b/src/basic/ARRAY_OF_PTRS-Hip.cpp index 26c0f8800..aa5777fb4 100644 --- a/src/basic/ARRAY_OF_PTRS-Hip.cpp +++ b/src/basic/ARRAY_OF_PTRS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -54,9 +54,11 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((array_of_ptrs<block_size>),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - y, x_array, array_size, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (array_of_ptrs<block_size>), + grid_size, block_size, + shmem, res.get_stream(), + y, x_array, array_size, iend ); } stopTimer(); @@ -72,9 +74,12 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall<block_size, decltype(array_of_ptrs_lambda)>), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, array_of_ptrs_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall<block_size, decltype(array_of_ptrs_lambda)>), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, array_of_ptrs_lambda ); } stopTimer(); diff --git a/src/basic/ARRAY_OF_PTRS-OMP.cpp b/src/basic/ARRAY_OF_PTRS-OMP.cpp index 3e05e929a..774fd8f98 100644 --- a/src/basic/ARRAY_OF_PTRS-OMP.cpp +++ b/src/basic/ARRAY_OF_PTRS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors.
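// ---------------------------------------------------------------------------
// Aside: the RPlaunchCudaKernel calls introduced above replace inline
// triple-chevron launches plus an explicit error check at every call site.
// A minimal sketch of what such a wrapper can look like -- an illustration
// of the pattern under that assumption, not the Suite's actual
// implementation in its common headers:

#include <cuda_runtime.h>
#include <cstdio>

template < typename Kernel, typename... Args >
void launch_cuda_kernel_sketch(Kernel kernel,
                               dim3 grid, dim3 block,
                               size_t shmem, cudaStream_t stream,
                               Args... args)
{
  kernel<<<grid, block, shmem, stream>>>( args... );
  cudaError_t err = cudaGetLastError();  // surface launch errors immediately
  if (err != cudaSuccess) {
    std::fprintf(stderr, "kernel launch failed: %s\n",
                 cudaGetErrorString(err));
  }
}

// Centralizing the launch this way keeps the grid/block/shmem/stream
// plumbing and error handling in one place, which is why each per-kernel
// call site in the diff shrinks to a single wrapper call.
// ---------------------------------------------------------------------------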
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp b/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp index 7a7642b4e..02301ca1e 100644 --- a/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp +++ b/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/ARRAY_OF_PTRS-Seq.cpp b/src/basic/ARRAY_OF_PTRS-Seq.cpp index ba728d775..dd22e010d 100644 --- a/src/basic/ARRAY_OF_PTRS-Seq.cpp +++ b/src/basic/ARRAY_OF_PTRS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,9 +26,11 @@ void ARRAY_OF_PTRS::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune ARRAY_OF_PTRS_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto array_of_ptrs_lam = [=](Index_type i) { ARRAY_OF_PTRS_BODY(x); }; +#endif switch ( vid ) { diff --git a/src/basic/ARRAY_OF_PTRS-Sycl.cpp b/src/basic/ARRAY_OF_PTRS-Sycl.cpp new file mode 100644 index 000000000..c3d987987 --- /dev/null +++ b/src/basic/ARRAY_OF_PTRS-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
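// ---------------------------------------------------------------------------
// Aside: the #if defined(RUN_RAJA_SEQ) guard added around array_of_ptrs_lam
// above reflects that the lambda is only consumed by the Lambda_Seq and
// RAJA_Seq variants; in builds where only Base_Seq runs, an unguarded
// lambda would be a dead, warning-generating local. Shape of the pattern
// (this motivation is inferred from the guard's placement, not stated in
// the diff):
//
//   #if defined(RUN_RAJA_SEQ)
//     auto body = [=](Index_type i) { ARRAY_OF_PTRS_BODY(x); };
//   #endif
//   // ... Base_Seq uses the raw loop; Lambda_Seq/RAJA_Seq use `body`
// ---------------------------------------------------------------------------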
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ARRAY_OF_PTRS.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +template < size_t work_group_size > +void ARRAY_OF_PTRS::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + ARRAY_OF_PTRS_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + ARRAY_OF_PTRS_BODY(x); + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ARRAY_OF_PTRS_BODY(x); + }); + + } + stopTimer(); + + } else { + getCout() << "\n ARRAY_OF_PTRS : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ARRAY_OF_PTRS, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index 2a88e5005..9f0995d0d 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,11 +26,13 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); - m_array_size = ARRAY_OF_PTRS_MAX_ARRAY_SIZE; + m_array_size = params.getArrayOfPtrsArraySize(); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + m_array_size*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( m_array_size*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_array_size * getActualProblemSize()); setUsesFeature(Forall); @@ -54,6 +56,9 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index ee8a44862..029353f45 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
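// ---------------------------------------------------------------------------
// Aside: the Base_SYCL variant above follows the pattern shared by all of
// the new *-Sycl.cpp files: pad the iteration count up to a multiple of the
// work-group size, launch an nd_range kernel, and bounds-check in the body.
// A self-contained sketch of that pattern (generic names, not Suite code):

#include <sycl/sycl.hpp>

template < size_t work_group_size, typename Body >
void forall_sycl_sketch(sycl::queue& q, size_t len, Body body)
{
  // nd_range requires the global size to be a multiple of the local size
  const size_t global_size =
      work_group_size * ((len + work_group_size - 1) / work_group_size);

  q.submit([&](sycl::handler& h) {
    h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
                   [=](sycl::nd_item<1> item) {
      const size_t i = item.get_global_id(0);
      if (i < len) {   // padded work-items fall through
        body(i);
      }
    });
  });
  // left asynchronous here, as in the benchmark's timed repetition loop
}
// ---------------------------------------------------------------------------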
// @@ -24,8 +24,6 @@ #ifndef RAJAPerf_Basic_ARRAY_OF_PTRS_HPP #define RAJAPerf_Basic_ARRAY_OF_PTRS_HPP -#define ARRAY_OF_PTRS_MAX_ARRAY_SIZE 26 - #define ARRAY_OF_PTRS_DATA_SETUP_X_ARRAY \ for (Index_type a = 0; a < array_size; ++a) { \ x[a] = x_data + a * iend ; \ @@ -72,18 +70,24 @@ class ARRAY_OF_PTRS : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index d21d46e5a..ee71119ce 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -14,18 +14,21 @@ blt_add_library( ARRAY_OF_PTRS-Cuda.cpp ARRAY_OF_PTRS-OMP.cpp ARRAY_OF_PTRS-OMPTarget.cpp + ARRAY_OF_PTRS-Sycl.cpp COPY8.cpp COPY8-Seq.cpp COPY8-Hip.cpp COPY8-Cuda.cpp COPY8-OMP.cpp COPY8-OMPTarget.cpp + COPY8-Sycl.cpp DAXPY.cpp DAXPY-Seq.cpp DAXPY-Hip.cpp DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp + DAXPY-Sycl.cpp DAXPY_ATOMIC.cpp DAXPY_ATOMIC-Seq.cpp DAXPY_ATOMIC-Hip.cpp @@ -38,6 +41,7 @@ blt_add_library( IF_QUAD-Cuda.cpp IF_QUAD-OMP.cpp IF_QUAD-OMPTarget.cpp + IF_QUAD-Sycl.cpp INDEXLIST.cpp INDEXLIST-Seq.cpp INDEXLIST-Hip.cpp @@ -56,36 +60,42 @@ blt_add_library( INIT3-Cuda.cpp INIT3-OMP.cpp INIT3-OMPTarget.cpp + INIT3-Sycl.cpp INIT_VIEW1D.cpp INIT_VIEW1D-Seq.cpp INIT_VIEW1D-Hip.cpp INIT_VIEW1D-Cuda.cpp INIT_VIEW1D-OMP.cpp INIT_VIEW1D-OMPTarget.cpp + INIT_VIEW1D-Sycl.cpp INIT_VIEW1D_OFFSET.cpp INIT_VIEW1D_OFFSET-Seq.cpp INIT_VIEW1D_OFFSET-Hip.cpp INIT_VIEW1D_OFFSET-Cuda.cpp INIT_VIEW1D_OFFSET-OMP.cpp INIT_VIEW1D_OFFSET-OMPTarget.cpp + INIT_VIEW1D_OFFSET-Sycl.cpp MAT_MAT_SHARED.cpp MAT_MAT_SHARED-Seq.cpp MAT_MAT_SHARED-Hip.cpp MAT_MAT_SHARED-Cuda.cpp MAT_MAT_SHARED-OMP.cpp MAT_MAT_SHARED-OMPTarget.cpp + MAT_MAT_SHARED-Sycl.cpp MULADDSUB.cpp MULADDSUB-Seq.cpp MULADDSUB-Hip.cpp MULADDSUB-Cuda.cpp MULADDSUB-OMP.cpp MULADDSUB-OMPTarget.cpp + MULADDSUB-Sycl.cpp NESTED_INIT.cpp NESTED_INIT-Seq.cpp NESTED_INIT-Hip.cpp NESTED_INIT-Cuda.cpp NESTED_INIT-OMP.cpp NESTED_INIT-OMPTarget.cpp + NESTED_INIT-Sycl.cpp PI_ATOMIC.cpp PI_ATOMIC-Seq.cpp PI_ATOMIC-Hip.cpp @@ -98,12 +108,14 @@ blt_add_library( PI_REDUCE-Cuda.cpp PI_REDUCE-OMP.cpp PI_REDUCE-OMPTarget.cpp + PI_REDUCE-Sycl.cpp REDUCE3_INT.cpp REDUCE3_INT-Seq.cpp REDUCE3_INT-Hip.cpp REDUCE3_INT-Cuda.cpp REDUCE3_INT-OMP.cpp REDUCE3_INT-OMPTarget.cpp + REDUCE3_INT-Sycl.cpp REDUCE_STRUCT.cpp REDUCE_STRUCT-Seq.cpp REDUCE_STRUCT-Hip.cpp @@ -116,5 +128,12 @@ blt_add_library( 
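// ---------------------------------------------------------------------------
// Aside: gpu_block_sizes_type (renamed above to the integer:: utilities) is
// a compile-time list of candidate GPU block sizes; the tuning code iterates
// it to instantiate one kernel tuning per size. A self-contained model of
// that idiom -- illustrative names, not the Suite's actual utilities:

#include <cstddef>
#include <type_traits>

template < size_t... Sizes >
struct block_size_list {};

template < typename F, size_t... Sizes >
void for_each_block_size(block_size_list<Sizes...>, F&& f)
{
  // call f once per size, each passed as a distinct compile-time constant
  ( f(std::integral_constant<size_t, Sizes>{}), ... );
}

// usage: for_each_block_size(block_size_list<128, 256, 512>{},
//            [](auto bs) { /* instantiate and launch kernel<bs()> */ });
// ---------------------------------------------------------------------------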
TRAP_INT-Cuda.cpp TRAP_INT-OMPTarget.cpp TRAP_INT-OMP.cpp + TRAP_INT-Sycl.cpp + MULTI_REDUCE.cpp + MULTI_REDUCE-Seq.cpp + MULTI_REDUCE-Hip.cpp + MULTI_REDUCE-Cuda.cpp + MULTI_REDUCE-OMP.cpp + MULTI_REDUCE-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/basic/COPY8-Cuda.cpp b/src/basic/COPY8-Cuda.cpp index b5bfeafbc..f8f1aeb31 100644 --- a/src/basic/COPY8-Cuda.cpp +++ b/src/basic/COPY8-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -23,8 +23,10 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) -__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7, - Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7, +__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, + Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7, + Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, + Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; @@ -52,11 +54,13 @@ void COPY8::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - copy8<<>>( - y0, y1, y2, y3, y4, y5, y6, y7, - x0, x1, x2, x3, x4, x5, x6, x7, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (copy8), + grid_size, block_size, + shmem, res.get_stream(), + y0, y1, y2, y3, y4, y5, y6, y7, + x0, x1, x2, x3, x4, x5, x6, x7, + iend ); } stopTimer(); @@ -66,13 +70,18 @@ void COPY8::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto copy8_lambda = [=] __device__ (Index_type i) { + COPY8_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - COPY8_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy8_lambda ); } stopTimer(); diff --git a/src/basic/COPY8-Hip.cpp b/src/basic/COPY8-Hip.cpp index fe24822f5..714a00a0b 100644 --- a/src/basic/COPY8-Hip.cpp +++ b/src/basic/COPY8-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
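// ---------------------------------------------------------------------------
// Aside: every launch in these hunks sizes its grid with
// RAJA_DIVIDE_CEILING_INT(iend, block_size), i.e. ceiling division, so the
// final partially-filled block is still launched (the kernels bounds-check
// with i < iend). The arithmetic, as a standalone helper:

#include <cstddef>

constexpr size_t divide_ceiling(size_t n, size_t block)
{
  return (n + block - 1) / block;   // smallest g with g * block >= n
}

static_assert(divide_ceiling(1000, 256) == 4, "1000 items need 4 blocks of 256");
// ---------------------------------------------------------------------------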
// @@ -23,8 +23,10 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) -__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7, - Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7, +__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, + Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7, + Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, + Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; @@ -53,11 +55,13 @@ void COPY8::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((copy8),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - y0, y1, y2, y3, y4, y5, y6, y7, - x0, x1, x2, x3, x4, x5, x6, x7, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (copy8), + grid_size, block_size, + shmem, res.get_stream(), + y0, y1, y2, y3, y4, y5, y6, y7, + x0, x1, x2, x3, x4, x5, x6, x7, + iend ); } stopTimer(); @@ -73,9 +77,12 @@ void COPY8::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy8_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy8_lambda ); } stopTimer(); diff --git a/src/basic/COPY8-OMP.cpp b/src/basic/COPY8-OMP.cpp index 8ba6699c6..a8dec3228 100644 --- a/src/basic/COPY8-OMP.cpp +++ b/src/basic/COPY8-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/COPY8-OMPTarget.cpp b/src/basic/COPY8-OMPTarget.cpp index 729449861..63a207ba8 100644 --- a/src/basic/COPY8-OMPTarget.cpp +++ b/src/basic/COPY8-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
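// ---------------------------------------------------------------------------
// Aside: lambda_cuda_forall / lambda_hip_forall, used by the Lambda_CUDA and
// Lambda_HIP variants above, are thin forall-style kernels that apply a
// device lambda over [ibegin, iend). A sketch consistent with how they are
// launched here (one index per thread, block size as a template parameter);
// treat the exact signature as an assumption:

using Index_type_sketch = long;   // stand-in for the Suite's index typedef

template < size_t block_size, typename Lambda >
__launch_bounds__(block_size)
__global__ void lambda_forall_sketch(Index_type_sketch ibegin,
                                     Index_type_sketch iend,
                                     Lambda body)
{
  Index_type_sketch i = ibegin + blockIdx.x * block_size + threadIdx.x;
  if (i < iend) {
    body(i);   // e.g. COPY8_BODY via the captured lambda
  }
}
// ---------------------------------------------------------------------------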
// @@ -40,7 +40,8 @@ void COPY8::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp target is_device_ptr(x, y) device( did ) + #pragma omp target is_device_ptr(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, \ + y3, y4, y5, y6, y7) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { COPY8_BODY; @@ -70,4 +71,4 @@ void COPY8::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } // end namespace basic } // end namespace rajaperf -#endif // RAJA_ENABLE_TARGET_OPENMP +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/COPY8-Seq.cpp b/src/basic/COPY8-Seq.cpp index 1ae6af854..32bf188d6 100644 --- a/src/basic/COPY8-Seq.cpp +++ b/src/basic/COPY8-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/COPY8-Sycl.cpp b/src/basic/COPY8-Sycl.cpp new file mode 100644 index 000000000..8ce2a8e24 --- /dev/null +++ b/src/basic/COPY8-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "COPY8.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +template < size_t work_group_size > +void COPY8::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + COPY8_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + COPY8_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + COPY8_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n COPY8 : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(COPY8, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp index 7a75daa40..ce8847032 100644 --- a/src/basic/COPY8.cpp +++ b/src/basic/COPY8.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence 
Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ COPY8::COPY8(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (8*sizeof(Real_type) + 8*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); @@ -51,6 +53,9 @@ COPY8::COPY8(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } COPY8::~COPY8() diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp index f98784d16..61945eed3 100644 --- a/src/basic/COPY8.hpp +++ b/src/basic/COPY8.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -79,18 +79,24 @@ class COPY8 : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x0; Real_ptr m_x1; diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 79f1fde4a..d58468ba3 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
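// ---------------------------------------------------------------------------
// Aside: the is_device_ptr change in the COPY8 OpenMP-target hunk above is a
// real correction, not cleanup: the old clause named x and y, which COPY8
// does not define (the kernel works on x0..x7 and y0..y7), so the clause now
// lists the sixteen pointers the loop actually touches. Minimal shape of the
// construct, with hypothetical names:
//
//   #pragma omp target is_device_ptr(x0, y0) device( did )
//   #pragma omp teams distribute parallel for
//   for (long i = 0; i < n; ++i) {
//     y0[i] = x0[i];   // pointers already resident on device `did`
//   }
//
// is_device_ptr tells the compiler the listed pointers are device addresses
// to use directly, bypassing the implicit mapping machinery.
// ---------------------------------------------------------------------------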
// @@ -52,9 +52,11 @@ void DAXPY::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - daxpy<<>>( y, x, a, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (daxpy), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); } stopTimer(); @@ -64,13 +66,18 @@ void DAXPY::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto daxpy_lambda = [=] __device__ (Index_type i) { + DAXPY_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - DAXPY_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_lambda ); } stopTimer(); diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 22f86b4d7..f08dba1fc 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,9 +53,11 @@ void DAXPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), y, x, a, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (daxpy), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); } stopTimer(); @@ -71,9 +73,12 @@ void DAXPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_lambda ); } stopTimer(); diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp index afc0e653c..8f1b95641 100644 --- a/src/basic/DAXPY-OMP.cpp +++ b/src/basic/DAXPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index 387a4c40d..fc36ad257 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
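// ---------------------------------------------------------------------------
// Aside: the daxpy kernel launched above is the classic one-thread-per-
// element stream kernel. A self-contained sketch (plain CUDA types; the
// Suite's version is templated on block_size with __launch_bounds__):

__global__ void daxpy_sketch(double* y, const double* x, double a, long n)
{
  long i = static_cast<long>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] += a * x[i];   // DAXPY body: 2 FLOPs, 2 reads, 1 write per element
  }
}

// launched as: daxpy_sketch<<<grid_size, block_size, 0, stream>>>(y, x, a, n);
// ---------------------------------------------------------------------------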
// diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 7b024ca49..e23cc5e6f 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp new file mode 100644 index 000000000..15642a08b --- /dev/null +++ b/src/basic/DAXPY-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template +void DAXPY::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DAXPY_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + DAXPY_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + DAXPY_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n DAXPY : Unknown Sycl variant id = " << vid << std::endl; + } + +} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DAXPY, Sycl) + +} // end namespace basic +} // end namespace rajaperf + + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 8aa05e66a..fafb9bb66 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
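// ---------------------------------------------------------------------------
// Aside: the RAJA_SYCL variants dispatch through RAJA's sycl_exec policy,
// which is parameterized on the work-group size. A usage sketch under that
// assumption -- the exact policy parameters should be checked against the
// RAJA headers rather than taken from this sketch:
//
//   RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res,
//       RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
//     DAXPY_BODY;   // y[i] += a * x[i];
//   });
//
// Passing the resource (res) as the first argument runs the loop on that
// resource's queue, matching the Base_SYCL code path in the same file.
// ---------------------------------------------------------------------------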
// @@ -28,7 +28,9 @@ DAXPY::DAXPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize()); setUsesFeature(Forall); @@ -52,6 +54,9 @@ DAXPY::DAXPY(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index bcaca8054..db9edba60 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,18 +52,24 @@ class DAXPY : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 29a142d01..c4cee2dd2 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
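// ---------------------------------------------------------------------------
// Aside: the setBytesPerRep -> setBytesReadPerRep / setBytesWrittenPerRep /
// setBytesAtomicModifyWrittenPerRep split above separates memory traffic by
// kind instead of lumping it into one number. For DAXPY (y[i] += a * x[i])
// each iteration reads x[i] and the old y[i] and writes the new y[i], so
// per repetition over N elements:
//
//   reads   = 2 * sizeof(Real_type) * N
//   writes  = 1 * sizeof(Real_type) * N
//   atomics = 0
//   FLOPs   = 2 * N   (one multiply, one add)
//
// e.g. double precision, N = 1,000,000: 16 MB read and 8 MB written per rep.
// ---------------------------------------------------------------------------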
// @@ -52,9 +52,11 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - daxpy_atomic<<>>( y, x, a, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (daxpy_atomic), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); } stopTimer(); @@ -64,13 +66,18 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto daxpy_atomic_lambda = [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_atomic_lambda ); } stopTimer(); diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 0688950b0..258c979b6 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,9 +52,11 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((daxpy_atomic),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), y, x, a, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (daxpy_atomic), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); } stopTimer(); @@ -70,9 +72,12 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_atomic_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_atomic_lambda ); } stopTimer(); diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp index 4d2f4db87..a41c6c049 100644 --- a/src/basic/DAXPY_ATOMIC-OMP.cpp +++ b/src/basic/DAXPY_ATOMIC-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp index bc6b08932..ae7319e25 100644 --- a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
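// ---------------------------------------------------------------------------
// Aside: DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic), captured in the lambda
// above, performs the update through RAJA's atomic interface. In spirit
// (a sketch, not the macro's verbatim expansion):
//
//   RAJA::atomicAdd<RAJA::cuda_atomic>( &y[i], a * x[i] );
//
// The policy argument selects the backend's native atomic, so the same body
// macro can be instantiated with cuda_atomic, hip_atomic, omp_atomic, etc.
// ---------------------------------------------------------------------------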
// diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp index 1c33c45f8..9fd78fecf 100644 --- a/src/basic/DAXPY_ATOMIC-Seq.cpp +++ b/src/basic/DAXPY_ATOMIC-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index a9f709276..24ec906c4 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ DAXPY_ATOMIC::DAXPY_ATOMIC(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); setFLOPsPerRep(2 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index 9c2890e48..17bf3979b 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -55,10 +55,12 @@ class DAXPY_ATOMIC : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > @@ -66,7 +68,7 @@ class DAXPY_ATOMIC : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 0702e7d2d..da959e485 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
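// ---------------------------------------------------------------------------
// Aside: note how the DAXPY_ATOMIC traffic accounting above differs from
// plain DAXPY: the update of y[i] is an atomic read-modify-write, so its
// bytes move out of the read/write buckets into the atomic bucket:
//
//   reads                 = 1 * sizeof(Real_type) * N   (x only)
//   writes                = 0
//   atomic modify-writes  = 1 * sizeof(Real_type) * N   (y)
//
// Keeping atomics separate matters because their achievable bandwidth can
// differ substantially from that of streaming reads and writes.
// ---------------------------------------------------------------------------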
// @@ -53,8 +53,13 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - ifquad<<>>( x1, x2, a, b, c, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (ifquad), + grid_size, block_size, + shmem, res.get_stream(), + x1, x2, + a, b, c, + iend ); } stopTimer(); @@ -63,13 +68,18 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto ifquad_lambda = [=] __device__ (Index_type i) { + IF_QUAD_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - IF_QUAD_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, ifquad_lambda ); } stopTimer(); diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 5b47d786b..259306d0f 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,9 +53,13 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x1, x2, a, b, c, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (ifquad), + grid_size, block_size, + shmem, res.get_stream(), + x1, x2, + a, b, c, + iend ); } stopTimer(); @@ -71,9 +75,12 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, ifquad_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, ifquad_lambda ); } stopTimer(); diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp index e952f05fb..297decc78 100644 --- a/src/basic/IF_QUAD-OMP.cpp +++ b/src/basic/IF_QUAD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index d6232ec13..bedec322c 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
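// ---------------------------------------------------------------------------
// Aside: the ifquad kernel and lambdas above all run IF_QUAD_BODY, which
// solves a quadratic per element and branches on the discriminant. In
// spirit (a sketch of the body, not its verbatim definition):
//
//   Real_type s = b[i]*b[i] - 4.0*a[i]*c[i];   // discriminant
//   if ( s >= 0 ) {
//     Real_type s_sqrt = sqrt(s);
//     x1[i] = (-b[i] + s_sqrt) / (2.0*a[i]);
//     x2[i] = (-b[i] - s_sqrt) / (2.0*a[i]);
//   } else {
//     x1[i] = 0.0;
//     x2[i] = 0.0;
//   }
//
// This is consistent with the kernel's stated cost of 11 FLOPs per element
// including one sqrt, and the data-dependent branch is the point of the
// benchmark.
// ---------------------------------------------------------------------------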
// diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index aa2448a1b..14735ecd8 100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp new file mode 100644 index 000000000..17e569c6f --- /dev/null +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + +template +void IF_QUAD::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + IF_QUAD_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + + if (i < iend) { + IF_QUAD_BODY + } + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + IF_QUAD_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n IF_QUAD : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(IF_QUAD, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index c31dc79d4..58ccb9f58 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -28,7 +28,9 @@ IF_QUAD::IF_QUAD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type) + 3*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(11 * getActualProblemSize()); // 1 sqrt checksum_scale_factor = 0.0001 * @@ -56,6 +58,9 @@ IF_QUAD::IF_QUAD(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index f1f3e12a8..a742d22eb 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -69,18 +69,24 @@ class IF_QUAD : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index cb6c88a9e..afcb54176 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -13,11 +13,7 @@ #if defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" - -#include -#include -#include -#include +#include "common/CudaGridScan.hpp" #include @@ -26,177 +22,11 @@ namespace rajaperf namespace basic { - // - // Define magic numbers for CUDA execution - // - const size_t warp_size = 32; - const size_t items_per_thread = 15; - - -// perform a grid scan on val and returns the result at each thread -// in exclusive and inclusive, note that val is used as scratch space -template < size_t block_size, size_t items_per_thread > -__device__ void grid_scan(const int block_id, - Index_type (&val)[items_per_thread], - Index_type (&exclusive)[items_per_thread], - Index_type (&inclusive)[items_per_thread], - Index_type* block_counts, - Index_type* grid_counts, - unsigned* block_readys) -{ - const bool first_block = (block_id == 0); - const bool last_block = (block_id == gridDim.x-1); - const bool last_thread = (threadIdx.x == block_size-1); - const bool last_warp = (threadIdx.x >= block_size - warp_size); - const int warp_index = (threadIdx.x % warp_size); - const unsigned warp_index_mask = (1u << warp_index); - const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u); - - using BlockScan = 
cub::BlockScan; //, cub::BLOCK_SCAN_WARP_SCANS>; - using BlockExchange = cub::BlockExchange; - using WarpReduce = cub::WarpReduce; - - union SharedStorage { - typename BlockScan::TempStorage block_scan_storage; - typename BlockExchange::TempStorage block_exchange_storage; - typename WarpReduce::TempStorage warp_reduce_storage; - volatile Index_type prev_grid_count; - }; - __shared__ SharedStorage s_temp_storage; - - - BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val); - __syncthreads(); - - - BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); - __syncthreads(); - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - inclusive[ti] = exclusive[ti] + val[ti]; - } - - BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive); - __syncthreads(); - BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive); - __syncthreads(); - if (first_block) { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready - } - - } else { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 1u); // write block_counts is ready - } - - // get prev_grid_count using last warp in block - if (last_warp) { - - Index_type prev_grid_count = 0; - - // accumulate previous block counts into registers of warp - - int prev_block_base_id = block_id - warp_size; - - unsigned prev_block_ready = 0u; - unsigned prev_blocks_ready_ballot = 0u; - unsigned prev_grids_ready_ballot = 0u; - - // accumulate full warp worths of block counts - // stop if run out of full warps of a grid count is ready - while (prev_block_base_id >= 0) { - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - do { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - - prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - - } while (prev_blocks_ready_ballot != 0xffffffffu); - - prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - - if (prev_grids_ready_ballot != 0u) { - break; - } - - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - - prev_block_ready = 0u; - - prev_block_base_id -= warp_size; - } - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - // this checks that block counts is ready for all blocks above - // the highest grid count that is ready - while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { - - if (prev_block_id >= 0) { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - } - - prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // read one grid_count from a block with id grid_count_ready_id - // and read the 
block_counts from blocks with higher ids. - if (warp_index_mask > prev_grids_ready_ballot) { - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // accumulate grid_count for grid_count_ready_id - prev_grid_count += grid_counts[prev_block_id]; - } - - - prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); - prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp - - if (last_thread) { - - if (!last_block) { - grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - } - - s_temp_storage.prev_grid_count = prev_grid_count; - } - } - - __syncthreads(); - Index_type prev_grid_count = s_temp_storage.prev_grid_count; - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - exclusive[ti] = prev_grid_count + exclusive[ti]; - inclusive[ti] = prev_grid_count + inclusive[ti]; - } +template < size_t block_size > +using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< + detail::cuda::grid_scan_max_items_per_thread::value+1, + integer::LessEqual::value>>; - if (last_block) { - for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { - while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); - } - } - } -} template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) @@ -208,7 +38,7 @@ __global__ void indexlist(Real_ptr x, Index_type* len, Index_type iend) { - // blocks do start running in order in cuda and hip, so a block with a higher + // blocks do start running in order in cuda, so a block with a higher // index can wait on a block with a lower index without deadlocking // (replace with an atomicInc if this changes) const int block_id = blockIdx.x; @@ -228,7 +58,7 @@ __global__ void indexlist(Real_ptr x, Index_type exclusives[items_per_thread]; Index_type inclusives[items_per_thread]; - grid_scan( + detail::cuda::GridScan::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -246,7 +76,8 @@ __global__ void indexlist(Real_ptr x, } } -template < size_t block_size > + +template < size_t block_size, size_t items_per_thread > void INDEXLIST::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -270,18 +101,18 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) allocData(DataSpace::CudaDevice, grid_counts, grid_size); unsigned* block_readys; allocData(DataSpace::CudaDevice, block_readys, grid_size); - cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - indexlist - <<>>( - x+ibegin, list+ibegin, - block_counts, grid_counts, block_readys, - len, iend-ibegin ); - cudaErrchk( cudaGetLastError() ); + cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, + res.get_stream()) ); + RPlaunchCudaKernel( (indexlist), + grid_size, block_size, + shmem_size, res.get_stream(), + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, iend-ibegin ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); m_len = 
*len; @@ -299,7 +130,98 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INDEXLIST, Cuda) + +void INDEXLIST::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + using cuda_items_per_thread = cuda_items_per_thread_type; + + if (camp::size::value == 0) { + + if (tune_idx == t) { + + runCudaVariantImpl::value + >(vid); + + } + + t += 1; + + } + + seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + if (tune_idx == t) { + + runCudaVariantImpl(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n INDEXLIST : Unknown Cuda variant id = " << vid << std::endl; + + } +} + +void INDEXLIST::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + using cuda_items_per_thread = cuda_items_per_thread_type; + + if (camp::size::value == 0) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_" + "block_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 9b0555057..3be527d35 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -13,11 +13,7 @@ #if defined(RAJA_ENABLE_HIP) #include "common/HipDataUtils.hpp" - -#include -#include -#include -#include +#include "common/HipGridScan.hpp" #include @@ -26,177 +22,11 @@ namespace rajaperf namespace basic { - // - // Define magic numbers for HIP execution - // - const size_t warp_size = 64; - const size_t items_per_thread = 8; - - -// perform a grid scan on val and returns the result at each thread -// in exclusive and inclusive, note that val is used as scratch space -template < size_t block_size, size_t items_per_thread > -__device__ void grid_scan(const int block_id, - Index_type (&val)[items_per_thread], - Index_type (&exclusive)[items_per_thread], - Index_type (&inclusive)[items_per_thread], - Index_type* block_counts, - Index_type* grid_counts, - unsigned* block_readys) -{ - const bool first_block = (block_id == 0); - const bool last_block = (block_id == static_cast(gridDim.x-1)); - const bool last_thread = (threadIdx.x == block_size-1); - const bool last_warp = (threadIdx.x >= block_size - warp_size); - const int warp_index = (threadIdx.x % warp_size); - const unsigned long long warp_index_mask = (1ull << warp_index); - const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull); - - using BlockScan = rocprim::block_scan; //, rocprim::block_scan_algorithm::reduce_then_scan>; - using BlockExchange = rocprim::block_exchange; - using WarpReduce = rocprim::warp_reduce; - - union SharedStorage { - typename BlockScan::storage_type block_scan_storage; - typename BlockExchange::storage_type block_exchange_storage; - typename 
WarpReduce::storage_type warp_reduce_storage; - volatile Index_type prev_grid_count; - }; - __shared__ SharedStorage s_temp_storage; - - - BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage); - __syncthreads(); - - - BlockScan().exclusive_scan(val, exclusive, Index_type{0}, s_temp_storage.block_scan_storage); - __syncthreads(); - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - inclusive[ti] = exclusive[ti] + val[ti]; - } - - BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage); - __syncthreads(); - BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage); - __syncthreads(); - if (first_block) { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready - } - - } else { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 1u); // write block_counts is ready - } - - // get prev_grid_count using last warp in block - if (last_warp) { - - Index_type prev_grid_count = 0; - - // accumulate previous block counts into registers of warp - - int prev_block_base_id = block_id - warp_size; - - unsigned prev_block_ready = 0u; - unsigned long long prev_blocks_ready_ballot = 0ull; - unsigned long long prev_grids_ready_ballot = 0ull; - - // accumulate full warp worths of block counts - // stop if run out of full warps of a grid count is ready - while (prev_block_base_id >= 0) { - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - do { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - - prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); - - } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); - - prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); - - if (prev_grids_ready_ballot != 0ull) { - break; - } - - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - - prev_block_ready = 0u; - - prev_block_base_id -= warp_size; - } - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - // this checks that block counts is ready for all blocks above - // the highest grid count that is ready - while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { - - if (prev_block_id >= 0) { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - } - - prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); - prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); - } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // read one grid_count from a block with id grid_count_ready_id - // and read the block_counts from blocks with higher ids. 
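[Editor's sketch] The ballot-driven wait loop being deleted here (and its CUDA twin earlier in this diff) implements a decoupled look-back handshake, now centralized in common/CudaGridScan.hpp and common/HipGridScan.hpp per the new includes. A minimal single-flag sketch of the release/acquire signaling it relies on, with the per-warp ballot batching stripped out; names are hypothetical and Index_type is a stand-in for RAJAPerf's type:

    using Index_type = long int;  // stand-in for RAJAPerf's Index_type

    // Flag states, as in the code above: 0 = not ready,
    // 1 = block sum ready, 2 = grid prefix ready.
    __device__ void publish_block_sum(Index_type* block_counts,
                                      unsigned* block_readys,
                                      int block_id, Index_type sum)
    {
      block_counts[block_id] = sum;  // write payload first
      __threadfence();               // release: payload visible before flag
      atomicExch(&block_readys[block_id], 1u);
    }

    __device__ Index_type wait_for_block_sum(const Index_type* block_counts,
                                             unsigned* block_readys,
                                             int prev_block_id)
    {
      // atomicCAS with matching compare/swap values is an atomic load
      // (the 11u/11u idiom above); spin until the flag leaves state 0.
      while (atomicCAS(&block_readys[prev_block_id], 11u, 11u) == 0u) { }
      __threadfence();               // acquire: flag seen before payload read
      return block_counts[prev_block_id];
    }

The deleted code amortizes this wait across a full warp with __ballot/__ballot_sync so up to warp_size predecessor counts are consumed per iteration, and stops early once any predecessor publishes a full grid prefix (state 2).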
- if (warp_index_mask > prev_grids_ready_ballot) { - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // accumulate grid_count for grid_count_ready_id - prev_grid_count += grid_counts[prev_block_id]; - } - - - WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); - prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp - - if (last_thread) { - - if (!last_block) { - grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - } - - s_temp_storage.prev_grid_count = prev_grid_count; - } - } - - __syncthreads(); - Index_type prev_grid_count = s_temp_storage.prev_grid_count; - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - exclusive[ti] = prev_grid_count + exclusive[ti]; - inclusive[ti] = prev_grid_count + inclusive[ti]; - } +template < size_t block_size > +using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< + detail::hip::grid_scan_max_items_per_thread::value+1, + integer::LessEqual::value>>; - if (last_block) { - for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { - while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); - } - } - } -} template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) @@ -208,9 +38,9 @@ __global__ void indexlist(Real_ptr x, Index_type* len, Index_type iend) { - // blocks do start running in order in cuda and hip, so a block with a higher - // index can wait on a block with a lower index without deadlocking - // (replace with an atomicInc if this changes) + // It looks like blocks do not start running in order in hip, so a block + // with a higher index can't wait on a block with a lower index without + // deadlocking (have to replace with an atomicInc) const int block_id = blockIdx.x; Index_type vals[items_per_thread]; @@ -228,7 +58,7 @@ __global__ void indexlist(Real_ptr x, Index_type exclusives[items_per_thread]; Index_type inclusives[items_per_thread]; - grid_scan( + detail::hip::GridScan::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -246,7 +76,7 @@ __global__ void indexlist(Real_ptr x, } } -template < size_t block_size > +template < size_t block_size, size_t items_per_thread > void INDEXLIST::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -263,25 +93,26 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) const size_t shmem_size = 0; Index_type* len; - allocData(DataSpace::HipPinned, len, 1); + allocData(DataSpace::HipPinnedCoarse, len, 1); Index_type* block_counts; allocData(DataSpace::HipDevice, block_counts, grid_size); Index_type* grid_counts; allocData(DataSpace::HipDevice, grid_counts, grid_size); unsigned* block_readys; allocData(DataSpace::HipDevice, block_readys, grid_size); - hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - indexlist - <<>>( - x+ibegin, list+ibegin, - block_counts, grid_counts, block_readys, - len, iend-ibegin ); - hipErrchk( hipGetLastError() ); + 
hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, + res.get_stream()) ); + + RPlaunchHipKernel( (indexlist), + grid_size, block_size, + shmem_size, res.get_stream(), + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, iend-ibegin ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); m_len = *len; @@ -289,7 +120,7 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocData(DataSpace::HipPinned, len); + deallocData(DataSpace::HipPinnedCoarse, len); deallocData(DataSpace::HipDevice, block_counts); deallocData(DataSpace::HipDevice, grid_counts); deallocData(DataSpace::HipDevice, block_readys); @@ -299,7 +130,98 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INDEXLIST, Hip) + +void INDEXLIST::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + using hip_items_per_thread = hip_items_per_thread_type; + + if (camp::size::value == 0) { + + if (tune_idx == t) { + + runHipVariantImpl::value + >(vid); + + } + + t += 1; + + } + + seq_for(hip_items_per_thread{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + if (tune_idx == t) { + + runHipVariantImpl(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n INDEXLIST : Unknown Hip variant id = " << vid << std::endl; + + } +} + +void INDEXLIST::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + using hip_items_per_thread = hip_items_per_thread_type; + + if (camp::size::value == 0) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + seq_for(hip_items_per_thread{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_" + "block_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index cb559c8b2..0336d0643 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -28,9 +28,11 @@ INDEXLIST::INDEXLIST(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 + // about 50% output - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Index_type) + + 1*sizeof(Int_type) * getActualProblemSize() / 2 ); // about 50% output + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index 0836d8197..bdf00a446 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -1,7 +1,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-22, Lawrence Livermore National 
Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. +// See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// @@ -63,14 +63,15 @@ class INDEXLIST : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > + + template < size_t block_size, size_t items_per_thread > void runCudaVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t block_size, size_t items_per_thread > void runHipVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Int_ptr m_list; diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 7b6a9ade6..e95524201 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -101,9 +101,11 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - indexlist_conditional<<>>( - x, counts, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (indexlist_conditional), + grid_size, block_size, + shmem, stream, + x, counts, iend ); cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, @@ -114,9 +116,10 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) scan_size, stream)); - indexlist_make_list<<>>( - list, counts, len, iend ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (indexlist_make_list), + grid_size, block_size, + shmem, stream, + list, counts, len, iend ); cudaErrchk( cudaStreamSynchronize(stream) ); m_len = *len; @@ -133,34 +136,42 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_CUDA; + Index_type* len; + allocData(DataSpace::CudaPinned, len, 1); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; }); - RAJA::exclusive_scan_inplace< RAJA::cuda_exec >( res, - RAJA::make_span(counts+ibegin, iend+1-ibegin)); + RAJA::exclusive_scan_inplace< + RAJA::cuda_exec >( + res, + RAJA::make_span(counts+ibegin, iend+1-ibegin) ); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; + } + if (i == iend-1) { + *len = counts[i+1]; } }); - m_len = len.get(); + res.wait(); + m_len = *len; } stopTimer(); + deallocData(DataSpace::CudaPinned, len); + INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA; } else { diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index b4d0d26f8..e1de399a0 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -74,7 +74,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_HIP; Index_type* len; - allocData(DataSpace::HipPinned, len, 1); + allocData(DataSpace::HipPinnedCoarse, len, 1); hipStream_t stream = res.get_stream(); @@ -112,9 +112,11 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, shmem, stream, - x, counts, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (indexlist_conditional), + grid_size, block_size, + shmem, stream, + x, counts, iend ); #if defined(__HIPCC__) hipErrchk(::rocprim::exclusive_scan(d_temp_storage, @@ -136,9 +138,10 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) stream)); #endif - hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, shmem, stream, - list, counts, len, iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (indexlist_make_list), + grid_size, block_size, + shmem, stream, + list, counts, len, iend ); hipErrchk( hipStreamSynchronize(stream) ); m_len = *len; @@ -147,7 +150,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) stopTimer(); deallocData(DataSpace::HipDevice, temp_storage); - deallocData(DataSpace::HipPinned, len); + deallocData(DataSpace::HipPinnedCoarse, len); INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; @@ -155,34 +158,42 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_HIP; + Index_type* len; + allocData(DataSpace::HipPinnedCoarse, len, 1); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; }); - RAJA::exclusive_scan_inplace< RAJA::hip_exec >( res, - RAJA::make_span(counts+ibegin, iend+1-ibegin)); + RAJA::exclusive_scan_inplace< + RAJA::hip_exec >( + res, + RAJA::make_span(counts+ibegin, iend+1-ibegin) ); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; + } + if (i == iend-1) { + *len = counts[i+1]; } }); - m_len = len.get(); + res.wait(); + m_len = *len; } stopTimer(); + deallocData(DataSpace::HipPinnedCoarse, len); + INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; } else { diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index d84736ef7..57cb14c23 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -203,8 +203,6 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -219,11 +217,10 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG [=](Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; } }); - m_len = len.get(); + m_len = counts[iend]; } stopTimer(); diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index 9de3f3393..3828c5652 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -117,8 +117,6 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -133,11 +131,10 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu [=](Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; } }); - m_len = len.get(); + m_len = counts[iend]; } stopTimer(); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 49117dc66..1759f10b0 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -28,14 +28,19 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setItsPerRep( 3 * getActualProblemSize() + 1 ); setKernelsPerRep(3); - setBytesPerRep( (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + - (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (1*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + + 1*sizeof(Index_type) + + 1*sizeof(Index_type) * (getActualProblemSize()+1) + - (0*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + - (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 ); // about 50% output + 1*sizeof(Index_type) * (getActualProblemSize()+1) ); + setBytesWrittenPerRep( 1*sizeof(Index_type) * getActualProblemSize() + + + 1*sizeof(Index_type) + + 1*sizeof(Index_type) * (getActualProblemSize()+1) + + + 1*sizeof(Int_type) * (getActualProblemSize()+1) / 2 ); // about 50% output + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index e19ee5508..5cd2ac8ab 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -1,7 +1,7 @@ 
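[Editor's sketch] The Seq and OMP hunks above drop the RAJA::ReduceSum and read the list length straight off the scan: since counts[] holds 0/1 flags before the exclusive scan, the value scanned into counts[iend] is exactly the number of accepted indices. A small host-only restatement of that identity (names hypothetical, C++17):

    #include <cstddef>
    #include <numeric>   // std::exclusive_scan
    #include <vector>

    // Build an index list of elements satisfying pred; returns list length.
    template <typename Pred>
    std::size_t make_index_list(const std::vector<double>& x,
                                std::vector<std::size_t>& list, Pred pred)
    {
      const std::size_t n = x.size();
      std::vector<std::size_t> counts(n + 1, 0);
      for (std::size_t i = 0; i < n; ++i) counts[i] = pred(x[i]) ? 1 : 0;

      // After the scan, counts[n] holds the total: no separate reduction.
      std::exclusive_scan(counts.begin(), counts.end(), counts.begin(),
                          std::size_t{0});

      list.resize(counts[n]);
      for (std::size_t i = 0; i < n; ++i) {
        if (counts[i] != counts[i + 1]) list[counts[i]] = i;
      }
      return counts[n];  // same role as m_len = counts[iend] above
    }

The GPU RAJA variants above do the same thing, but since counts lives in device memory, the last thread copies counts[i+1] into a pinned len slot that the host reads after res.wait().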
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
-// See the RAJAPerf/COPYRIGHT file for details.
+// See the RAJAPerf/LICENSE file for details.
 //
 // SPDX-License-Identifier: (BSD-3-Clause)
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
@@ -74,6 +74,7 @@ class INDEXLIST_3LOOP : public KernelBase
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
@@ -81,7 +82,7 @@ class INDEXLIST_3LOOP : public KernelBase
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Int_ptr m_list;
diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp
index a6f61d73a..8b3cb9bb7 100644
--- a/src/basic/INIT3-Cuda.cpp
+++ b/src/basic/INIT3-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -53,9 +53,13 @@ void INIT3::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      init3<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( out1, out2, out3, in1, in2,
-                                                                             iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (init3<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          out1, out2, out3,
+                          in1, in2,
+                          iend );
 
     }
     stopTimer();
@@ -65,13 +69,18 @@
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      auto init3_lambda = [=] __device__ (Index_type i) {
+        INIT3_BODY;
+      };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      lambda_cuda_forall<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(
-        ibegin, iend, [=] __device__ (Index_type i) {
-          INIT3_BODY;
-        });
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (lambda_cuda_forall<block_size,
+                                              decltype(init3_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, init3_lambda );
 
     }
     stopTimer();
diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp
index 99f5eec2b..be0f2e74f 100644
--- a/src/basic/INIT3-Hip.cpp
+++ b/src/basic/INIT3-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
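[Editor's sketch] Across these hunks, raw chevron launches followed by cudaErrchk(cudaGetLastError()) are folded into a single RPlaunchCudaKernel call. The suite's own wrapper is not shown in this diff; a rough stand-in with a hypothetical name, illustrating only the launch-plus-immediate-error-check shape the call sites imply:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    inline void check_last_cuda_error(const char* file, int line)
    {
      cudaError_t code = cudaGetLastError();
      if (code != cudaSuccess) {
        std::fprintf(stderr, "CUDA error %s at %s:%d\n",
                     cudaGetErrorString(code), file, line);
        std::exit(1);
      }
    }

    // Hypothetical stand-in for RPlaunchCudaKernel: launch, then check.
    template <typename Kernel, typename... Args>
    void launch_checked(Kernel kernel, dim3 grid, dim3 block,
                        std::size_t shmem, cudaStream_t stream, Args... args)
    {
      kernel<<<grid, block, shmem, stream>>>(args...);
      check_last_cuda_error(__FILE__, __LINE__);
    }

Passing the kernel as a value (e.g. launch_checked(init3<256>, ...)) is what forces the extra parentheses around template-id arguments at the call sites above: without them, the comma inside the template argument list would split the argument.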
// @@ -53,9 +53,13 @@ void INIT3::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((init3), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out1, out2, out3, in1, in2, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (init3), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); } stopTimer(); @@ -71,9 +75,12 @@ void INIT3::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, init3_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, init3_lambda ); } stopTimer(); diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp index 25d31585c..346a92399 100644 --- a/src/basic/INIT3-OMP.cpp +++ b/src/basic/INIT3-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index 825730bdc..0caee8c80 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index 398e986b1..20feb79a4 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp new file mode 100644 index 000000000..ea5277730 --- /dev/null +++ b/src/basic/INIT3-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template +void INIT3::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + INIT3_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + INIT3_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + INIT3_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n INIT3 : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT3, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index bbf90da80..1f0da97f3 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ INIT3::INIT3(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); @@ -52,6 +54,9 @@ INIT3::INIT3(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index aed67bfeb..7e5f6a026 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
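[Editor's sketch] The new Base_SYCL variants in this diff all share one shape: round the global range up to a whole number of work-groups, then guard the tail work-items. A stripped-down, self-contained rendering of that pattern (SYCL 2020; the kernel body here is a trivial stand-in for the INIT3_BODY macro):

    #include <sycl/sycl.hpp>

    int main()
    {
      constexpr size_t work_group_size = 256;
      const size_t iend = 1000;   // deliberately not a group-size multiple

      sycl::queue q;
      double* out = sycl::malloc_shared<double>(iend, q);

      // Round up so every work-group is full; extras are masked below.
      const size_t global_size =
          work_group_size * ((iend + work_group_size - 1) / work_group_size);

      q.submit([&](sycl::handler& h) {
        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
                       [=](sycl::nd_item<1> item) {
          const size_t i = item.get_global_id(0);
          if (i < iend) {     // tail guard, as in the Base_SYCL hunks
            out[i] = 0.0;     // stand-in for INIT3_BODY
          }
        });
      }).wait();

      sycl::free(out, q);
      return 0;
    }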
// @@ -55,18 +55,24 @@ class INIT3 : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index ca6fbdf3c..e535ac041 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,8 +53,11 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - initview1d<<>>( a, v, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (initview1d), + grid_size, block_size, + shmem, res.get_stream(), + a, v, iend ); } stopTimer(); @@ -64,13 +67,18 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto initview1d_lambda = [=] __device__ (Index_type i) { + INIT_VIEW1D_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - INIT_VIEW1D_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_lambda ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 0951d954f..130a62a42 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
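[Editor's sketch] The Lambda_CUDA hunks now hoist the device lambda into a named variable so its type can appear in the launch wrapper's template arguments via decltype. The lambda_cuda_forall kernel itself is just a bounds-guarded trampoline; a hedged reconstruction of its shape (the suite's real definition lives in the common headers, and Index_type is a stand-in here):

    using Index_type = long int;  // stand-in for RAJAPerf's Index_type

    // Generic forall trampoline: one thread per index, body run in range.
    template <size_t block_size, typename Lambda>
    __launch_bounds__(block_size)
    __global__ void lambda_forall(Index_type ibegin, Index_type iend,
                                  Lambda body)
    {
      Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x;
      if (i < iend) {
        body(i);
      }
    }

Hoisting the lambda also keeps Base and Lambda variants symmetric: both now go through the same RPlaunch* call shape, differing only in what is passed as the kernel.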
// @@ -53,9 +53,11 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - a, v, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (initview1d), + grid_size, block_size, + shmem, res.get_stream(), + a, v, iend ); } stopTimer(); @@ -71,9 +73,12 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_lambda ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp index 742270ff6..52160ab13 100644 --- a/src/basic/INIT_VIEW1D-OMP.cpp +++ b/src/basic/INIT_VIEW1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index d9ad636e1..825fcd569 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp index 59c494c49..b1c761a5c 100644 --- a/src/basic/INIT_VIEW1D-Seq.cpp +++ b/src/basic/INIT_VIEW1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp new file mode 100644 index 000000000..ff06d2203 --- /dev/null +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -0,0 +1,83 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + +template +void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + INIT_VIEW1D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + INIT_VIEW1D_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + INIT_VIEW1D_VIEW_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); + + } + stopTimer(); + + } else { + std::cout << "\n INIT_VIEW1D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT_VIEW1D, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 018811f34..eb24e8e8e 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); @@ -53,6 +55,9 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index f3770f69a..0a3be36c3 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
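[Editor's sketch] The recurring setBytesPerRep replacement in this diff splits one lump sum into bytes read, bytes written, and bytes atomically modified per repetition, making the memory-traffic model explicit per direction. For INIT_VIEW1D above, the accounting is a pure store of one Real_type per element; restated as trivial arithmetic (this is a back-of-envelope illustration, not suite code):

    #include <cstddef>

    using Real_type = double;

    // INIT_VIEW1D per-rep traffic: nothing read, one store per element,
    // no atomic read-modify-writes.
    constexpr std::size_t bytes_read(std::size_t)      { return 0; }
    constexpr std::size_t bytes_written(std::size_t n) { return n * sizeof(Real_type); }
    constexpr std::size_t bytes_atomic(std::size_t)    { return 0; }

    static_assert(bytes_written(100) == 800,
                  "8-byte Real_type, exactly one store per element");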
// @@ -66,18 +66,24 @@ class INIT_VIEW1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_type m_val; diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 7d9bee43b..3a13f5210 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -54,10 +54,12 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - initview1d_offset<<>>( a, v, - ibegin, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (initview1d_offset), + grid_size, block_size, + shmem, res.get_stream(), + a, v, + ibegin, iend ); } stopTimer(); @@ -67,13 +69,18 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto initview1d_offset_lambda = [=] __device__ (Index_type i) { + INIT_VIEW1D_OFFSET_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - INIT_VIEW1D_OFFSET_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_offset_lambda ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 2fb16872f..2940bb59d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -54,9 +54,12 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - a, v, ibegin, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (initview1d_offset), + grid_size, block_size, + shmem, res.get_stream(), + a, v, + ibegin, iend ); } stopTimer(); @@ -72,9 +75,12 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_offset_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_offset_lambda ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp index 8fb7c0129..bb6834c17 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index d045462d7..f87fa2625 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp index c25511aa1..b7588350a 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp new file mode 100644 index 000000000..f586540f6 --- /dev/null +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template +void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize()+1; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + INIT_VIEW1D_OFFSET_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend-ibegin, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = ibegin + item.get_global_id(0); + if (i < iend) { + INIT_VIEW1D_OFFSET_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + INIT_VIEW1D_OFFSET_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT_VIEW1D_OFFSET, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 4daa109a6..1eef8fc3d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); @@ -53,6 +55,9 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index d32f59c7b..92a75935d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
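[Editor's sketch] Both the hand-written runCudaVariant/runHipVariant bodies earlier in this diff and the RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE macro used by the new SYCL files walk a compile-time list of sizes and count matches until the requested tune_idx is reached; set*TuningDefinitions must perform the identical walk so registered tuning names line up with run order. A plain C++17 sketch of that enumeration scheme, with seq_for and the RunParams filtering omitted and hypothetical names:

    #include <cstddef>
    #include <iostream>

    template <std::size_t block_size>
    void run_impl() { std::cout << "running block_" << block_size << "\n"; }

    // Visit each compile-time candidate in order; fire the one whose
    // position matches tune_idx. Mirrors seq_for(gpu_block_sizes_type{}, ...).
    template <std::size_t... Sizes>
    void run_variant(std::size_t tune_idx)
    {
      std::size_t t = 0;
      ((tune_idx == t++ ? run_impl<Sizes>() : void()), ...);
    }

    int main()
    {
      run_variant<128, 256, 512>(1);  // selects block_256
      return 0;
    }

The INDEXLIST variants extend the same counter over a second axis, items_per_thread, which is why a single tune_idx can address a (block_size, items_per_thread) pair.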
// @@ -65,18 +65,24 @@ class INIT_VIEW1D_OFFSET : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_type m_val; diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index f63af21d7..926c5f979 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -50,7 +50,7 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, template < size_t block_size > void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) { - constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + constexpr Index_type tile_size = integer::sqrt(block_size); static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); @@ -73,9 +73,10 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - mat_mat_shared<<>>(N, C, A, B); - - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (mat_mat_shared), + gridDim, blockDim, + shmem, res.get_stream(), + N, C, A, B ); } stopTimer(); @@ -84,7 +85,8 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - lambda_cuda<<>>([=] __device__() { + auto mat_mat_shared_lambda = [=] __device__() { + auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { MAT_MAT_SHARED_BODY_0(tile_size) @@ -171,9 +173,13 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) Index_type by = blockIdx.y; if(by < Ny) outer_y(by); } - }); + }; - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (lambda_cuda), + gridDim, blockDim, + shmem, res.get_stream(), + mat_mat_shared_lambda ); } stopTimer(); diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index d548395e3..9c58d9267 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -50,7 +50,7 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, template < size_t block_size > void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) { - constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + constexpr Index_type tile_size = integer::sqrt(block_size); static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); @@ -73,10 +73,10 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((mat_mat_shared), dim3(gridDim), dim3(blockDim), shmem, res.get_stream(), - N, C, A, B); - - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (mat_mat_shared), + gridDim, blockDim, + shmem, res.get_stream(), + N, C, A, B ); } stopTimer(); @@ -85,7 
+85,7 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - auto mat_mat_shared_lam = [=] __device__() { + auto mat_mat_shared_lambda = [=] __device__() { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { @@ -175,10 +175,11 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((lambda_hip), - gridDim, blockDim, shmem, res.get_stream(), mat_mat_shared_lam); - - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (lambda_hip), + gridDim, blockDim, + shmem, res.get_stream(), + mat_mat_shared_lambda ); } stopTimer(); diff --git a/src/basic/MAT_MAT_SHARED-Sycl.cpp b/src/basic/MAT_MAT_SHARED-Sycl.cpp new file mode 100644 index 000000000..174ac0952 --- /dev/null +++ b/src/basic/MAT_MAT_SHARED-Sycl.cpp @@ -0,0 +1,201 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_MAT_SHARED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +template < size_t work_group_size > +void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) +{ + constexpr Index_type tile_size = integer::sqrt(work_group_size); + static_assert(tile_size*tile_size == work_group_size, "Invalid block_size"); + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + + const Index_type Nx = RAJA_DIVIDE_CEILING_INT(N, tile_size); + const Index_type Ny = RAJA_DIVIDE_CEILING_INT(N, tile_size); + + //Right most is the fastest index + const ::sycl::range<3> workGroupSize(1, tile_size, tile_size); + const ::sycl::range<3> gridSize(1, Ny*tile_size, Nx*tile_size); + + constexpr size_t shmem = tile_size * tile_size; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MAT_MAT_SHARED_DATA_SETUP; + + if (vid == Base_SYCL) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&](cl::sycl::handler& h) { + + ::sycl::local_accessor As(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (cl::sycl::nd_item<3> itm) { + + Index_type tx = itm.get_local_id(2); + Index_type ty = itm.get_local_id(1); + Index_type bx = itm.get_group(2); + Index_type by = itm.get_group(1); + + MAT_MAT_SHARED_BODY_1(tile_size) + + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { + + MAT_MAT_SHARED_BODY_2(tile_size) + + itm.barrier(::sycl::access::fence_space::local_space); + + MAT_MAT_SHARED_BODY_3(tile_size) + + itm.barrier(::sycl::access::fence_space::local_space); + } + + MAT_MAT_SHARED_BODY_4(tile_size) + + }); + + }); + + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + constexpr bool async = true; + + const int local_mats = 3; + constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(double); + + using launch_policy = RAJA::LaunchPolicy>; + + using teams_x = RAJA::LoopPolicy; + + using teams_y = RAJA::LoopPolicy; + + using threads_x = RAJA::LoopPolicy; + + using threads_y = 
RAJA::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(Nx, Ny), + RAJA::Threads(tile_size, tile_size), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, Ny), + [&](Index_type by) { + RAJA::loop(ctx, RAJA::RangeSegment(0, Nx), + [&](Index_type bx) { + + //We only support dynamic shared memory in Sycl + //Thus requiring a different setup than other backends + //which use static shared memory + double * As_ptr = ctx.getSharedMemory(tile_size * tile_size); + double * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); + double * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); + double (*As)[tile_size] = (double (*)[tile_size]) As_ptr; + double (*Bs)[tile_size] = (double (*)[tile_size]) Bs_ptr; + double (*Cs)[tile_size] = (double (*)[tile_size]) Cs_ptr; + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_1(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, + RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_2(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + } // for (k) + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_4(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + } // lambda (bx) + ); // RAJA::loop + } // lambda (by) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + } else { + getCout() << "\n MAT_MAT_SHARED : Unknown Sycl variant id = " << vid + << std::endl; + } + +} + + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MAT_MAT_SHARED, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 2173a1bc6..61a85e898 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -24,15 +24,16 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) setDefaultProblemSize(m_N_default*m_N_default); setDefaultReps(5); - m_N = std::max(Index_type(std::sqrt(getTargetProblemSize())), Index_type(1)); + m_N = std::sqrt(getTargetProblemSize()) + std::sqrt(2)-1; setActualProblemSize(m_N * m_N); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( m_N*m_N*sizeof(Real_type) + - m_N*m_N*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * m_N*m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_N*m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); const Index_type no_tiles = (TL_SZ + m_N - 1) / TL_SZ; const Index_type no_blocks = RAJA_DIVIDE_CEILING_INT(m_N, TL_SZ); @@ -60,6 +61,9 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) setVariantDefined(Base_HIP); setVariantDefined(Lambda_HIP); setVariantDefined(RAJA_HIP); + + setVariantDefined(Base_SYCL); + setVariantDefined(RAJA_SYCL); } 
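// Editorial note (not part of the patch): the constructor hunk above replaces
// the single setBytesPerRep() figure with separate read / write / atomic
// streams. A minimal sketch of the arithmetic for an N x N double-precision
// matmul; N and the element type are illustrative assumptions:

#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t N = 1000;                    // hypothetical m_N
  const std::size_t elem = sizeof(double);       // Real_type assumed double
  const std::size_t read    = 2 * elem * N * N;  // A and B each read once
  const std::size_t written = 1 * elem * N * N;  // C written once
  std::printf("per rep: %zu bytes read, %zu bytes written, 0 atomic\n",
              read, written);
  return 0;
}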
MAT_MAT_SHARED::~MAT_MAT_SHARED() {} diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 095721c27..b543dd4f7 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -94,7 +94,7 @@ constexpr rajaperf::Index_type TL_SZ = 16; RAJA_TEAM_SHARED double Bs[tile_size][tile_size]; \ RAJA_TEAM_SHARED double Cs[tile_size][tile_size]; -#define MAT_MAT_SHARED_BODY_1(tile_size) \ +#define MAT_MAT_SHARED_BODY_1(tile_size) \ Cs[ty][tx] = 0; #define MAT_MAT_SHARED_BODY_2(tile_size) \ @@ -139,17 +139,22 @@ class MAT_MAT_SHARED : public KernelBase { void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = TL_SZ * TL_SZ; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_A; Real_ptr m_B; diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 3f0dec4dd..e39cd5a77 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
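// Editorial sketch (not part of the patch): the runCudaVariantImpl /
// runHipVariantImpl / runSyclVariantImpl templates declared above are
// instantiated once per entry of a compile-time size list. The suite walks
// such lists with camp::list and seq_for; this stand-alone analogue uses
// std::index_sequence and a fold expression instead:

#include <cstddef>
#include <cstdio>
#include <utility>

template < std::size_t block_size >
void run_variant_impl() { std::printf("block_size %zu\n", block_size); }

template < std::size_t... Sizes >
void dispatch(std::size_t requested, std::index_sequence<Sizes...>)
{
  // Invoke the one instantiation whose compile-time size matches the request.
  ((requested == Sizes ? run_variant_impl<Sizes>() : void()), ...);
}

int main()
{
  dispatch(256, std::index_sequence<64, 128, 256, 512>{});  // -> block_size 256
  return 0;
}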
// @@ -53,9 +53,13 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - muladdsub<<>>( out1, out2, out3, in1, in2, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (muladdsub), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); } stopTimer(); @@ -65,13 +69,18 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto muladdsub_lambda = [=] __device__ (Index_type i) { + MULADDSUB_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - MULADDSUB_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, muladdsub_lambda ); } stopTimer(); diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 9d292001f..bb846eef5 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,9 +53,13 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - out1, out2, out3, in1, in2, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (muladdsub), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); } stopTimer(); @@ -71,9 +75,12 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, muladdsub_lambda ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, muladdsub_lambda ); } stopTimer(); diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp index 6c9bb2038..28f5edab1 100644 --- a/src/basic/MULADDSUB-OMP.cpp +++ b/src/basic/MULADDSUB-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index af691d008..f4d0e716a 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
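// Editorial sketch (an assumption about the wrapper's role, not its actual
// definition): the MULADDSUB hunks above replace raw <<<...>>> and
// hipLaunchKernelGGL launches, each followed by an inline cudaErrchk /
// hipErrchk, with single RPlaunchCudaKernel / RPlaunchHipKernel calls.
// A minimal CUDA-side analogue of such a wrapper:

#include <cuda_runtime.h>
#include <cstdio>

template < typename Kernel, typename... Args >
void launch_checked(Kernel kernel, dim3 grid, dim3 block,
                    size_t shmem, cudaStream_t stream, Args... args)
{
  kernel<<<grid, block, shmem, stream>>>(args...);
  cudaError_t err = cudaGetLastError();  // the check the old call sites did inline
  if (err != cudaSuccess) {
    std::fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
  }
}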
// diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index 59ddf1ea1..40ede6d64 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp new file mode 100644 index 000000000..9bca65221 --- /dev/null +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template +void MULADDSUB::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MULADDSUB_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + MULADDSUB_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + MULADDSUB_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n MULADDSUB : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MULADDSUB, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index a5deb6049..4ab19194c 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
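// Editorial sketch (not part of the patch): the Base_SYCL variant above sizes
// its nd_range by rounding the iteration count up to a work-group multiple and
// guarding the overhang inside the kernel. The idiom in isolation; the kernel
// body and names are purely illustrative:

#include <sycl/sycl.hpp>

void scaled_copy(sycl::queue& q, double* y, const double* x, size_t n)
{
  constexpr size_t wg = 256;                       // assumed work-group size
  const size_t global = wg * ((n + wg - 1) / wg);  // RAJA_DIVIDE_CEILING_INT
  q.submit([&](sycl::handler& h) {
    h.parallel_for(sycl::nd_range<1>(global, wg), [=](sycl::nd_item<1> item) {
      size_t i = item.get_global_id(0);
      if (i < n) {          // work-items past n fall through harmlessly
        y[i] = 2.0 * x[i];
      }
    });
  });
}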
// @@ -28,7 +28,9 @@ MULADDSUB::MULADDSUB(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * getActualProblemSize()); setUsesFeature(Forall); @@ -52,6 +54,9 @@ MULADDSUB::MULADDSUB(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index e604a34c8..1846a49a7 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -58,18 +58,24 @@ class MULADDSUB : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp new file mode 100644 index 000000000..fa52f9e99 --- /dev/null +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -0,0 +1,302 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
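// Editorial note (reference semantics, written out): the MULADDSUB accounting
// above -- 2 reads, 3 writes, 3 FLOPs per element -- follows directly from the
// kernel body, shown here as a plain sequential loop:

void muladdsub_ref(double* out1, double* out2, double* out3,
                   const double* in1, const double* in2, long n)
{
  for (long i = 0; i < n; ++i) {
    out1[i] = in1[i] * in2[i];   // 1 multiply
    out2[i] = in1[i] + in2[i];   // 1 add
    out3[i] = in1[i] - in2[i];   // 1 subtract
  }
}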
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +constexpr Index_type warp_size = 32; + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values, + MULTI_REDUCE::Data_ptr data, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) +{ + if (shared_replication > 0) { + + extern __shared__ MULTI_REDUCE::Data_type shared_values[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_values[t] = MULTI_REDUCE::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); + RAJA::atomicAdd(&shared_values[offset], data[i]); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = MULTI_REDUCE::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_values[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != MULTI_REDUCE::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], block_sum); + } + } + + } else { + + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], data[i]); + } + } +} + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > +void MULTI_REDUCE::runCudaVariantAtomicRuntime(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + MULTI_REDUCE_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + auto* func = &multi_reduce_atomic_runtime; + + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; + + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + + const Index_type max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, global_replication); + + startTimer(); + for (RepIndex_type irep = 
0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, global_replication); + + RPlaunchCudaKernel( func, + grid_size, block_size, + shmem, res.get_stream(), + values, + data, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type value_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + value_final += hvalues[offset]; + } + values_final[bin] = value_final; + } + + } + stopTimer(); + + RAJAPERF_CUDA_REDUCER_TEARDOWN(values, hvalues); + + } else if ( vid == RAJA_CUDA ) { + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< + RAJA::cuda::MultiReduceTuning< + RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES_RAJA(multi_reduce_policy); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + MULTI_REDUCE_BODY; + }); + + MULTI_REDUCE_FINALIZE_VALUES_RAJA(multi_reduce_policy); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } + +} + +void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + } + + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + }); + + } + + }); + + }); + + } + + }); + + } else { + + getCout() << "\n MULTI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void MULTI_REDUCE::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + + seq_for(cuda_atomic_global_replications_type{}, [&](auto 
global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); + + }); + + } + + }); + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp new file mode 100644 index 000000000..e2106a79e --- /dev/null +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -0,0 +1,302 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +constexpr Index_type warp_size = 64; + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values, + MULTI_REDUCE::Data_ptr data, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) +{ + if (shared_replication > 0) { + + extern __shared__ MULTI_REDUCE::Data_type shared_values[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_values[t] = MULTI_REDUCE::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); + RAJA::atomicAdd(&shared_values[offset], data[i]); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = MULTI_REDUCE::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_values[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != MULTI_REDUCE::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], block_sum); + } + } + + } else { + + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], data[i]); + } + } +} + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > +void MULTI_REDUCE::runHipVariantAtomicRuntime(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + MULTI_REDUCE_DATA_SETUP; + + if ( vid == Base_HIP ) { + + auto* func = 
&multi_reduce_atomic_runtime; + + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; + + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + + const Index_type max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, global_replication); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, global_replication); + + RPlaunchHipKernel( func, + grid_size, block_size, + shmem, res.get_stream(), + values, + data, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type value_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + value_final += hvalues[offset]; + } + values_final[bin] = value_final; + } + + } + stopTimer(); + + RAJAPERF_HIP_REDUCER_TEARDOWN(values, hvalues); + + } else if ( vid == RAJA_HIP ) { + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< + RAJA::hip::MultiReduceTuning< + RAJA::hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES_RAJA(multi_reduce_policy); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + MULTI_REDUCE_BODY; + }); + + MULTI_REDUCE_FINALIZE_VALUES_RAJA(multi_reduce_policy); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } + +} + +void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + } + + seq_for(hip_atomic_global_replications_type{}, [&](auto 
global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + }); + + } + + }); + + }); + + } + + }); + + } else { + + getCout() << "\n MULTI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void MULTI_REDUCE::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); + + }); + + } + + }); + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/MULTI_REDUCE-OMP.cpp b/src/basic/MULTI_REDUCE-OMP.cpp new file mode 100644 index 000000000..2e2ebf5d4 --- /dev/null +++ b/src/basic/MULTI_REDUCE-OMP.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
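// Editorial sketch (not part of the patch): the CUDA and HIP kernels above fan
// atomic updates out over `global_replication` copies of each bin and sum the
// copies afterwards; replication counts are forced to powers of two so that
// RAJA::power_of_2_mod reduces to a bit mask. Plain C++ analogue
// (std::atomic<double>::fetch_add requires C++20), with std::atomic standing
// in for device atomics:

#include <atomic>
#include <vector>

void binned_sum(std::vector<std::atomic<double>>& replicas, // num_bins * replication slots
                const int* bins, const double* data, long n,
                int num_bins, int replication /* power of two */)
{
  for (long i = 0; i < n; ++i) {   // imagine iterations running concurrently
    int r = static_cast<int>(i) & (replication - 1);  // power_of_2_mod
    replicas[bins[i] + r * num_bins].fetch_add(data[i]);
  }
}

double finalize_bin(const std::vector<std::atomic<double>>& replicas,
                    int bin, int num_bins, int replication)
{
  double sum = 0.0;                // mirrors the REDUCER_COPY_BACK loop above
  for (int r = 0; r < replication; ++r) {
    sum += replicas[bin + r * num_bins].load();
  }
  return sum;
}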
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void MULTI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + MULTI_REDUCE_SETUP_VALUES; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + MULTI_REDUCE_BODY; + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + MULTI_REDUCE_TEARDOWN_VALUES; + + break; + } + + case Lambda_OpenMP : { + + MULTI_REDUCE_SETUP_VALUES; + + auto multi_reduce_base_lam = [=](Index_type i) { + #pragma omp atomic + MULTI_REDUCE_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + multi_reduce_base_lam(i); + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + MULTI_REDUCE_TEARDOWN_VALUES; + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::omp_multi_reduce); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + MULTI_REDUCE_BODY; + }); + + MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::omp_multi_reduce); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MULTI_REDUCE : Unknown variant id = " << vid << std::endl; + } + + } + + MULTI_REDUCE_DATA_TEARDOWN; + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE-OMPTarget.cpp b/src/basic/MULTI_REDUCE-OMPTarget.cpp new file mode 100644 index 000000000..8c2e18060 --- /dev/null +++ b/src/basic/MULTI_REDUCE-OMPTarget.cpp @@ -0,0 +1,68 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void MULTI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(values, values_init, num_bins); + + #pragma omp target is_device_ptr(values, bins, data) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + MULTI_REDUCE_BODY; + } + + getOpenMPDeviceData(values_final, values, num_bins); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; + } + + MULTI_REDUCE_DATA_TEARDOWN; + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/MULTI_REDUCE-Seq.cpp b/src/basic/MULTI_REDUCE-Seq.cpp new file mode 100644 index 000000000..a771953aa --- /dev/null +++ b/src/basic/MULTI_REDUCE-Seq.cpp @@ -0,0 +1,114 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void MULTI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + MULTI_REDUCE_SETUP_VALUES; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + for (Index_type i = ibegin; i < iend; ++i ) { + MULTI_REDUCE_BODY; + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + MULTI_REDUCE_TEARDOWN_VALUES; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + MULTI_REDUCE_SETUP_VALUES; + + auto multi_reduce_base_lam = [=](Index_type i) { + MULTI_REDUCE_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + for (Index_type i = ibegin; i < iend; ++i ) { + multi_reduce_base_lam(i); + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + MULTI_REDUCE_TEARDOWN_VALUES; + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::seq_multi_reduce); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + MULTI_REDUCE_BODY; + }); + + MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::seq_multi_reduce); + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n MULTI_REDUCE : Unknown variant id = " << vid << std::endl; + } + + } + + MULTI_REDUCE_DATA_TEARDOWN; + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp new file mode 100644 index 000000000..8fc6ee6c6 --- /dev/null +++ b/src/basic/MULTI_REDUCE.cpp @@ -0,0 +1,154 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
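// Editorial sketch (not part of the patch): every RAJA_* variant above follows
// the same three-step multi-reduce protocol -- construct from per-bin initial
// values, accumulate through operator[], harvest with get_all() -- which the
// INIT/FINALIZE macros in MULTI_REDUCE.hpp spell out. Isolated, with the
// sequential policy:

#include "RAJA/RAJA.hpp"
#include <vector>

void multi_reduce_demo(const int* bins, const double* data, long n,
                       const std::vector<double>& init,
                       std::vector<double>& final_values)
{
  RAJA::MultiReduceSum<RAJA::seq_multi_reduce, double> values(init);
  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
    [=](RAJA::Index_type i) {
      values[bins[i]] += data[i];   // MULTI_REDUCE_BODY
    });
  values.get_all(final_values);     // one reduced sum per bin
}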
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include +#include + +namespace rajaperf +{ +namespace basic +{ + + +MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) + : KernelBase(rajaperf::Basic_MULTI_REDUCE, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + m_num_bins = params.getMultiReduceNumBins(); + m_bin_assignment_algorithm = params.getMultiReduceBinAssignmentAlgorithm(); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins + + 1*sizeof(Data_type) * getActualProblemSize() + + 1*sizeof(Index_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(1 * getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); +} + +MULTI_REDUCE::~MULTI_REDUCE() +{ +} + +void MULTI_REDUCE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocData(m_bins, getActualProblemSize(), vid); + allocAndInitDataRandValue(m_data, getActualProblemSize(), vid); + { + auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); + + const bool init_random_per_iterate = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Random); + const bool init_random_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsRandomSizes); + const bool init_even_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsEvenSizes); + const bool init_all_one = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Single); + + if (init_even_sizes || init_random_sizes || init_all_one) { + Real_ptr data = nullptr; + if (init_even_sizes) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(b+1) / m_num_bins; + } + } else if (init_random_sizes) { + allocAndInitDataRandValue(data, m_num_bins, Base_Seq); + std::sort(data, data+m_num_bins); + } else if (init_all_one) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(0); + } + } + + Index_type actual_prob_size = getActualProblemSize(); + Index_type bin = 0; + for (Index_type i = 0; i < actual_prob_size; ++i) { + Real_type pos = static_cast(i) / actual_prob_size; + while (bin+1 < m_num_bins && pos >= data[bin]) { + bin += 1; + } + m_bins[i] = bin; + } + + deallocData(data, Base_Seq); + + } else if (init_random_per_iterate) { + Real_ptr data; + allocAndInitDataRandValue(data, getActualProblemSize(), Base_Seq); + + for (Index_type i = 0; i < getActualProblemSize(); ++i) { + m_bins[i] = static_cast(data[i] * m_num_bins); + if (m_bins[i] >= m_num_bins) { + m_bins[i] = m_num_bins - 1; + } + if (m_bins[i] < 0) { + m_bins[i] = 0; + } + } + + deallocData(data, Base_Seq); + } else { + throw 1; + } + } + + 
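// Editorial note on the setUp() logic above: m_bins is populated according to
// the requested BinAssignmentAlgorithm --
//   Single          : one contiguous run, every iterate mapped to a single bin;
//   RunsEvenSizes   : contiguous runs, all bins the same size;
//   RunsRandomSizes : contiguous runs with randomly drawn sizes;
//   Random          : each iterate draws its bin independently.
// The bare `throw 1;` is the fallback for an unrecognized algorithm value.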
m_values_init.resize(m_num_bins, 0.0); + m_values_final.resize(m_num_bins, 0.0); +} + +void MULTI_REDUCE::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_values_final.data(), m_num_bins, vid); +} + +void MULTI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_bins, vid); + deallocData(m_data, vid); + m_values_init.clear(); m_values_init.shrink_to_fit(); + m_values_final.clear(); m_values_final.shrink_to_fit(); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp new file mode 100644 index 000000000..cf8d99185 --- /dev/null +++ b/src/basic/MULTI_REDUCE.hpp @@ -0,0 +1,139 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// MULTI_REDUCE kernel reference implementation: +/// +/// double* values = calloc(num_bins, sizeof(double)); +/// for (Index_type i = 0; i < N; ++i ) { +/// values[bins[i]] += data[i]; +/// } +/// + +#ifndef RAJAPerf_Basic_MULTI_REDUCE_HPP +#define RAJAPerf_Basic_MULTI_REDUCE_HPP + +#define MULTI_REDUCE_DATA_SETUP \ + Index_type num_bins = m_num_bins; \ + Index_ptr bins = m_bins; \ + Data_ptr data = m_data; \ + std::vector& values_init = m_values_init; \ + std::vector& values_final = m_values_final; + +#define MULTI_REDUCE_DATA_TEARDOWN + + +#define MULTI_REDUCE_SETUP_VALUES \ + Data_ptr values; \ + allocData(getReductionDataSpace(vid), values, num_bins); + +#define MULTI_REDUCE_TEARDOWN_VALUES \ + deallocData(values, vid); + +#define MULTI_REDUCE_INIT_VALUES \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + values[b] = values_init[b]; \ + } + +#define MULTI_REDUCE_FINALIZE_VALUES \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + values_final[b] = values[b]; \ + } + +#define MULTI_REDUCE_INIT_VALUES_RAJA(policy) \ + RAJA::MultiReduceSum values(values_init); + +#define MULTI_REDUCE_FINALIZE_VALUES_RAJA(policy) \ + values.get_all(values_final); + +#define MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication) \ + for (Index_type b = 0; b < (num_bins); ++b) { \ + Data_type val_final = 0; \ + for (size_t r = 0; r < (replication); ++r) { \ + val_final += (hvalues)[b*(replication) + r]; \ + } \ + values_final[b] = val_final; \ + } + + +#define MULTI_REDUCE_BODY \ + values[bins[i]] += data[i]; + +#define MULTI_REDUCE_RAJA_BODY(policy) \ + RAJA::atomicAdd(&values[bins[i]], data[i]); + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class MULTI_REDUCE : public KernelBase +{ +public: + using Data_type = Real_type; + using Data_ptr = Real_ptr; + + MULTI_REDUCE(const RunParams& params); + + ~MULTI_REDUCE(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void 
setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > + void runCudaVariantAtomicRuntime(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > + void runHipVariantAtomicRuntime(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + static const size_t default_cuda_atomic_global_replication = 2; + static const size_t default_cuda_atomic_shared_replication = 16; + using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + + static const size_t default_hip_atomic_global_replication = 32; + static const size_t default_hip_atomic_shared_replication = 4; + using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + + Index_type m_num_bins; + RunParams::BinAssignmentAlgorithm m_bin_assignment_algorithm; + Index_ptr m_bins; + Data_ptr m_data; + std::vector m_values_init; + std::vector m_values_final; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 605778eb7..74e4136d2 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -89,10 +89,11 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) NESTED_INIT_NBLOCKS_CUDA; constexpr size_t shmem = 0; - nested_init - <<>>(array, - ni, nj, nk); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (nested_init), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + array, ni, nj, nk ); } stopTimer(); @@ -102,17 +103,23 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto nested_init_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + NESTED_INIT_BODY; + }; + NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; constexpr size_t shmem = 0; - nested_init_lam - <<>>(ni, nj, nk, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - } - ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (nested_init_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, nk, + nested_init_lambda ); } stopTimer(); @@ -136,10 +143,11 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }); diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index b7d023d7f..f7ea66dd4 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -89,10 +89,11 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) NESTED_INIT_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((nested_init), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - array, ni, nj, nk); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (nested_init), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + array, ni, nj, nk ); } stopTimer(); @@ -102,19 +103,23 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - NESTED_INIT_THREADS_PER_BLOCK_HIP; - NESTED_INIT_NBLOCKS_HIP; - constexpr size_t shmem = 0; - - auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, + auto nested_init_lambda = [=] __device__ (Index_type i, + Index_type j, Index_type k) { NESTED_INIT_BODY; }; - hipLaunchKernelGGL((nested_init_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - ni, nj, nk, nested_init_lambda); - hipErrchk( hipGetLastError() ); + NESTED_INIT_THREADS_PER_BLOCK_HIP; + NESTED_INIT_NBLOCKS_HIP; + constexpr size_t shmem = 0; + + RPlaunchHipKernel( + (nested_init_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, nk, + nested_init_lambda ); } stopTimer(); @@ -138,10 +143,11 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }); diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp index 3b1e07767..3fa73fa5b 100644 --- a/src/basic/NESTED_INIT-OMP.cpp +++ b/src/basic/NESTED_INIT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index 607c8befe..6e6538dfd 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp index d3ce50d65..bc277ce27 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp new file mode 100644 index 000000000..950a6b56b --- /dev/null +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -0,0 +1,109 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define work-group shape for SYCL execution + // +#define i_wg_sz (32) +#define j_wg_sz (work_group_size / i_wg_sz) +#define k_wg_sz (1) + +template +void NESTED_INIT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + NESTED_INIT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(k_wg_sz * RAJA_DIVIDE_CEILING_INT(nk, k_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, j_wg_sz), + i_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, i_wg_sz)); + sycl::range<3> wkgroup_dim(k_wg_sz, j_wg_sz, i_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (cl::sycl::handler& h) { + h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(2); + Index_type j = item.get_global_id(1); + Index_type k = item.get_global_id(0); + + if (i < ni && j < nj && k < nk) { + NESTED_INIT_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<2, RAJA::sycl_global_0, + RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<0, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, + [=] (Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n NESTED_INIT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(NESTED_INIT, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index fc64f5a0d..67d1d017b 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
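// Editorial note (not part of the patch): the i/j/k work-group macros above
// factor the 1-D tuning size into a 3-D shape; with the default
// work_group_size of 256 that is i_wg_sz = 32, j_wg_sz = 256/32 = 8,
// k_wg_sz = 1, i.e. sycl::range<3>(1, 8, 32) with the fastest-varying
// dimension last. A sketch of the same group/global sizing:

#include <sycl/sycl.hpp>
#include <cstddef>

sycl::range<3> global_dims(std::size_t ni, std::size_t nj, std::size_t nk,
                           std::size_t wg_sz /* e.g. 256 */)
{
  const std::size_t i_wg = 32, j_wg = wg_sz / i_wg, k_wg = 1;
  auto round_up = [](std::size_t n, std::size_t d) { return d * ((n + d - 1) / d); };
  // Slowest dimension (k) first, matching the Base_SYCL variant above.
  return sycl::range<3>(round_up(nk, k_wg), round_up(nj, j_wg), round_up(ni, i_wg));
}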
// @@ -29,7 +29,7 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setDefaultProblemSize(m_n_init * m_n_init * m_n_init); setDefaultReps(1000); - auto n_final = std::cbrt( getTargetProblemSize() ); + auto n_final = std::cbrt( getTargetProblemSize() ) + std::cbrt(3)-1; m_ni = n_final; m_nj = n_final; m_nk = n_final; @@ -39,7 +39,9 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * getActualProblemSize()); setUsesFeature(Kernel); @@ -63,6 +65,9 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index ccaf7079e..0c579dd3b 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -58,19 +58,25 @@ class NESTED_INIT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_array_length; diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 644d358dc..f49c53518 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
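// Editorial note on the sizing tweak above: adding cbrt(3)-1 (~0.44) before
// the implicit truncation lets near-cube targets round up instead of always
// down. Worked example (assumed, not from the patch):
//   target = 999,999 : cbrt ~ 99.99997, +0.44 -> 100.4 -> n = 100,
//                      actual size 100^3 = 1,000,000, just above the target;
//   plain truncation : n = 99, actual 99^3 = 970,299, about 3% below it.
// MAT_MAT_SHARED gets the analogous 2-D bias, sqrt(2)-1, earlier in this diff.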
// @@ -45,25 +45,29 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) auto res{getCudaResource()}; - PI_ATOMIC_DATA_SETUP; + PI_ATOMIC_GPU_DATA_SETUP; + + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); if ( vid == Base_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - pi_atomic<<>>( pi, dx, iend ); - cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RPlaunchCudaKernel( (pi_atomic), + grid_size, block_size, + shmem, res.get_stream(), + pi, + dx, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -73,22 +77,24 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); + + auto pi_atomic_lambda = [=] __device__ (Index_type i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); - cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, pi_atomic_lambda ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -98,8 +104,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -107,10 +112,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -118,6 +121,9 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) } else { getCout() << "\n PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi); + } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_ATOMIC, Cuda) diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 1db304a52..637c10156 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -23,7 +23,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) -__global__ void atomic_pi(Real_ptr pi, +__global__ void pi_atomic(Real_ptr pi, Real_type dx, Index_type iend) { @@ -45,25 +45,29 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) auto res{getHipResource()}; - PI_ATOMIC_DATA_SETUP; + PI_ATOMIC_GPU_DATA_SETUP; + + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); if ( vid == Base_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((atomic_pi),grid_size, block_size, shmem, res.get_stream(), pi, dx, iend ); - hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RPlaunchHipKernel( (pi_atomic), + grid_size, block_size, + shmem, res.get_stream(), + pi, + dx, + iend ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -73,24 +77,24 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); - auto atomic_pi_lambda = [=] __device__ (Index_type i) { + auto pi_atomic_lambda = [=] __device__ (Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, atomic_pi_lambda); - hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, pi_atomic_lambda ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -100,8 +104,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -109,10 +112,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + 
RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast<Real_type>(4); } stopTimer(); @@ -120,6 +121,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) } else { getCout() << "\n PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl; } + + RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi); } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_ATOMIC, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index c031dcf32..d1f0eb784 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -99,6 +99,8 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } + PI_ATOMIC_DATA_TEARDOWN; + #else RAJA_UNUSED_VAR(vid); #endif diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 9d4f2649f..5f3fe4c82 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -78,6 +78,9 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } else { getCout() << "\n PI_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; } + + PI_ATOMIC_DATA_TEARDOWN; + } } // end namespace basic diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index 9d3864713..698361107 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -95,6 +95,8 @@ void PI_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx } + PI_ATOMIC_DATA_TEARDOWN; + } } // end namespace basic diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index af33d01fc..5a6a5bc04 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
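For reference, every PI_ATOMIC variant in the hunks above computes the same midpoint-rule quadrature of 4 * integral_0^1 dx/(1+x^2) = pi, adding each iterate's contribution atomically; the trailing multiply matches m_pi_final = hpi[0] * static_cast<Real_type>(4). A serial C++ reference of the numerics:

```cpp
#include <cstdio>

int main() {
  const long n = 1000000;                // iterate count; the suite uses problem size
  const double dx = 1.0 / (double)n;     // matches m_dx = 1.0 / problem size
  double pi = 0.0;
  for (long i = 0; i < n; ++i) {
    double x = ((double)i + 0.5) * dx;   // midpoint of subinterval i
    pi += dx / (1.0 + x * x);            // the atomicAdd target on GPU paths
  }
  std::printf("%.12f\n", 4.0 * pi);      // ~3.141592653590
}
```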
// @@ -28,8 +28,9 @@ PI_ATOMIC::PI_ATOMIC(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) ); setFLOPsPerRep(6 * getActualProblemSize() + 1); setUsesFeature(Forall); @@ -64,7 +65,6 @@ PI_ATOMIC::~PI_ATOMIC() void PI_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_dx = 1.0 / double(getActualProblemSize()); - allocAndInitDataConst(m_pi, 1, 0.0, vid); m_pi_init = 0.0; m_pi_final = -static_cast(vid); } @@ -77,7 +77,6 @@ void PI_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_pi, vid); } } // end namespace basic diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index fe26d9beb..26a3a7016 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,7 +25,14 @@ #define PI_ATOMIC_DATA_SETUP \ Real_type dx = m_dx; \ - Real_ptr pi = m_pi; + Real_ptr pi; \ + allocData(getReductionDataSpace(vid), pi, 1); + +#define PI_ATOMIC_DATA_TEARDOWN \ + deallocData(pi, vid); + +#define PI_ATOMIC_GPU_DATA_SETUP \ + Real_type dx = m_dx; #include "common/KernelBase.hpp" @@ -54,10 +61,12 @@ class PI_ATOMIC : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > @@ -65,10 +74,9 @@ class PI_ATOMIC : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_dx; - Real_ptr m_pi; Real_type m_pi_init; Real_type m_pi_final; }; diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index b0577bc58..8529897c3 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
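The single setBytesPerRep call is split here (and in PI_REDUCE below) into three counters: bytes read, bytes written, and bytes written through atomic read-modify-write. A small sketch of the accounting, assuming Real_type defaults to double; the struct below is an illustrative stand-in, not suite API:

```cpp
#include <cstddef>
#include <cstdio>

// Illustrative stand-in for the three per-rep byte counters.
struct BytesPerRep {
  std::size_t read, written, atomic_modify_written;
  std::size_t total() const { return read + written + atomic_modify_written; }
};

int main() {
  const std::size_t real_size = sizeof(double);  // Real_type, assumed double
  // PI_ATOMIC: no data streamed per iterate; one Real_type updated atomically.
  BytesPerRep pi_atomic{0, 0, 1 * real_size};
  // PI_REDUCE: one Real_type read and one written, no atomic traffic counted.
  BytesPerRep pi_reduce{1 * real_size, 1 * real_size, 0};
  std::printf("PI_ATOMIC %zu B/rep, PI_REDUCE %zu B/rep\n",
              pi_atomic.total(), pi_reduce.total());
}
```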
// @@ -15,6 +15,10 @@ #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -24,7 +28,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, - Real_ptr dpi, Real_type pi_init, + Real_ptr pi, Real_type pi_init, Index_type iend) { extern __shared__ Real_type ppi[ ]; @@ -45,24 +49,16 @@ __global__ void pi_reduce(Real_type dx, __syncthreads(); } -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dpi, ppi[ 0 ] ); - } -#else // this doesn't work due to data races if ( threadIdx.x == 0 ) { - *dpi += ppi[ 0 ]; + RAJA::atomicAdd( pi, ppi[ 0 ] ); } -#endif } - -template < size_t block_size > -void PI_REDUCE::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void PI_REDUCE::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -71,46 +67,113 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - Real_ptr dpi; - allocData(DataSpace::CudaDevice, dpi, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (pi_reduce), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - pi_reduce<<>>( dx, - dpi, m_pi_init, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - cudaErrchk( cudaMemcpyAsync( &m_pi, dpi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi *= 4.0; + RPlaunchCudaKernel( (pi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + dx, + pi, m_pi_init, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi = hpi[0] * static_cast(4); } stopTimer(); - deallocData(DataSpace::CudaDevice, dpi); + RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi); - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void PI_REDUCE::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum pi(m_pi_init); + RAJA::ReduceSum pi(m_pi_init); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { PI_REDUCE_BODY; }); - m_pi = 4.0 * static_cast(pi.get()); + m_pi = static_cast(4) * static_cast(pi.get()); + + } + stopTimer(); + + } else { + 
getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } +} + + +template < size_t block_size, typename MappingHelper > +void PI_REDUCE::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tpi = m_pi_init; + + RAJA::forall< exec_policy >( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] __device__ (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + + m_pi = static_cast(tpi) * 4.0; } stopTimer(); @@ -120,7 +183,122 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Cuda) +void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void PI_REDUCE::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index dd56426c2..ed2dfd8dd 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National 
Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -15,6 +15,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -24,7 +28,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, - Real_ptr dpi, Real_type pi_init, + Real_ptr pi, Real_type pi_init, Index_type iend) { HIP_DYNAMIC_SHARED(Real_type, ppi); @@ -45,24 +49,16 @@ __global__ void pi_reduce(Real_type dx, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { - RAJA::atomicAdd(RAJA::hip_atomic{}, dpi, ppi[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) i{ - *dpi += ppi[ 0 ]; + RAJA::atomicAdd( pi, ppi[ 0 ] ); } -#endif } - -template < size_t block_size > -void PI_REDUCE::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void PI_REDUCE::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -71,45 +67,72 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - Real_ptr dpi; - allocData(DataSpace::HipDevice, dpi, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (pi_reduce), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - dx, dpi, m_pi_init, iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (pi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + dx, + pi, m_pi_init, + iend ); - hipErrchk( hipMemcpyAsync( &m_pi, dpi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi *= 4.0; + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi = hpi[0] * static_cast(4); } stopTimer(); - deallocData(DataSpace::HipDevice, dpi); + RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi); + + } else { + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void PI_REDUCE::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - } else if ( vid == RAJA_HIP ) { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum pi(m_pi_init); + RAJA::ReduceSum pi(m_pi_init); - RAJA::forall< 
RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { PI_REDUCE_BODY; }); - m_pi = 4.0 * static_cast(pi.get()); + m_pi = static_cast(4) * static_cast(pi.get()); } stopTimer(); @@ -119,7 +142,162 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Hip) +template < size_t block_size, typename MappingHelper > +void PI_REDUCE::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tpi = m_pi_init; + + RAJA::forall< exec_policy >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] __device__ (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } +} + +void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void PI_REDUCE::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index 44da3e5b5..5c83aba6f 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ 
b/src/basic/PI_REDUCE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,7 +18,7 @@ namespace basic { -void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -77,21 +77,47 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * pi.get(); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum pi(m_pi_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - PI_REDUCE_BODY; - }); + Real_type tpi = m_pi_init; - m_pi = 4.0 * pi.get(); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -104,8 +130,17 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void PI_REDUCE::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index f4c20a665..351580471 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
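The pi_reduce kernels in the CUDA and HIP hunks above keep only the working finish: a per-block tree reduction in dynamic shared memory followed by a single atomicAdd from thread 0 (the dead, race-prone "*dpi += ..." branch is deleted outright). A self-contained CUDA sketch of that shape, assuming nothing from the suite; note atomicAdd on double requires compute capability 6.0+:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Per-block tree reduction in dynamic shared memory, finished by one
// atomicAdd from thread 0 -- the same shape as pi_reduce/reduce3int above.
__global__ void block_sum(const double* in, double* out, long n) {
  extern __shared__ double s[];
  long i = (long)blockIdx.x * blockDim.x + threadIdx.x;
  s[threadIdx.x] = (i < n) ? in[i] : 0.0;
  for (unsigned stride = blockDim.x / 2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) { s[threadIdx.x] += s[threadIdx.x + stride]; }
  }
  if (threadIdx.x == 0) { atomicAdd(out, s[0]); }  // serialized block finish
}

int main() {
  const long n = 1 << 20;
  const int block = 256;
  double *in, *out;
  cudaMallocManaged(&in, n * sizeof(double));
  cudaMallocManaged(&out, sizeof(double));
  for (long i = 0; i < n; ++i) { in[i] = 1.0; }
  *out = 0.0;
  block_sum<<<(n + block - 1) / block, block, block * sizeof(double)>>>(
      in, out, n);
  cudaDeviceSynchronize();
  std::printf("%.1f\n", *out);  // expect 1048576.0
  cudaFree(in); cudaFree(out);
}
```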
// @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -56,21 +56,47 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * pi.get(); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum pi(m_pi_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - PI_REDUCE_BODY; - }); + Real_type tpi = m_pi_init; - m_pi = 4.0 * pi.get(); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown OMP Target tuning index = " << tune_idx << std::endl; } - stopTimer(); } else { getCout() << "\n PI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; @@ -78,6 +104,14 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } +void PI_REDUCE::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 4bd888dd0..4a5b28815 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
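The tune_idx == 1 paths added to the OpenMP and OpenMP target variants above use the value-based RAJA::expt::Reduce interface: the reduction target is an ordinary stack variable and the lambda receives a reference to a per-thread copy. A plain-OpenMP analogue of the same dataflow:

```cpp
#include <cstdio>

int main() {
  const long n = 1000000;
  const double dx = 1.0 / (double)n;
  double tpi = 0.0;                    // plays the role of tpi = m_pi_init
  // Each thread accumulates a private copy; OpenMP combines them into tpi,
  // which is what RAJA::expt::Reduce arranges behind the Real_type& parameter.
  #pragma omp parallel for reduction(+ : tpi)
  for (long i = 0; i < n; ++i) {
    double x = ((double)i + 0.5) * dx;
    tpi += dx / (1.0 + x * x);
  }
  std::printf("%.12f\n", tpi * 4.0);   // matches m_pi = tpi * 4.0
}
```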
// @@ -18,8 +18,11 @@ namespace basic { -void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -74,20 +77,45 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * pi.get(); + + } + stopTimer(); - RAJA::ReduceSum pi(m_pi_init); + } else if (tune_idx == 1) { - RAJA::forall( RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - PI_REDUCE_BODY; - }); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - m_pi = 4.0 * pi.get(); + Real_type tpi = m_pi_init; + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Seq tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -101,5 +129,13 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx } +void PI_REDUCE::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Sycl.cpp b/src/basic/PI_REDUCE-Sycl.cpp new file mode 100644 index 000000000..c95e29583 --- /dev/null +++ b/src/basic/PI_REDUCE-Sycl.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include +#include +#include +#include + + +namespace rajaperf +{ +namespace basic +{ + + +template < size_t work_group_size > +void PI_REDUCE::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + PI_REDUCE_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + Real_ptr pi; + allocAndInitSyclDeviceData(pi, &m_pi_init, 1, qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(pi, &m_pi_init, 1, qu); + + qu->submit([&] (sycl::handler& hdl) { + + auto sum_reduction = sycl::reduction(pi, sycl::plus<>()); + + hdl.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sum_reduction, + [=] (sycl::nd_item<1> item, auto& pi) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + PI_REDUCE_BODY; + } + + }); + }); + + Real_type lpi; + Real_ptr plpi = &lpi; + getSyclDeviceData(plpi, pi, 1, qu); + m_pi = 4.0 * lpi; + + } + stopTimer(); + + deallocSyclDeviceData(pi, qu); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tpi = m_pi_init; + + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 84c38ce67..a258ae8cd 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
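In the Base_SYCL path above, sycl::reduction packages the device pointer and combiner, and the kernel combines into the reducer argument; by default the pre-existing value at the pointer participates in the result, which is why the loop re-initializes pi every rep. A minimal standalone SYCL 2020 version of the same sum, with illustrative sizes and USM in place of the suite's data utilities:

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;
  const size_t n = 1000000, wg = 256;
  const size_t global = wg * ((n + wg - 1) / wg);  // round up, guard below
  const double dx = 1.0 / (double)n;
  double* pi = sycl::malloc_shared<double>(1, q);
  *pi = 0.0;                                       // identity for the sum
  q.submit([&](sycl::handler& h) {
    auto sum = sycl::reduction(pi, sycl::plus<>());
    h.parallel_for(sycl::nd_range<1>(global, wg), sum,
                   [=](sycl::nd_item<1> item, auto& acc) {
      size_t i = item.get_global_id(0);
      if (i < n) {                                 // mask the overhang
        double x = ((double)i + 0.5) * dx;
        acc += dx / (1.0 + x * x);
      }
    });
  }).wait();
  std::printf("%.12f\n", 4.0 * (*pi));
  sycl::free(pi, q);
}
```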
// @@ -28,8 +28,9 @@ PI_REDUCE::PI_REDUCE(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(6 * getActualProblemSize() + 1); setUsesFeature(Forall); @@ -51,6 +52,9 @@ PI_REDUCE::PI_REDUCE(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } PI_REDUCE::~PI_REDUCE() diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 49fca096d..ca6860350 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -56,17 +56,35 @@ class PI_REDUCE : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_dx; Real_type m_pi; diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 6843dcab3..a8d68b31c 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
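The new header declarations parameterize each GPU variant on MappingHelper/AlgorithmHelper tags, which the .cpp files resolve to execution and reduction policies via std::conditional_t. A host-only sketch of that selection mechanism; the tag and policy names below are illustrative stand-ins for the suite's gpu_mapping and gpu_algorithm helpers, not its actual types:

```cpp
#include <cstdio>
#include <type_traits>

// Hypothetical tags: each carries a compile-time flag, like the suite's
// mapping helpers distinguish direct from occupancy-calculated launches.
struct direct_helper    { static constexpr bool direct = true;  };
struct occupancy_helper { static constexpr bool direct = false; };

struct exec_direct   { static constexpr const char* name = "direct"; };
struct exec_occ_calc { static constexpr const char* name = "occ_calc"; };

// std::conditional_t picks the policy type from the tag's flag.
template <typename MappingHelper>
using exec_policy = std::conditional_t<MappingHelper::direct,
                                       exec_direct, exec_occ_calc>;

int main() {
  std::printf("%s %s\n", exec_policy<direct_helper>::name,
                         exec_policy<occupancy_helper>::name);
}
```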
// @@ -15,6 +15,10 @@ #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -56,28 +60,18 @@ __global__ void reduce3int(Int_ptr vec, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( vsum, psum[ 0 ] ); RAJA::atomicMin( vmin, pmin[ 0 ] ); RAJA::atomicMax( vmax, pmax[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif } - -template < size_t block_size > -void REDUCE3_INT::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE3_INT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -86,55 +80,74 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - Int_ptr vmem_init; - allocData(DataSpace::CudaPinned, vmem_init, 3); + RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3, 1); - Int_ptr vmem; - allocData(DataSpace::CudaDevice, vmem, 3); + constexpr size_t shmem = 3*sizeof(Int_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce3int), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - vmem_init[0] = m_vsum_init; - vmem_init[1] = m_vmin_init; - vmem_init[2] = m_vmax_init; - cudaErrchk( cudaMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - reduce3int<<>>(vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); - cudaErrchk( cudaGetLastError() ); - - Int_type lmem[3]; - cudaErrchk( cudaMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_vsum += lmem[0]; - m_vmin = RAJA_MIN(m_vmin, lmem[1]); - m_vmax = RAJA_MAX(m_vmax, lmem[2]); + Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; + RAJAPERF_CUDA_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + RPlaunchCudaKernel( (reduce3int), + grid_size, block_size, + shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(vmem, hvmem, 3, 1); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); - deallocData(DataSpace::CudaDevice, vmem); - deallocData(DataSpace::CudaPinned, vmem_init); + RAJAPERF_CUDA_REDUCER_TEARDOWN(vmem, hvmem); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE3_INT::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; - } else if ( vid == RAJA_CUDA ) { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + 
REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE3_INT_BODY_RAJA; }); @@ -151,7 +164,168 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Cuda) +template < size_t block_size, typename MappingHelper > +void REDUCE3_INT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=] __device__ (Index_type i, + Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + 
decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index bd524565a..12d172de7 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -15,6 +15,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -56,28 +60,18 @@ __global__ void reduce3int(Int_ptr vec, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( vsum, psum[ 0 ] ); RAJA::atomicMin( vmin, pmin[ 0 ] ); RAJA::atomicMax( vmax, pmax[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif } - -template < size_t block_size > -void REDUCE3_INT::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE3_INT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -86,55 +80,74 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - Int_ptr vmem_init; - allocData(DataSpace::HipPinned, vmem_init, 3); + RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3, 1); - Int_ptr vmem; - allocData(DataSpace::HipDevice, vmem, 3); + constexpr size_t shmem = 3*sizeof(Int_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce3int), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - vmem_init[0] = m_vsum_init; - vmem_init[1] = m_vmin_init; - vmem_init[2] = m_vmax_init; - hipErrchk( hipMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); - hipErrchk( hipGetLastError() ); - - Int_type lmem[3]; - hipErrchk( hipMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_vsum += lmem[0]; - m_vmin = RAJA_MIN(m_vmin, lmem[1]); - m_vmax = RAJA_MAX(m_vmax, lmem[2]); + Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; + RAJAPERF_HIP_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + RPlaunchHipKernel( 
(reduce3int), + grid_size, block_size, + shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(vmem, hvmem, 3, 1); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); - deallocData(DataSpace::HipDevice, vmem); - deallocData(DataSpace::HipPinned, vmem_init); + RAJAPERF_HIP_REDUCER_TEARDOWN(vmem, hvmem); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE3_INT::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - } else if ( vid == RAJA_HIP ) { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall< RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE3_INT_BODY_RAJA; }); @@ -151,7 +164,166 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Hip) +template < size_t block_size, typename MappingHelper > +void REDUCE3_INT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=] __device__ (Index_type i, + Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + 
runHipVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index 5428d6087..c9848ac98 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
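runCudaVariant/runHipVariant and their setCudaTuningDefinitions/setHipTuningDefinitions counterparts above walk the same nested seq_for loops, so the flat counter t enumerates tunings in exactly the order their names are registered. A compact sketch of that counter-based dispatch; the names and candidate lists are illustrative only:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Illustrative stand-in for the seq_for enumeration: candidates are visited
// in a fixed nested order and a flat counter t picks which one runs.
// Registering tuning names in the same traversal order keeps tune_idx and
// tuning names in sync, which is the invariant the diff relies on.
int main() {
  std::vector<int> block_sizes{128, 256, 512};
  std::vector<std::string> mappings{"direct", "occupancy"};  // hypothetical
  const size_t tune_idx = 3;  // requested tuning
  size_t t = 0;
  for (int bs : block_sizes) {
    for (const auto& map : mappings) {
      if (t == tune_idx) {
        std::printf("run: %s_%d\n", map.c_str(), bs);  // -> "occupancy_256"
      }
      t += 1;
    }
  }
}
```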
// @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -91,24 +91,56 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -121,8 +153,17 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void REDUCE3_INT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index 0d261edec..d92d37667 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
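REDUCE3_INT carries three reductions through a single traversal; in the new-interface paths above these are the three RAJA::expt::Reduce arguments, while base OpenMP expresses them as three reduction clauses. A compact standalone analogue:

```cpp
#include <climits>
#include <cstdio>

int main() {
  const long n = 100;
  int vec[100];
  for (long i = 0; i < n; ++i) { vec[i] = (int)(i % 7) - 3; }
  int vsum = 0, vmin = INT_MAX, vmax = INT_MIN;  // m_*_init stand-ins
  // One pass, three combiners -- the same shape as the three
  // RAJA::expt::Reduce arguments in the hunks above.
  #pragma omp parallel for reduction(+ : vsum) \
                           reduction(min : vmin) reduction(max : vmax)
  for (long i = 0; i < n; ++i) {
    vsum += vec[i];
    if (vec[i] < vmin) { vmin = vec[i]; }
    if (vec[i] > vmax) { vmax = vec[i]; }
  }
  std::printf("sum=%d min=%d max=%d\n", vsum, vmin, vmax);  // -5, -3, 3
}
```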
// @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,31 +62,70 @@ void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown OMP Target tuning index = " << tune_idx << std::endl; } - stopTimer(); } else { getCout() << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; } } +void REDUCE3_INT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index f204bd345..32bcfbef6 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
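The Base_OpenMPTarget path of this kernel (only partially visible in the hunk above) relies on OpenMP's combined target reduction rather than RAJA reducers. A minimal sketch of that offload pattern, assuming x already lives on the default device (the suite passes such pointers with is_device_ptr):

    // Sketch of the base offload-reduction pattern, not the suite's kernel.
    double device_sum(const double* x, long n)
    {
      double sum = 0.0;
      #pragma omp target teams distribute parallel for \
              is_device_ptr(x) map(tofrom: sum) reduction(+:sum)
      for (long i = 0; i < n; ++i) {
        sum += x[i];
      }
      return sum;
    }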
// @@ -19,8 +19,11 @@ namespace basic { -void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -84,24 +87,56 @@ void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + } + stopTimer(); + } else { + getCout() << "\n REDUCE3_INT : Unknown Seq tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -115,5 +150,13 @@ void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i } +void REDUCE3_INT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp new file mode 100644 index 000000000..58ac6f082 --- /dev/null +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -0,0 +1,135 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
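Each backend file now ends with a set*TuningDefinitions method registering "default" and, for RAJA variants, "new". The tune_idx values dispatched on above are positions in that registration list, so the two must stay in lock step. A schematic illustration with hypothetical types, not the suite's KernelBase machinery:

    #include <string>
    #include <vector>

    struct KernelStub {
      std::vector<std::string> tunings;

      void setTuningDefinitions(bool is_raja_variant) {
        tunings.push_back("default");      // dispatched as tune_idx == 0
        if (is_raja_variant) {
          tunings.push_back("new");        // dispatched as tune_idx == 1
        }
      }

      void runVariant(size_t tune_idx) {
        if (tune_idx == 0)      { /* classic reducer path */ }
        else if (tune_idx == 1) { /* RAJA::expt::Reduce path */ }
      }
    };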
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "REDUCE3_INT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+
+template < size_t work_group_size >
+void REDUCE3_INT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  REDUCE3_INT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    Int_ptr hsum;
+    allocAndInitSyclDeviceData(hsum, &m_vsum_init, 1, qu);
+    Int_ptr hmin;
+    allocAndInitSyclDeviceData(hmin, &m_vmin_init, 1, qu);
+    Int_ptr hmax;
+    allocAndInitSyclDeviceData(hmax, &m_vmax_init, 1, qu);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      initSyclDeviceData(hsum, &m_vsum_init, 1, qu);
+      initSyclDeviceData(hmin, &m_vmin_init, 1, qu);
+      initSyclDeviceData(hmax, &m_vmax_init, 1, qu);
+
+      qu->submit([&] (sycl::handler& h) {
+
+        auto sum_reduction = sycl::reduction(hsum, sycl::plus<>());
+        auto min_reduction = sycl::reduction(hmin, sycl::minimum<>());
+        auto max_reduction = sycl::reduction(hmax, sycl::maximum<>());
+
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       sum_reduction, min_reduction, max_reduction,
+                       [=] (sycl::nd_item<1> item, auto& vsum, auto& vmin, auto& vmax) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            // REDUCE3_INT_BODY
+            vsum += vec[i];
+            vmin.combine(vec[i]);
+            vmax.combine(vec[i]);
+          }
+
+        });
+      });
+
+      Int_type lsum;
+      Int_ptr plsum = &lsum;
+      getSyclDeviceData(plsum, hsum, 1, qu);
+      m_vsum += lsum;
+
+      Int_type lmin;
+      Int_ptr plmin = &lmin;
+      getSyclDeviceData(plmin, hmin, 1, qu);
+      m_vmin = RAJA_MIN(m_vmin, lmin);
+
+      Int_type lmax;
+      Int_ptr plmax = &lmax;
+      getSyclDeviceData(plmax, hmax, 1, qu);
+      m_vmax = RAJA_MAX(m_vmax, lmax);
+
+    } // for (RepIndex_type irep = ...
+ stopTimer(); + + deallocSyclDeviceData(hsum, qu); + deallocSyclDeviceData(hmin, qu); + deallocSyclDeviceData(hmax, qu); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=] (Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + std::cout << "\n REDUCE3_INT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 975bf8f24..3be262b77 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -33,8 +33,10 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Int_type) + 3*sizeof(Int_type)) + - (0*sizeof(Int_type) + 1*sizeof(Int_type)) * getActualProblemSize() ); + setBytesReadPerRep( 3*sizeof(Int_type) + + 1*sizeof(Int_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Int_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize() + 1); setUsesFeature(Forall); @@ -57,6 +59,9 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index e82c2cf05..a3719a845 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
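The Base_SYCL variant above builds one kernel that feeds three sycl::reduction objects. A standalone sketch of the same pattern with USM device memory (the names and the plain int payload are illustrative, not the suite's types):

    #include <sycl/sycl.hpp>
    #include <limits>

    void reduce3(sycl::queue& q, const int* vec, size_t n,
                 int& vsum, int& vmin, int& vmax)
    {
      int* dsum = sycl::malloc_device<int>(1, q);
      int* dmin = sycl::malloc_device<int>(1, q);
      int* dmax = sycl::malloc_device<int>(1, q);
      int init_sum = 0;
      int init_min = std::numeric_limits<int>::max();
      int init_max = std::numeric_limits<int>::min();
      q.memcpy(dsum, &init_sum, sizeof(int));
      q.memcpy(dmin, &init_min, sizeof(int));
      q.memcpy(dmax, &init_max, sizeof(int)).wait();

      q.submit([&](sycl::handler& h) {
        auto sum_r = sycl::reduction(dsum, sycl::plus<>());
        auto min_r = sycl::reduction(dmin, sycl::minimum<>());
        auto max_r = sycl::reduction(dmax, sycl::maximum<>());
        h.parallel_for(sycl::range<1>(n), sum_r, min_r, max_r,
          [=](sycl::item<1> it, auto& s, auto& mn, auto& mx) {
            size_t i = it.get_linear_id();
            s += vec[i];          // plus reducer supports +=
            mn.combine(vec[i]);   // min/max reducers use combine()
            mx.combine(vec[i]);
        });
      }).wait();

      q.memcpy(&vsum, dsum, sizeof(int));
      q.memcpy(&vmin, dmin, sizeof(int));
      q.memcpy(&vmax, dmax, sizeof(int)).wait();
      sycl::free(dsum, q);
      sycl::free(dmin, q);
      sycl::free(dmax, q);
    }

Unlike the suite's nd_range version, this uses a plain range, so no manual bounds guard is needed inside the kernel.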
// @@ -70,18 +70,37 @@ class REDUCE3_INT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Int_ptr m_vec; Int_type m_vsum; diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 2961af4cc..898b453f0 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
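REDUCE3_INT.hpp above replaces the single runCudaVariantImpl/runHipVariantImpl templates with one method per (block size, algorithm helper, mapping helper) combination, instantiated by walking a compile-time list with seq_for. A minimal stand-in for that idiom (the suite uses camp type lists, so this index_sequence version is only an approximation):

    #include <cstddef>
    #include <cstdio>
    #include <utility>

    // Walk a compile-time list of block sizes; each call can instantiate a
    // differently-templated kernel variant.
    template <size_t... Sizes, typename F>
    void for_each_block_size(std::index_sequence<Sizes...>, F&& f)
    {
      (f(std::integral_constant<size_t, Sizes>{}), ...);  // C++17 fold
    }

    template <size_t BlockSize>
    void run_variant() { std::printf("instantiated for %zu\n", BlockSize); }

    int main()
    {
      for_each_block_size(std::index_sequence<128, 256, 512>{},
        [](auto bs) { run_variant<decltype(bs)::value>(); });
    }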
// @@ -15,6 +15,10 @@ #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -83,7 +87,6 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, __syncthreads(); } -// serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( xsum, pxsum[ 0 ] ); RAJA::atomicMin( xmin, pxmin[ 0 ] ); @@ -95,11 +98,10 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } -template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -108,64 +110,90 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocData(DataSpace::CudaDevice, mem,6); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6, 1); + + constexpr size_t shmem = 6*sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce_struct), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - - reduce_struct<<>>( - points.x, points.y, - mem, mem+1, mem+2, // xcenter,xmin,xmax - mem+3, mem+4, mem+5, // ycenter,ymin,ymax - m_init_sum, m_init_min, m_init_max, - points.N); - cudaErrchk( cudaGetLastError() ); - - Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - cudaErrchk( cudaMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - - points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); - points.SetXMin(lmem[1]); - points.SetXMax(lmem[2]); - points.SetYMin(lmem[4]); - points.SetYMax(lmem[5]); + Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; + RAJAPERF_CUDA_REDUCER_INITIALIZE(imem, mem, hmem, 6, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + RPlaunchCudaKernel( (reduce_struct), + grid_size, block_size, + shmem, res.get_stream(), + points.x, points.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, + points.N ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(mem, hmem, 6, 1); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); m_points=points; } stopTimer(); - deallocData(DataSpace::CudaDevice, mem); + RAJAPERF_CUDA_REDUCER_TEARDOWN(mem, hmem); - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_STRUCT::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; 
+ + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); points.SetCenter((xsum.get()/(points.N)), (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); + points.SetXMin((xmin.get())); points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); + points.SetYMin((ymin.get())); points.SetYMax((ymax.get())); m_points=points; @@ -178,7 +206,181 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_STRUCT, Cuda) +template < size_t block_size, typename MappingHelper > +void REDUCE_STRUCT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=] __device__ (Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + +void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n REDUCE_STRUCT : Unknown Cuda variant id = " << vid 
<< std::endl; + + } + +} + +void REDUCE_STRUCT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 236e3e7f2..17fe5ad83 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
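runCudaVariantBase above caps the launch at a maximum grid size obtained from RAJAPERF_CUDA_GET_MAX_BLOCKS. That macro is not part of this diff, so the following is only a sketch of the occupancy-style calculation such a cap typically performs with the CUDA runtime API:

    #include <cuda_runtime.h>
    #include <algorithm>

    // Assumed shape of an occupancy-based grid cap; the real macro's details
    // may differ.
    template <typename Kernel>
    size_t occupancy_grid_cap(Kernel k, int block_size, size_t shmem)
    {
      int dev = 0;
      cudaGetDevice(&dev);
      int num_sms = 0;
      cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev);
      int blocks_per_sm = 0;
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, k,
                                                    block_size, shmem);
      return static_cast<size_t>(num_sms) * blocks_per_sm;
    }

    // Usage mirrors the hunk above:
    //   grid_size = std::min(RAJA_DIVIDE_CEILING_INT(iend, block_size),
    //                        occupancy_grid_cap(kernel, block_size, shmem));

Keeping the grid within the occupancy limit is what makes the MappingHelper distinction meaningful: a capped grid implies each block processes multiple chunks of the iteration space rather than mapping one thread per element.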
// @@ -15,6 +15,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -83,7 +87,6 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, __syncthreads(); } -// serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( xsum, pxsum[ 0 ] ); RAJA::atomicMin( xmin, pxmin[ 0 ] ); @@ -95,12 +98,10 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } - -template < size_t block_size > -void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE_STRUCT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -109,68 +110,92 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocData(DataSpace::HipDevice, mem,6); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6, 1); + + constexpr size_t shmem = 6*sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce_struct), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); + Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; + RAJAPERF_HIP_REDUCER_INITIALIZE(imem, mem, hmem, 6, 1); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 6*sizeof(Real_type)*block_size; + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL((reduce_struct), - dim3(grid_size), dim3(block_size), + RPlaunchHipKernel( (reduce_struct), + grid_size, block_size, shmem, res.get_stream(), - points.x, points.y, + points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, - points.N); - hipErrchk( hipGetLastError() ); - - Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - hipErrchk( hipMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + points.N ); - points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); - points.SetXMin(lmem[1]); - points.SetXMax(lmem[2]); - points.SetYMin(lmem[4]); - points.SetYMax(lmem[5]); - m_points=points; + RAJAPERF_HIP_REDUCER_COPY_BACK(mem, hmem, 6, 1); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); + m_points = points; } stopTimer(); - deallocData(DataSpace::HipDevice, mem); + RAJAPERF_HIP_REDUCER_TEARDOWN(mem, hmem); - } else if ( vid == RAJA_HIP ) { + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + } + +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_STRUCT::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); - RAJA::forall< RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); points.SetCenter((xsum.get()/(points.N)), (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); + points.SetXMin((xmin.get())); points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); + points.SetYMin((ymin.get())); points.SetYMax((ymax.get())); - m_points=points; + m_points = points; } stopTimer(); @@ -181,7 +206,179 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_STRUCT, Hip) +template < size_t block_size, typename MappingHelper > +void REDUCE_STRUCT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=] __device__ (Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown HIP variant id = " << vid << std::endl; + } + +} + +void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void 
REDUCE_STRUCT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 7ac22faa2..8c44d02c0 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -19,7 +19,7 @@ namespace basic { -void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -55,7 +55,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -100,7 +100,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -110,31 +110,75 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + 
points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points = points; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + } else { + getCout() << "\n REDUCE_STRUCT : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -147,8 +191,17 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void REDUCE_STRUCT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index cfbcba44a..594c62ccb 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
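The tune_idx == 1 path above passes six RAJA::expt::Reduce arguments to one forall; the lambda's reference parameters bind to them positionally, after the loop index. A reduced sketch with two values (plain doubles, not the suite's Real_type setup):

    #include "RAJA/RAJA.hpp"

    void min_and_max(const double* x, int n, double& out_min, double& out_max)
    {
      double tmin = x[0];
      double tmax = x[0];
      RAJA::forall<RAJA::omp_parallel_for_exec>(
        RAJA::RangeSegment(0, n),
        RAJA::expt::Reduce<RAJA::operators::minimum>(&tmin),  // binds to mn
        RAJA::expt::Reduce<RAJA::operators::maximum>(&tmax),  // binds to mx
        [=](RAJA::Index_type i, double& mn, double& mx) {
          mn = RAJA_MIN(mn, x[i]);
          mx = RAJA_MAX(mx, x[i]);
      });
      out_min = tmin;
      out_max = tmax;
    }

Reordering the Reduce arguments without reordering the lambda parameters would silently swap the reductions, which is why the hunks above keep the two lists visually aligned.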
// @@ -27,8 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, - size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -36,80 +35,142 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, REDUCE_STRUCT_DATA_SETUP; - if ( vid == Base_OpenMPTarget ) { + switch ( vid ) { - Real_ptr xa = points.x; - Real_ptr ya = points.y; + case Base_OpenMPTarget : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Real_ptr xa = points.x; + Real_ptr ya = points.y; - Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; - Real_type xmin = m_init_min; Real_type ymin = m_init_min; - Real_type xmax = m_init_max; Real_type ymax = m_init_max; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp target is_device_ptr(xa, ya) device( did ) map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \ + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; + + #pragma omp target is_device_ptr(xa, ya) device( did ) \ + map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax) + #pragma omp teams distribute parallel for \ + thread_limit(threads_per_team) schedule(static,1) \ reduction(+:xsum) \ reduction(min:xmin) \ reduction(max:xmax), \ reduction(+:ysum), \ reduction(min:ymin), \ reduction(max:ymax) - for (Index_type i = ibegin; i < iend; ++i ) { - xsum += xa[i] ; - xmin = RAJA_MIN(xmin, xa[i]) ; - xmax = RAJA_MAX(xmax, xa[i]) ; - ysum += ya[i] ; - ymin = RAJA_MIN(ymin, ya[i]) ; - ymax = RAJA_MAX(ymax, ya[i]) ; - } + for (Index_type i = ibegin; i < iend; ++i ) { + xsum += xa[i] ; + xmin = RAJA_MIN(xmin, xa[i]) ; + xmax = RAJA_MAX(xmax, xa[i]) ; + ysum += ya[i] ; + ymin = RAJA_MIN(ymin, ya[i]) ; + ymax = RAJA_MAX(ymax, ya[i]) ; + } + + points.SetCenter(xsum/points.N, ysum/points.N); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points = points; - points.SetCenter(xsum/points.N, ysum/points.N); - points.SetXMin(xmin); - points.SetXMax(xmax); - points.SetYMin(ymin); - points.SetYMax(ymax); - m_points=points; + } + stopTimer(); + break; } - stopTimer(); - - } else if ( vid == RAJA_OpenMPTarget ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter(xsum.get()/(points.N), - ysum.get()/(points.N)); - points.SetXMin(xmin.get()); - points.SetXMax(xmax.get()); - points.SetYMin(ymin.get()); - points.SetYMax(ymax.get()); - m_points=points; + case RAJA_OpenMPTarget : { + + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + 
[=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); + m_points = points; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown OMP Target tuning index = " << tune_idx << std::endl; + } + + break; } - stopTimer(); - } else { + default: getCout() << "\n REDUCE_STRUCT : Unknown OMP Target variant id = " << vid << std::endl; } } +void REDUCE_STRUCT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 377b19b84..1e2a68d43 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
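The Base_OpenMPTarget hunk above stacks six reduction clauses on one combined construct; OpenMP allows mixing +, min, and max reductions this way on a single loop. A minimal host-side sketch of the clause combination (not the target version):

    #include <algorithm>

    void stats(const double* x, long n, double& sum, double& mn, double& mx)
    {
      sum = 0.0;
      mn = x[0];
      mx = x[0];
      #pragma omp parallel for reduction(+:sum) \
              reduction(min:mn) reduction(max:mx)
      for (long i = 0; i < n; ++i) {
        sum += x[i];
        mn = std::min(mn, x[i]);
        mx = std::max(mx, x[i]);
      }
    }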
// @@ -19,8 +19,11 @@ namespace basic { -void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -47,7 +50,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -87,7 +90,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -97,31 +100,75 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); + m_points = points; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter(xsum.get()/(points.N), - ysum.get()/(points.N)); - points.SetXMin(xmin.get()); - points.SetXMax(xmax.get()); - points.SetYMin(ymin.get()); - points.SetYMax(ymax.get()); - m_points=points; + } + stopTimer(); + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Seq tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -132,7 +179,14 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune } } +} +void REDUCE_STRUCT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + 
addVariantTuningName(vid, "new"); + } } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index f18319eb2..764e82f67 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -33,7 +33,10 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( 6*sizeof(Real_type) + 2*sizeof(Real_type)*getActualProblemSize()); + setBytesReadPerRep( 6*sizeof(Real_type) + + 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 6*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize() + 2); diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 425e7796e..658d9eae4 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -87,15 +87,28 @@ class REDUCE_STRUCT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); struct PointsType { - Int_type N; + Index_type N; Real_ptr x, y; Real_ptr GetCenter(){return ¢er[0];}; @@ -118,7 +131,7 @@ class REDUCE_STRUCT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; Real_type m_init_sum; Real_type m_init_min; diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index f4de3cf6a..e58e86923 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 
2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,30 +12,21 @@ #if defined(RAJA_ENABLE_CUDA) +#include "TRAP_INT-func.hpp" + #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -RAJA_DEVICE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - template < size_t block_size > __launch_bounds__(block_size) @@ -64,25 +55,16 @@ __global__ void trapint(Real_type x0, Real_type xp, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( sumx, psumx[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - } - -template < size_t block_size > -void TRAP_INT::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void TRAP_INT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -91,44 +73,69 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - Real_ptr sumx; - allocData(DataSpace::CudaDevice, sumx, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (trapint), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - trapint<<>>(x0, xp, - y, yp, - h, - sumx, - iend); - cudaErrchk( cudaGetLastError() ); - - Real_type lsumx; - cudaErrchk( cudaMemcpyAsync( &lsumx, sumx, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_sumx += lsumx * h; + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + RPlaunchCudaKernel( (trapint), + grid_size, block_size, + shmem, res.get_stream(), + x0, xp, + y, yp, + h, + sumx, + iend); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(sumx, hsumx, 1, 1); + m_sumx += hsumx[0] * h; } stopTimer(); - deallocData(DataSpace::CudaDevice, sumx); + RAJAPERF_CUDA_REDUCER_TEARDOWN(sumx, hsumx); + + } else { + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void TRAP_INT::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; - } else if ( vid == RAJA_CUDA ) { + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
RAJA::ReduceSum sumx(m_sumx_init); + RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { TRAP_INT_BODY; }); @@ -143,7 +150,162 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Cuda) +template < size_t block_size, typename MappingHelper > +void TRAP_INT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] __device__ (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void TRAP_INT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 1b5f4b2be..e60b3ccff 100644 --- 
a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,30 +12,21 @@ #if defined(RAJA_ENABLE_HIP) +#include "TRAP_INT-func.hpp" + #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -RAJA_DEVICE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - template < size_t block_size > __launch_bounds__(block_size) @@ -64,25 +55,16 @@ __global__ void trapint(Real_type x0, Real_type xp, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( sumx, psumx[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - } - -template < size_t block_size > -void TRAP_INT::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void TRAP_INT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -91,43 +73,69 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - Real_ptr sumx; - allocData(DataSpace::HipDevice, sumx, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (trapint), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1, 1); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x0, xp, - y, yp, - h, - sumx, - iend); - hipErrchk( hipGetLastError() ); + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - Real_type lsumx; - hipErrchk( hipMemcpyAsync( &lsumx, sumx, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_sumx += lsumx * h; + RPlaunchHipKernel( (trapint), + grid_size, block_size, + shmem, res.get_stream(), + x0, xp, + y, yp, + h, + sumx, + iend); + + RAJAPERF_HIP_REDUCER_COPY_BACK(sumx, hsumx, 1, 1); + m_sumx += hsumx[0] * h; } stopTimer(); - deallocData(DataSpace::HipDevice, sumx); + RAJAPERF_HIP_REDUCER_TEARDOWN(sumx, hsumx); + + } else { + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void TRAP_INT::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - } else if ( vid == RAJA_HIP ) { + 
const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum<RAJA::hip_reduce, Real_type> sumx(m_sumx_init); + RAJA::ReduceSum<reduction_policy, Real_type> sumx(m_sumx_init); - RAJA::forall< RAJA::hip_exec<block_size, true /*async*/> >( res, + RAJA::forall<exec_policy>( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { TRAP_INT_BODY; }); @@ -142,7 +150,161 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Hip) +template < size_t block_size, typename MappingHelper > +void TRAP_INT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t<MappingHelper::direct, + RAJA::hip_exec<block_size, true /*async*/>, + RAJA::hip_exec_occ_calc<block_size, true /*async*/>>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall<exec_policy>( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce<RAJA::operators::plus>(&tsumx), + [=] __device__ (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast<Real_type>(tsumx) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase<block_size, decltype(mapping_helper)>(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA<block_size, decltype(algorithm_helper), decltype(mapping_helper)>(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce<block_size, decltype(mapping_helper)>(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void TRAP_INT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} } // end namespace basic } //
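Reviewer note: runCudaVariant and runHipVariant above map a flat tune_idx onto a compile-time cross product (block sizes x mapping helpers x algorithm helpers) by counting with a runtime t. A dependency-free sketch of that dispatch pattern; seq_for_values, run_tuning, and the value lists are illustrative stand-ins for the suite's seq_for and helper types:

#include <cstddef>
#include <cstdio>
#include <utility>

// Call f once per compile-time value, like RAJAPerf's seq_for.
template <typename F, std::size_t... Vs>
void seq_for_values(F&& f, std::integer_sequence<std::size_t, Vs...>)
{
  (f(std::integral_constant<std::size_t, Vs>{}), ...);
}

template <std::size_t BlockSize, bool Direct>
void run_tuning()
{
  std::printf("block_size=%zu direct=%d\n", BlockSize, int(Direct));
}

// A flat tune_idx walks the cross product in a fixed order, mirroring how
// the run*Variant functions above count with t before each templated call.
void dispatch(std::size_t tune_idx)
{
  std::size_t t = 0;
  seq_for_values([&](auto bs) {
    seq_for_values([&](auto direct) {
      if (t == tune_idx) {
        run_tuning<decltype(bs)::value, decltype(direct)::value != 0>();
      }
      t += 1;
    }, std::integer_sequence<std::size_t, 0, 1>{});        // occupancy, direct
  }, std::integer_sequence<std::size_t, 128, 256, 512>{}); // block sizes
}

int main() { dispatch(3); return 0; }

Here dispatch(3) selects block size 256 with the second mapping option, the same way setBlockSize plus a templated run*Variant call is reached above.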
end namespace rajaperf diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index dadaa5baa..f1961483a 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "TRAP_INT-func.hpp" + #include <iostream> namespace rajaperf @@ -17,22 +19,8 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - -void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -91,20 +79,46 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum<RAJA::omp_reduce, Real_type> sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall<RAJA::omp_parallel_for_exec>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum<RAJA::omp_reduce, Real_type> sumx(m_sumx_init); - m_sumx += static_cast<Real_type>(sumx.get()) * h; + RAJA::forall<RAJA::omp_parallel_for_exec>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast<Real_type>(sumx.get()) * h; + + } + stopTimer(); + + } else if (tune_idx == 1) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall<RAJA::omp_parallel_for_exec>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce<RAJA::operators::plus>(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast<Real_type>(tsumx) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -117,8 +131,17 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void TRAP_INT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index b9bdcd6a6..2c5e4cf56 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,6 +12,8 @@ #if defined(RAJA_ENABLE_TARGET_OPENMP) +#include "TRAP_INT-func.hpp" + #include "common/OpenMPTargetDataUtils.hpp" #include <iostream> @@ -21,27 +23,13 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop.
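Reviewer note: the RAJA_OpenMP case above now times two tunings, tune_idx 0 with the ReduceSum object and tune_idx 1 with RAJA::expt::Reduce. For reference, the Base_OpenMP variant they are compared against is the plain OpenMP reduction idiom; a self-contained sketch of the same shape, with an illustrative integrand (this one converges to pi) rather than the kernel's trap_int_func:

#include <cstdio>

int main()
{
  const long n = 1000000;
  const double h = 1.0 / n;
  double sumx = 0.0;

  // Same structure as the Base_OpenMP loop: private partial sums are
  // combined by the reduction clause, then scaled by h once at the end.
  #pragma omp parallel for reduction(+:sumx)
  for (long i = 0; i < n; ++i) {
    double x = (i + 0.5) * h;
    sumx += 4.0 / (1.0 + x * x);
  }

  std::printf("pi ~= %.12f\n", sumx * h);
  return 0;
}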
-// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - // // Define threads per team for target execution // const size_t threads_per_team = 256; -void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -58,7 +46,8 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( Real_type sumx = m_sumx_init; - #pragma omp target teams distribute parallel for map(tofrom: sumx) reduction(+:sumx) \ + #pragma omp target teams distribute parallel for \ + map(tofrom: sumx) reduction(+:sumx) \ thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { @@ -74,23 +63,57 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; - m_sumx += static_cast(sumx.get()) * h; + } + stopTimer(); + } else { + getCout() << "\n TRAP_INT : Unknown OMP Target tuning index = " << tune_idx << std::endl; } - stopTimer(); } else { - getCout() << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +void TRAP_INT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); } } diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index 9b1264b4d..fa74efdcf 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "TRAP_INT-func.hpp" + #include namespace rajaperf @@ -17,23 +19,12 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. 
-// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - -void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -88,20 +79,46 @@ void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; - m_sumx += static_cast(sumx.get()) * h; + } + stopTimer(); + } else { + getCout() << "\n TRAP_INT : Unknown Seq tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -115,5 +132,13 @@ void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) } +void TRAP_INT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp new file mode 100644 index 000000000..a9795c77e --- /dev/null +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "TRAP_INT-func.hpp" + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +template +void TRAP_INT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + TRAP_INT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + Real_ptr sumx; + allocAndInitSyclDeviceData(sumx, &m_sumx_init, 1, qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(sumx, &m_sumx_init, 1, qu); + + qu->submit([&] (sycl::handler& hdl) { + + auto sum_reduction = sycl::reduction(sumx, sycl::plus<>()); + + hdl.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sum_reduction, + [=] (sycl::nd_item<1> item, auto& sumx) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + TRAP_INT_BODY + } + + }); + }); + + Real_type lsumx; + Real_ptr plsumx = &lsumx; + getSyclDeviceData(plsumx, sumx, 1, qu); + m_sumx += lsumx * h; + + } + stopTimer(); + + deallocSyclDeviceData(sumx, qu); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); + + } else { + std::cout << "\n TRAP_INT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/TRAP_INT-func.hpp b/src/basic/TRAP_INT-func.hpp new file mode 100644 index 000000000..9c4b90c52 --- /dev/null +++ b/src/basic/TRAP_INT-func.hpp @@ -0,0 +1,35 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_Basic_TRAP_INT_FUNC_HPP +#define RAJAPerf_Basic_TRAP_INT_FUNC_HPP + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop in each variant. 
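Reviewer note: the new Base_SYCL variant above relies on sycl::reduction rather than a hand-rolled shared-memory tree. A standalone sketch of that pattern; the queue setup, sizes, and summand are illustrative, not from this patch:

#include <sycl/sycl.hpp>
#include <cstdio>

int main()
{
  constexpr size_t n = 1 << 20;
  constexpr size_t wg = 256;                  // work-group size
  sycl::queue q;

  double* sum = sycl::malloc_shared<double>(1, q);
  *sum = 0.0;

  // Round the global range up to a whole number of work-groups, as the
  // kernel above does, and guard the tail inside the kernel.
  const size_t global = wg * ((n + wg - 1) / wg);

  q.submit([&](sycl::handler& h) {
    auto red = sycl::reduction(sum, sycl::plus<>());
    h.parallel_for(sycl::nd_range<1>(global, wg), red,
                   [=](sycl::nd_item<1> item, auto& s) {
      size_t i = item.get_global_id(0);
      if (i < n) {
        s += 1.0 / n;                         // sums to ~1.0
      }
    });
  }).wait();

  std::printf("sum = %f\n", *sum);
  sycl::free(sum, q);
  return 0;
}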
+// +RAJA_INLINE +RAJA_HOST_DEVICE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index eaac3ffda..09da695ea 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,8 +28,9 @@ TRAP_INT::TRAP_INT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(10 * getActualProblemSize()); // 1 sqrt setUsesFeature(Forall); @@ -52,6 +53,9 @@ TRAP_INT::TRAP_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index e64932dbe..8f8ca9337 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
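Reviewer note: factoring trap_int_func into TRAP_INT-func.hpp with RAJA_HOST_DEVICE lets every backend compile the identical integrand 1/sqrt((x-xp)^2 + (y-yp)^2). A host-only check of the accumulate-then-scale-by-h pattern the variants share; the parameters and the (i + 0.5) abscissa are illustrative, not the exact TRAP_INT_BODY formula:

#include <cmath>
#include <cstdio>

double trap_int_func(double x, double y, double xp, double yp)
{
  double denom = (x - xp)*(x - xp) + (y - yp)*(y - yp);
  return 1.0 / std::sqrt(denom);
}

int main()
{
  const long n = 100000;
  const double x0 = 0.0, xp = -1.0, y = 0.5, yp = -0.5;
  const double h = 1.0 / n;

  double sumx = 0.0;
  for (long i = 0; i < n; ++i) {
    double x = x0 + (i + 0.5) * h;            // illustrative abscissa
    sumx += trap_int_func(x, y, xp, yp);
  }

  std::printf("integral ~= %.10f\n", sumx * h);  // scaled once, like m_sumx += sumx * h
  return 0;
}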
// @@ -67,18 +67,37 @@ class TRAP_INT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_x0; Real_type m_xp; diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt new file mode 100644 index 000000000..9298e7bce --- /dev/null +++ b/src/comm/CMakeLists.txt @@ -0,0 +1,43 @@ +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. 
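Reviewer note: the switch from setBytesPerRep to the read/write/atomic split in TRAP_INT.cpp above makes explicit that this kernel moves almost no memory: each rep reads and writes one Real_type for the reduction result while doing 10 FLOPs per element, since x is computed from the loop index. A quick check of the nominal arithmetic intensity those counters imply; the problem size is illustrative:

#include <cstdio>

int main()
{
  const double n = 1.0e6;                     // illustrative problem size
  const double flops = 10.0 * n;              // setFLOPsPerRep above
  const double bytes = 2.0 * sizeof(double);  // one read + one write per rep
  std::printf("FLOP/byte ~ %.2e\n", flops / bytes);  // strongly compute-bound
  return 0;
}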
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_library( + NAME comm + SOURCES HALO_base.cpp + HALO_PACKING.cpp + HALO_PACKING-Seq.cpp + HALO_PACKING-Hip.cpp + HALO_PACKING-Cuda.cpp + HALO_PACKING-OMP.cpp + HALO_PACKING-OMPTarget.cpp + HALO_PACKING_FUSED.cpp + HALO_PACKING_FUSED-Seq.cpp + HALO_PACKING_FUSED-Hip.cpp + HALO_PACKING_FUSED-Cuda.cpp + HALO_PACKING_FUSED-OMP.cpp + HALO_PACKING_FUSED-OMPTarget.cpp + HALO_SENDRECV.cpp + HALO_SENDRECV-Seq.cpp + HALO_SENDRECV-Hip.cpp + HALO_SENDRECV-Cuda.cpp + HALO_SENDRECV-OMP.cpp + HALO_SENDRECV-OMPTarget.cpp + HALO_EXCHANGE.cpp + HALO_EXCHANGE-Seq.cpp + HALO_EXCHANGE-Hip.cpp + HALO_EXCHANGE-Cuda.cpp + HALO_EXCHANGE-OMP.cpp + HALO_EXCHANGE-OMPTarget.cpp + HALO_EXCHANGE_FUSED.cpp + HALO_EXCHANGE_FUSED-Seq.cpp + HALO_EXCHANGE_FUSED-Hip.cpp + HALO_EXCHANGE_FUSED-Cuda.cpp + HALO_EXCHANGE_FUSED-OMP.cpp + HALO_EXCHANGE_FUSED-OMPTarget.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/comm/HALO_EXCHANGE-Cuda.cpp b/src/comm/HALO_EXCHANGE-Cuda.cpp new file mode 100644 index 000000000..ad5482d18 --- /dev/null +++ b/src/comm/HALO_EXCHANGE-Cuda.cpp @@ -0,0 +1,208 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALO_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALO_UNPACK_BODY; + } +} + + +template < size_t block_size > +void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALO_EXCHANGE_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + constexpr size_t shmem = 0; + RPlaunchCudaKernel( (halo_exchange_pack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + 
+ cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + constexpr size_t shmem = 0; + RPlaunchCudaKernel( (halo_exchange_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len); + buffer += len; + } + } + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + using EXEC_POL = RAJA::cuda_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=] __device__ (Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( res, + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + res.wait(); + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=] __device__ (Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( res, + RAJA::TypedRangeSegment(0, len), + halo_exchange_unpack_base_lam ); + buffer += len; + } + } + res.wait(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_EXCHANGE, Cuda) + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/comm/HALO_EXCHANGE-Hip.cpp b/src/comm/HALO_EXCHANGE-Hip.cpp new file mode 100644 index 000000000..1b7ffb04a --- /dev/null +++ b/src/comm/HALO_EXCHANGE-Hip.cpp @@ -0,0 +1,208 @@ 
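Reviewer note: every HALO_EXCHANGE variant above, GPU ones included, shares one MPI control flow: post all receives, pack and Isend per neighbor, unpack in arrival order via MPI_Waitany, then Waitall on the sends. A reduced skeleton of that flow; the types and the zero tag are simplified stand-ins for the suite's buffers and tag arrays:

#include <mpi.h>
#include <vector>

void exchange_skeleton(int num_neighbors, const std::vector<int>& ranks,
                       std::vector<std::vector<double>>& send,
                       std::vector<std::vector<double>>& recv)
{
  std::vector<MPI_Request> recv_reqs(num_neighbors), send_reqs(num_neighbors);

  // 1. Post all receives first so incoming packed data can land immediately.
  for (int l = 0; l < num_neighbors; ++l) {
    MPI_Irecv(recv[l].data(), static_cast<int>(recv[l].size()), MPI_DOUBLE,
              ranks[l], 0, MPI_COMM_WORLD, &recv_reqs[l]);
  }

  // 2. Per neighbor: the pack kernel plus stream sync go here, then send.
  for (int l = 0; l < num_neighbors; ++l) {
    MPI_Isend(send[l].data(), static_cast<int>(send[l].size()), MPI_DOUBLE,
              ranks[l], 0, MPI_COMM_WORLD, &send_reqs[l]);
  }

  // 3. Unpack messages in arrival order, not neighbor order.
  for (int n = 0; n < num_neighbors; ++n) {
    int l = -1;
    MPI_Waitany(num_neighbors, recv_reqs.data(), &l, MPI_STATUS_IGNORE);
    // ... launch the unpack kernel for neighbor l ...
  }

  // 4. Drain the sends before the pack buffers are reused next rep.
  MPI_Waitall(num_neighbors, send_reqs.data(), MPI_STATUSES_IGNORE);
}

The stream synchronize before MPI_Isend in the base GPU variants is what guarantees the pack kernel has finished writing the buffer MPI is about to read.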
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALO_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALO_UNPACK_BODY; + } +} + + +template < size_t block_size > +void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_EXCHANGE_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + constexpr size_t shmem = 0; + RPlaunchHipKernel( (halo_exchange_pack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + constexpr size_t shmem = 0; + RPlaunchHipKernel( (halo_exchange_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len); + buffer += len; + } + } + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + using EXEC_POL = RAJA::hip_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=] __device__ (Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( res, + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + res.wait(); + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=] __device__ (Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( res, + RAJA::TypedRangeSegment(0, len), + halo_exchange_unpack_base_lam ); + buffer += len; + } + } + res.wait(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE : Unknown Hip variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_EXCHANGE, Hip) + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/comm/HALO_EXCHANGE-OMP.cpp b/src/comm/HALO_EXCHANGE-OMP.cpp new file mode 100644 index 000000000..922151704 --- /dev/null +++ b/src/comm/HALO_EXCHANGE-OMP.cpp @@ -0,0 +1,254 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + halo_exchange_pack_base_lam(i); + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if 
(separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + halo_exchange_unpack_base_lam(i); + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + using EXEC_POL = RAJA::omp_parallel_for_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_unpack_base_lam ); + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALO_EXCHANGE : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE-OMPTarget.cpp b/src/comm/HALO_EXCHANGE-OMPTarget.cpp new file mode 100644 index 000000000..f83eb2826 --- /dev/null +++ b/src/comm/HALO_EXCHANGE-OMPTarget.cpp @@ -0,0 +1,176 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
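Reviewer note: the Base_OpenMPTarget pack/unpack loops above hand already-mapped device pointers to the target region with is_device_ptr. A minimal sketch of that gather; did and the schedule clause mirror the code above, the function name and types are illustrative:

void gather_on_device(double* buffer, const int* list, const double* var,
                      int len, int did)
{
  // buffer, list, and var are device pointers already, so the target
  // region must not remap them; is_device_ptr passes them through as-is.
  #pragma omp target is_device_ptr(buffer, list, var) device(did)
  #pragma omp teams distribute parallel for schedule(static, 1)
  for (int i = 0; i < len; ++i) {
    buffer[i] = var[list[i]];   // HALO_PACK_BODY shape
  }
}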
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void HALO_EXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp target is_device_ptr(buffer, list, var) device( did ) + #pragma omp teams distribute parallel for schedule(static, 1) + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp target is_device_ptr(buffer, list, var) device( did ) + #pragma omp teams distribute parallel for schedule(static, 1) + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + using EXEC_POL = RAJA::omp_target_parallel_for_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + 
mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_unpack_base_lam ); + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/comm/HALO_EXCHANGE-Seq.cpp b/src/comm/HALO_EXCHANGE-Seq.cpp new file mode 100644 index 000000000..b5cbbf6f6 --- /dev/null +++ b/src/comm/HALO_EXCHANGE-Seq.cpp @@ -0,0 +1,247 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_EXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + 
stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + for (Index_type i = 0; i < len; i++) { + halo_exchange_pack_base_lam(i); + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + for (Index_type i = 0; i < len; i++) { + halo_exchange_unpack_base_lam(i); + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + using EXEC_POL = RAJA::seq_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + 
halo_exchange_unpack_base_lam ); + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALO_EXCHANGE : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp new file mode 100644 index 000000000..bbca8851f --- /dev/null +++ b/src/comm/HALO_EXCHANGE.cpp @@ -0,0 +1,165 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +namespace rajaperf +{ +namespace comm +{ + +HALO_EXCHANGE::HALO_EXCHANGE(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_EXCHANGE, params) +{ + m_mpi_size = params.getMPISize(); + m_my_mpi_rank = params.getMPIRank(); + m_mpi_dims = params.getMPI3DDivision(); + + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // send + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // recv + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(MPI); + + if (params.validMPI3DDivision()) { + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + } +} + +HALO_EXCHANGE::~HALO_EXCHANGE() +{ +} + +void HALO_EXCHANGE::setUp(VariantID vid, size_t tune_idx) +{ + setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + 
allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void HALO_EXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + +void HALO_EXCHANGE::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE.hpp b/src/comm/HALO_EXCHANGE.hpp new file mode 100644 index 000000000..8f3cf1cda --- /dev/null +++ b/src/comm/HALO_EXCHANGE.hpp @@ -0,0 +1,147 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
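Reviewer note: setUp above either aliases send_buffers[l] to pack_buffers[l] (when the MPI data space can be handed to MPI directly) or allocates a separate host staging buffer that copyData fills before each Isend. A condensed sketch of that choice; malloc stands in for the suite's allocData and the struct is illustrative:

#include <cstddef>
#include <cstdlib>

struct Buffers {
  double* pack = nullptr;   // written by the pack kernel
  double* send = nullptr;   // handed to MPI_Isend
  bool separate = false;
};

Buffers make_buffers(std::size_t len, bool mpi_can_read_pack_memory)
{
  Buffers b;
  b.separate = !mpi_can_read_pack_memory;
  b.pack = static_cast<double*>(std::malloc(len * sizeof(double)));
  // Alias when MPI can consume the packed memory in place; otherwise a
  // host staging buffer plus an explicit copy step is required.
  b.send = b.separate
         ? static_cast<double*>(std::malloc(len * sizeof(double)))
         : b.pack;
  return b;
}

void free_buffers(Buffers& b)
{
  if (b.separate) std::free(b.send);  // only when it is a distinct buffer
  std::free(b.pack);
  b.pack = b.send = nullptr;
}

Mirroring the tearDown order above, the staging buffer is freed only in the separate-buffers case, since otherwise send and pack name the same allocation.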
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// HALO_EXCHANGE kernel reference implementation:
+///
+/// // post a recv for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Index_type len = unpack_index_list_lengths[l];
+///   MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+///       mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+/// }
+///
+/// // pack a buffer for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Real_ptr buffer = pack_buffers[l];
+///   Int_ptr list = pack_index_lists[l];
+///   Index_type len = pack_index_list_lengths[l];
+///   // pack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       buffer[i] = var[list[i]];
+///     }
+///     buffer += len;
+///   }
+///   // send buffer to neighbor
+///   MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+///       mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+/// }
+///
+/// // unpack a buffer for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   // receive buffer from neighbor
+///   MPI_Wait(&unpack_mpi_requests[l], MPI_STATUS_IGNORE);
+///   Real_ptr buffer = unpack_buffers[l];
+///   Int_ptr list = unpack_index_lists[l];
+///   Index_type len = unpack_index_list_lengths[l];
+///   // unpack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       var[list[i]] = buffer[i];
+///     }
+///     buffer += len;
+///   }
+/// }
+///
+/// // wait for all sends to complete
+/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+///
+
+
+#ifndef RAJAPerf_Comm_HALO_EXCHANGE_HPP
+#define RAJAPerf_Comm_HALO_EXCHANGE_HPP
+
+#define HALO_EXCHANGE_DATA_SETUP \
+  HALO_BASE_DATA_SETUP \
+  \
+  Index_type num_vars = m_num_vars; \
+  std::vector<Real_ptr> vars = m_vars; \
+  \
+  std::vector<int> mpi_ranks = m_mpi_ranks; \
+  \
+  std::vector<MPI_Request> pack_mpi_requests(num_neighbors); \
+  std::vector<MPI_Request> unpack_mpi_requests(num_neighbors); \
+  \
+  const DataSpace dataSpace = getDataSpace(vid); \
+  \
+  const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \
+  \
+  std::vector<Real_ptr> pack_buffers = m_pack_buffers; \
+  std::vector<Real_ptr> unpack_buffers = m_unpack_buffers; \
+  \
+  std::vector<Real_ptr> send_buffers = m_send_buffers; \
+  std::vector<Real_ptr> recv_buffers = m_recv_buffers;
+
+
+#include "HALO_base.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+
+#include <mpi.h>
+
+#include <vector>
+
+namespace rajaperf
+{
+namespace comm
+{
+
+class HALO_EXCHANGE : public HALO_base
+{
+public:
+
+  HALO_EXCHANGE(const RunParams& params);
+
+  ~HALO_EXCHANGE();
+
+  void setUp(VariantID vid, size_t tune_idx);
+  void updateChecksum(VariantID vid, size_t tune_idx);
+  void tearDown(VariantID vid, size_t tune_idx);
+
+  void runSeqVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPVariant(VariantID vid, size_t tune_idx);
+  void runCudaVariant(VariantID vid, size_t tune_idx);
+  void runHipVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
+  template < size_t block_size >
+  void runCudaVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runHipVariantImpl(VariantID vid);
+
+private:
+  static const size_t default_gpu_block_size = 256;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
+
+  int m_mpi_size = -1;
+  int m_my_mpi_rank = -1;
+  std::array<int, 3> m_mpi_dims = {-1, -1, -1};
+
+  Index_type m_num_vars;
+  Index_type m_var_size;
+
+  std::vector<Real_ptr> m_vars;
+
+  std::vector<Real_ptr> m_pack_buffers;
+  std::vector<Real_ptr> m_unpack_buffers;
+
+  std::vector<Real_ptr> m_send_buffers;
+  std::vector<Real_ptr> m_recv_buffers;
+};
+
+} // end namespace comm
+} // end namespace rajaperf
+
+#endif
+#endif // closing endif for header file include guard
diff --git a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp
new file mode 100644
index 000000000..a9d161183
--- /dev/null
+++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp
@@ -0,0 +1,416 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "HALO_EXCHANGE_FUSED.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA)
+
+#include "common/CudaDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace comm
+{
+
+#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \
+  Real_ptr*   pack_buffer_ptrs; \
+  Int_ptr*    pack_list_ptrs; \
+  Real_ptr*   pack_var_ptrs; \
+  Index_type* pack_len_ptrs; \
+  allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \
+  Real_ptr*   unpack_buffer_ptrs; \
+  Int_ptr*    unpack_list_ptrs; \
+  Real_ptr*   unpack_var_ptrs; \
+  Index_type* unpack_len_ptrs; \
+  allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars);
+
+#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \
+  deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \
+  deallocData(DataSpace::CudaPinned, pack_list_ptrs); \
+  deallocData(DataSpace::CudaPinned, pack_var_ptrs); \
+  deallocData(DataSpace::CudaPinned, pack_len_ptrs); \
+  deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \
+  deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \
+  deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \
+  deallocData(DataSpace::CudaPinned, unpack_len_ptrs);
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void halo_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs,
+                                         Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs)
+{
+  Index_type j = blockIdx.y;
+
+  Real_ptr buffer = pack_buffer_ptrs[j];
+  Int_ptr list = pack_list_ptrs[j];
+  Real_ptr var = pack_var_ptrs[j];
+  Index_type len = pack_len_ptrs[j];
+
+  for (Index_type i = threadIdx.x + blockIdx.x * block_size;
+       i < len;
+       i += block_size * gridDim.x) {
+    HALO_PACK_BODY;
+  }
+}
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void halo_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs,
+                                           Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs)
+{
+  Index_type j =
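+  // Fused-kernel indexing: blockIdx.y selects one of the
+  // num_neighbors * num_vars (buffer, list, var) segments, and the x
+  // dimension grid-strides over that segment's elements, so a single
+  // launch covers every segment regardless of its index-list length.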
blockIdx.y; + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_UNPACK_BODY; + } +} + + +template < size_t block_size > +void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + constexpr size_t shmem = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_buffer_ptrs[pack_index] = buffer; + pack_list_ptrs[pack_index] = list; + pack_var_ptrs[pack_index] = var; + pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + dim3 pack_nthreads_per_block(block_size); + dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); + RPlaunchCudaKernel( (halo_exchange_fused_pack), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_buffer_ptrs[unpack_index] = buffer; + unpack_list_ptrs[unpack_index] = list; + unpack_var_ptrs[unpack_index] = var; + unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + dim3 unpack_nthreads_per_block(block_size); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); + RPlaunchCudaKernel( (halo_exchange_fused_unpack), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs); + cudaErrchk( 
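+      // The launches above size the grid from a ceiling-divide average:
+      //   pack_len_ave = ceil(pack_len_sum / pack_index)
+      //   pack_nblocks = (ceil(pack_len_ave / block_size), pack_index)
+      // With illustrative numbers only: 26 neighbors * 3 vars = 78
+      // segments; if pack_len_sum = 39000 then pack_len_ave = 500, and
+      // block_size = 256 gives a 2 x 78 grid. Segments longer than the
+      // average are still covered because the kernel grid-strides in x.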
cudaStreamSynchronize( res.get_stream() ) ); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runCudaVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + res.wait(); + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + 
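+      // The WorkGroup constructs above follow a pool -> group -> site
+      // lifecycle. A minimal sketch of that pattern, reusing the typedefs
+      // above (illustrative only; num_loops, pool_storage_bytes, n, and
+      // LoopBody are placeholders, and the block is excluded from the
+      // build):
+#if 0
+      workpool pool(allocatorHolder.template getAllocator<Index_type>());
+      pool.reserve(num_loops, pool_storage_bytes);     // pre-size the pool
+      pool.enqueue(range_segment(0, n), LoopBody{/*...*/}); // stage work
+      workgroup group = pool.instantiate();  // freeze the staged loops
+      worksite site = group.run(res);        // run them as one fused launch
+      res.wait();                            // complete before reusing data
+#endif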
MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void HALO_EXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantDirect(vid); + + } + + t += 1; + + } + + }); + + } + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runCudaVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } + + }); + + } + +} + +void HALO_EXCHANGE_FUSED::setCudaTuningDefinitions(VariantID vid) +{ + if (vid == Base_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + } + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); + + } + + }); + + } + +} +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp new file mode 100644 index 000000000..2ac30479b --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp @@ -0,0 +1,416 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ + Real_ptr* pack_buffer_ptrs; \ + Int_ptr* pack_list_ptrs; \ + Real_ptr* pack_var_ptrs; \ + Index_type* pack_len_ptrs; \ + allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs; \ + Int_ptr* unpack_list_ptrs; \ + Real_ptr* unpack_var_ptrs; \ + Index_type* unpack_len_ptrs; \ + allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ + deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_UNPACK_BODY; + } +} + + +template < size_t block_size > +void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + constexpr size_t shmem = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + 
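+        // (Receives for all neighbors are posted up front, before any
+        // packing or sending, so each MPI_Isend below can match an
+        // already-posted receive and packing can overlap message arrival.)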
mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_buffer_ptrs[pack_index] = buffer; + pack_list_ptrs[pack_index] = list; + pack_var_ptrs[pack_index] = var; + pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + dim3 pack_nthreads_per_block(block_size); + dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); + RPlaunchHipKernel( (halo_exchange_fused_pack), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_buffer_ptrs[unpack_index] = buffer; + unpack_list_ptrs[unpack_index] = list; + unpack_var_ptrs[unpack_index] = var; + unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + dim3 unpack_nthreads_per_block(block_size); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); + RPlaunchHipKernel( (halo_exchange_fused_unpack), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runHipVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + + 
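+    // The dispatch_helper template parameter chooses how the WorkGroup
+    // invokes the enqueued Packer/UnPacker work items (for example,
+    // indirect function-call dispatch vs. a dispatch specialized on the
+    // listed (segment, loop-body) pairs); each helper is exposed as its
+    // own tuning.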
using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + res.wait(); + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + + +void HALO_EXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantDirect(vid); + + } + + t += 1; + + } + + }); + + } + + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runHipVariantWorkGroup(vid); 
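+
+          // tune_idx values are consumed here in the same order that
+          // setHipTuningDefinitions registers names below: one
+          // "direct_<block_size>" tuning per valid block size for
+          // Base_HIP, then one tuning per (dispatch helper, block size)
+          // pair for RAJA_HIP.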
+ + } + + t += 1; + + }); + + } + + }); + + } +} + +void HALO_EXCHANGE_FUSED::setHipTuningDefinitions(VariantID vid) +{ + if (vid == Base_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + } + + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); + + } + + }); + + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp new file mode 100644 index 000000000..1af5d4bb9 --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp @@ -0,0 +1,477 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_EXCHANGE_FUSED::runOpenMPVariantDirect(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < pack_index; j++) { + #pragma omp task firstprivate(j) + { + Real_ptr buffer = pack_ptr_holders[j].buffer; + Int_ptr list = pack_ptr_holders[j].list; + Real_ptr var = pack_ptr_holders[j].var; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < pack_index; j++) { + Real_ptr buffer = pack_ptr_holders[j].buffer; + Int_ptr list = pack_ptr_holders[j].list; + Real_ptr var = pack_ptr_holders[j].var; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + } +#endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, 
send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < unpack_index; j++) { + #pragma omp task firstprivate(j) + { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < unpack_index; j++) { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + } +#endif + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + + case Lambda_OpenMP : { + + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < pack_index; j++) { + #pragma omp task firstprivate(j) + { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < pack_index; j++) { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } +#endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = 
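+        // When the MPI data space is DataSpace::Copy (separate_buffers),
+        // packed data was staged into distinct host send buffers by the
+        // copyData loop above; otherwise send_buffers[l] aliases
+        // pack_buffers[l] and MPI_Isend reads the packed buffer directly.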
pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < unpack_index; j++) { + #pragma omp task firstprivate(j) + { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + unpack_lambda(i); + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < unpack_index; j++) { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + unpack_lambda(i); + } + } +#endif + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } + + default : { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +template < typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runOpenMPVariantWorkGroup(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + + case RAJA_OpenMP : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = 
pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +void HALO_EXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { + + if (tune_idx == t) { + + runOpenMPVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_OpenMP) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runOpenMPVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_EXCHANGE_FUSED::setOpenMPTuningDefinitions(VariantID vid) +{ + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { + + addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_OpenMP) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp new file mode 100644 index 000000000..18c32437d --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp @@ -0,0 +1,358 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + // + // Define threads per team for target execution (unused) + // +//const size_t threads_per_team = 256; + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ + void** pack_ptrs; \ + allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ + Real_ptr* pack_buffer_ptrs = reinterpret_cast(pack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* pack_list_ptrs = reinterpret_cast(pack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* pack_var_ptrs = reinterpret_cast(pack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* pack_len_ptrs = reinterpret_cast(pack_ptrs) + 3 * num_neighbors * num_vars; \ + void** h_pack_ptrs = new void*[4 * num_neighbors * num_vars]; \ + Real_ptr* h_pack_buffer_ptrs = reinterpret_cast(h_pack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* h_pack_list_ptrs = reinterpret_cast(h_pack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* h_pack_var_ptrs = reinterpret_cast(h_pack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* h_pack_len_ptrs = reinterpret_cast(h_pack_ptrs) + 3 * num_neighbors * num_vars; \ + void** unpack_ptrs; \ + allocData(DataSpace::OmpTarget, unpack_ptrs, 4 * num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs = reinterpret_cast(unpack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* unpack_list_ptrs = reinterpret_cast(unpack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* unpack_var_ptrs = reinterpret_cast(unpack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* unpack_len_ptrs = reinterpret_cast(unpack_ptrs) + 3 * num_neighbors * num_vars; \ + void** h_unpack_ptrs = new void*[4 * num_neighbors * num_vars]; \ + Real_ptr* h_unpack_buffer_ptrs = reinterpret_cast(h_unpack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* h_unpack_list_ptrs = reinterpret_cast(h_unpack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* h_unpack_var_ptrs = reinterpret_cast(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* h_unpack_len_ptrs = reinterpret_cast(h_unpack_ptrs) + 3 * num_neighbors * num_vars; + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ + initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ + initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ + deallocData(DataSpace::OmpTarget, pack_ptrs); \ + delete[] h_pack_ptrs; \ + deallocData(DataSpace::OmpTarget, unpack_ptrs); \ + delete[] h_unpack_ptrs; + + +void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantDirect(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + 
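+        // The fuser arguments live in a single DataSpace::OmpTarget
+        // allocation of 4 * num_neighbors * num_vars pointer-sized slots,
+        // partitioned via reinterpret_cast into buffer/list/var/len arrays
+        // (see the setup macro above). With illustrative sizes of 26
+        // neighbors and 3 vars, each array spans 78 slots, at slot offsets
+        // 0, 78, 156, and 234. Host mirrors are filled below and pushed to
+        // the device with one initOpenMPDeviceData call per phase.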
+ for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + h_pack_buffer_ptrs[pack_index] = buffer; + h_pack_list_ptrs[pack_index] = list; + h_pack_var_ptrs[pack_index] = var; + h_pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) + #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) + for (Index_type j = 0; j < pack_index; j++) { + for (Index_type ii = 0; ii < pack_len_ave; ii++) { + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = ii; i < len; i += pack_len_ave) { + HALO_PACK_BODY; + } + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + h_unpack_buffer_ptrs[unpack_index] = buffer; + h_unpack_list_ptrs[unpack_index] = list; + h_unpack_var_ptrs[unpack_index] = var; + h_unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) + #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) + for (Index_type j = 0; j < unpack_index; j++) { + for (Index_type ii = 0; ii < unpack_len_ave; ii++) { + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = ii; i < len; i += unpack_len_ave) { + HALO_UNPACK_BODY; + } + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +template < typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_OpenMPTarget ) { + + 
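+    // In the Base_OpenMPTarget kernels above, collapse(2) flattens the
+    // (segment j, chunk ii) space into pack_index * pack_len_ave (or the
+    // unpack equivalents) iterations; the innermost loop strides by the
+    // average length so longer-than-average segments are still fully
+    // covered. E.g. with len_ave = 500 and len = 1200 (illustrative),
+    // ii = 0 touches i = 0, 500, 1000 and ii = 499 touches i = 499, 999.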
using AllocatorHolder = RAJAPoolAllocatorHolder<
+        RAJA::basic_mempool::MemPool<RAJA::basic_mempool::generic_allocator>>;
+    using Allocator = AllocatorHolder::Allocator;
+
+    AllocatorHolder allocatorHolder;
+
+    using range_segment = RAJA::TypedRangeSegment<Index_type>;
+
+    using dispatch_policy = typename dispatch_helper::template dispatch_policy<
+        camp::list<range_segment, Packer>,
+        camp::list<range_segment, UnPacker>>;
+
+    using workgroup_policy = RAJA::WorkGroupPolicy <
+                                 RAJA::omp_target_work /*<threads_per_team>*/,
+                                 RAJA::ordered,
+                                 RAJA::constant_stride_array_of_objects,
+                                 dispatch_policy >;
+
+    using workpool = RAJA::WorkPool< workgroup_policy,
+                                     Index_type,
+                                     RAJA::xargs<>,
+                                     Allocator >;
+
+    using workgroup = RAJA::WorkGroup< workgroup_policy,
+                                       Index_type,
+                                       RAJA::xargs<>,
+                                       Allocator >;
+
+    using worksite = RAJA::WorkSite< workgroup_policy,
+                                     Index_type,
+                                     RAJA::xargs<>,
+                                     Allocator >;
+
+    workpool pool_pack  (allocatorHolder.template getAllocator<Index_type>());
+    workpool pool_unpack(allocatorHolder.template getAllocator<Index_type>());
+    pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
+    pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Index_type len = unpack_index_list_lengths[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+            mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
+        Int_ptr list = pack_index_lists[l];
+        Index_type len = pack_index_list_lengths[l];
+        for (Index_type v = 0; v < num_vars; ++v) {
+          Real_ptr var = vars[v];
+          pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list});
+          buffer += len;
+        }
+      }
+      workgroup group_pack = pool_pack.instantiate();
+      worksite site_pack = group_pack.run();
+      if (separate_buffers) {
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Index_type len = pack_index_list_lengths[l];
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+      }
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Index_type len = pack_index_list_lengths[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+            mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+      }
+
+      MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = unpack_buffers[l];
+        Int_ptr list = unpack_index_lists[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
+        for (Index_type v = 0; v < num_vars; ++v) {
+          Real_ptr var = vars[v];
+          pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list});
+          buffer += len;
+        }
+      }
+      workgroup group_unpack = pool_unpack.instantiate();
+      worksite site_unpack = group_unpack.run();
+
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+    }
+    stopTimer();
+
+  } else {
+     getCout() << "\n HALO_EXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+}
+
+void HALO_EXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if (vid == Base_OpenMPTarget) {
+
+    if (tune_idx == t) {
+
+      runOpenMPTargetVariantDirect(vid);
+
+    }
+
+    t += 1;
+
+  }
+
+  if (vid == RAJA_OpenMPTarget) {
+
+    seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) {
+
+      if (tune_idx == t) {
+
+        runOpenMPTargetVariantWorkGroup<decltype(dispatch_helper)>(vid);
+
+      }
+
+      t += 1;
+
});
+
+  }
+}
+
+void HALO_EXCHANGE_FUSED::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  if (vid == Base_OpenMPTarget) {
+
+    addVariantTuningName(vid, "direct");
+
+  }
+
+  if (vid == RAJA_OpenMPTarget) {
+
+    seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) {
+
+      addVariantTuningName(vid, decltype(dispatch_helper)::get_name());
+
+    });
+
+  }
+}
+
+} // end namespace comm
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_TARGET_OPENMP
diff --git a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp
new file mode 100644
index 000000000..bca51de0d
--- /dev/null
+++ b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp
@@ -0,0 +1,399 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "HALO_EXCHANGE_FUSED.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace comm
+{
+
+
+void HALO_EXCHANGE_FUSED::runSeqVariantDirect(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  HALO_EXCHANGE_FUSED_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Index_type len = unpack_index_list_lengths[l];
+          MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+              mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+        }
+
+        Index_type pack_index = 0;
+
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Real_ptr buffer = pack_buffers[l];
+          Int_ptr list = pack_index_lists[l];
+          Index_type len = pack_index_list_lengths[l];
+          for (Index_type v = 0; v < num_vars; ++v) {
+            Real_ptr var = vars[v];
+            pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var};
+            pack_lens[pack_index] = len;
+            pack_index += 1;
+            buffer += len;
+          }
+        }
+        for (Index_type j = 0; j < pack_index; j++) {
+          Real_ptr buffer = pack_ptr_holders[j].buffer;
+          Int_ptr list = pack_ptr_holders[j].list;
+          Real_ptr var = pack_ptr_holders[j].var;
+          Index_type len = pack_lens[j];
+          for (Index_type i = 0; i < len; i++) {
+            HALO_PACK_BODY;
+          }
+        }
+        if (separate_buffers) {
+          for (Index_type l = 0; l < num_neighbors; ++l) {
+            Index_type len = pack_index_list_lengths[l];
+            copyData(DataSpace::Host, send_buffers[l],
+                     dataSpace, pack_buffers[l],
+                     len*num_vars);
+          }
+        }
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Index_type len = pack_index_list_lengths[l];
+          MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+              mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+        }
+
+        MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+        Index_type unpack_index = 0;
+
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Real_ptr buffer = unpack_buffers[l];
+          Int_ptr list = unpack_index_lists[l];
+          Index_type len = unpack_index_list_lengths[l];
+          if (separate_buffers) {
+            copyData(dataSpace, unpack_buffers[l],
+                     DataSpace::Host, recv_buffers[l],
+                     len*num_vars);
+          }
+
+          for (Index_type v = 0; v < num_vars; ++v) {
+            Real_ptr var = vars[v];
+            unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var};
+            unpack_lens[unpack_index] = len;
+            unpack_index += 1;
+            buffer += len;
+          }
+ } + for (Index_type j = 0; j < unpack_index; j++) { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } + for (Index_type j = 0; j < pack_index; j++) { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } + for (Index_type j = 0; j < unpack_index; j++) { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + unpack_lambda(i); + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +template < typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runSeqVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder 
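+      // The mempool-backed allocator below provides storage for the
+      // WorkPool/WorkGroup so the per-rep enqueue/instantiate cycle reuses
+      // memory rather than reallocating; reserve() pre-sizes it for
+      // num_neighbors * num_vars loops with a 1024*1024 storage hint.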
allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::seq_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +void HALO_EXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_Seq || vid == Lambda_Seq) { + + if (tune_idx == t) { + + runSeqVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_Seq) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runSeqVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_EXCHANGE_FUSED::setSeqTuningDefinitions(VariantID vid) +{ + if (vid == Base_Seq || vid == Lambda_Seq) { + + 
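+    // Registration order here must mirror the tune_idx order in
+    // runSeqVariant above: the single "direct" tuning shared by the Base
+    // and Lambda variants, then one name per dispatch helper for RAJA_Seq.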
addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_Seq) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp new file mode 100644 index 000000000..be76571a2 --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED.cpp @@ -0,0 +1,165 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +namespace rajaperf +{ +namespace comm +{ + +HALO_EXCHANGE_FUSED::HALO_EXCHANGE_FUSED(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_EXCHANGE_FUSED, params) +{ + m_mpi_size = params.getMPISize(); + m_my_mpi_rank = params.getMPIRank(); + m_mpi_dims = params.getMPI3DDivision(); + + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 ); + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // send + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // recv + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Workgroup); + setUsesFeature(MPI); + + if (params.validMPI3DDivision()) { + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + } +} + +HALO_EXCHANGE_FUSED::~HALO_EXCHANGE_FUSED() +{ +} + +void HALO_EXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) +{ + setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], 
buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void HALO_EXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + +void HALO_EXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE_FUSED.hpp b/src/comm/HALO_EXCHANGE_FUSED.hpp new file mode 100644 index 000000000..a0962be3a --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED.hpp @@ -0,0 +1,209 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
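
The buffer setup above hinges on separate_buffers: when getMPIDataSpace(vid) is DataSpace::Copy, the packing buffer and the buffer handed to MPI are distinct allocations with an explicit staging copy between them; otherwise the two pointers alias and no copy is needed. A minimal sketch of that aliasing decision, with plain new[] standing in for the suite's allocAndInitData and DataSpace machinery:

#include <cstddef>
#include <vector>

// pack[l] is written by the pack kernel; send[l] is handed to MPI_Isend.
struct HaloBuffers {
  std::vector<double*> pack;
  std::vector<double*> send;
};

HaloBuffers make_buffers(std::size_t num_neighbors, std::size_t len,
                         bool separate_buffers) {
  HaloBuffers b;
  b.pack.resize(num_neighbors, nullptr);
  b.send.resize(num_neighbors, nullptr);
  for (std::size_t l = 0; l < num_neighbors; ++l) {
    b.pack[l] = new double[len];                    // e.g. device-accessible
    b.send[l] = separate_buffers ? new double[len]  // host staging buffer
                                 : b.pack[l];       // alias: no extra copy
  }
  return b;
}
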
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// HALO_EXCHANGE_FUSED kernel reference implementation:
+///
+/// // post a recv for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Index_type len = unpack_index_list_lengths[l];
+///   MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+///       mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+/// }
+///
+/// // pack buffers for neighbors
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Real_ptr buffer = pack_buffers[l];
+///   Int_ptr list = pack_index_lists[l];
+///   Index_type len = pack_index_list_lengths[l];
+///   // pack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       buffer[i] = var[list[i]];
+///     }
+///     buffer += len;
+///   }
+/// }
+///
+/// // send buffers to neighbors
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Index_type len = pack_index_list_lengths[l];
+///   MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+///       mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+/// }
+///
+/// // wait for all recvs to complete
+/// MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+///
+/// // unpack buffers for neighbors
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Real_ptr buffer = unpack_buffers[l];
+///   Int_ptr list = unpack_index_lists[l];
+///   Index_type len = unpack_index_list_lengths[l];
+///   // unpack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       var[list[i]] = buffer[i];
+///     }
+///     buffer += len;
+///   }
+/// }
+///
+/// // wait for all sends to complete
+/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+///
+
+#ifndef RAJAPerf_Comm_HALO_EXCHANGE_FUSED_HPP
+#define RAJAPerf_Comm_HALO_EXCHANGE_FUSED_HPP
+
+#define HALO_EXCHANGE_FUSED_DATA_SETUP \
+  HALO_BASE_DATA_SETUP \
+  \
+  Index_type num_vars = m_num_vars; \
+  std::vector<Real_ptr> vars = m_vars; \
+  \
+  std::vector<int> mpi_ranks = m_mpi_ranks; \
+  \
+  std::vector<MPI_Request> pack_mpi_requests(num_neighbors); \
+  std::vector<MPI_Request> unpack_mpi_requests(num_neighbors); \
+  \
+  const DataSpace dataSpace = getDataSpace(vid); \
+  \
+  const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \
+  \
+  std::vector<Real_ptr> pack_buffers = m_pack_buffers; \
+  std::vector<Real_ptr> unpack_buffers = m_unpack_buffers; \
+  \
+  std::vector<Real_ptr> send_buffers = m_send_buffers; \
+  std::vector<Real_ptr> recv_buffers = m_recv_buffers;
+
+#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP \
+  struct ptr_holder { \
+    Real_ptr buffer; \
+    Int_ptr list; \
+    Real_ptr var; \
+  }; \
+  ptr_holder* pack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \
+  Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \
+  ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \
+  Index_type* unpack_lens = new Index_type[num_neighbors * num_vars];
+
+#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \
+  delete[] pack_ptr_holders; \
+  delete[] pack_lens; \
+  delete[] unpack_ptr_holders; \
+  delete[] unpack_lens;
+
+
+#define HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \
+  auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \
+    return [=](Index_type i) { \
+      HALO_PACK_BODY; \
+    }; \
+  }; \
+  using pack_lambda_type = decltype(make_pack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \
+  pack_lambda_type* pack_lambdas =
reinterpret_cast( \ + malloc(sizeof(pack_lambda_type) * (num_neighbors * num_vars))); \ + Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \ + auto make_unpack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ + return [=](Index_type i) { \ + HALO_UNPACK_BODY; \ + }; \ + }; \ + using unpack_lambda_type = decltype(make_unpack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ + unpack_lambda_type* unpack_lambdas = reinterpret_cast( \ + malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \ + Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; + +#define HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ + free(pack_lambdas); \ + delete[] pack_lens; \ + free(unpack_lambdas); \ + delete[] unpack_lens; + + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +namespace rajaperf +{ +namespace comm +{ + +class HALO_EXCHANGE_FUSED : public HALO_base +{ +public: + + HALO_EXCHANGE_FUSED(const RunParams& params); + + ~HALO_EXCHANGE_FUSED(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + void runSeqVariantDirect(VariantID vid); + void runOpenMPVariantDirect(VariantID vid); + void runOpenMPTargetVariantDirect(VariantID vid); + template < size_t block_size > + void runCudaVariantDirect(VariantID vid); + template < size_t block_size > + void runHipVariantDirect(VariantID vid); + + template < typename dispatch_helper > + void runSeqVariantWorkGroup(VariantID vid); + template < typename dispatch_helper > + void runOpenMPVariantWorkGroup(VariantID vid); + template < typename dispatch_helper > + void runOpenMPTargetVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_helper > + void runCudaVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_helper > + void runHipVariantWorkGroup(VariantID vid); + +private: + static const size_t default_gpu_block_size = 1024; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + int m_mpi_size = -1; + int m_my_mpi_rank = -1; + std::array m_mpi_dims = {-1, -1, -1}; + + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_vars; + + std::vector m_pack_buffers; + std::vector m_unpack_buffers; + + std::vector m_send_buffers; + std::vector m_recv_buffers; +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif +#endif // closing endif for header file include guard diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/comm/HALO_PACKING-Cuda.cpp similarity index 54% rename from src/apps/HALOEXCHANGE-Cuda.cpp rename to src/comm/HALO_PACKING-Cuda.cpp index 3a8ae049b..6e09d0805 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/comm/HALO_PACKING-Cuda.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence 
Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,42 +18,42 @@ namespace rajaperf { -namespace apps +namespace comm { template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) +void HALO_PACKING::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -61,32 +61,49 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - haloexchange_pack<<>>(buffer, list, var, len); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (halo_packing_pack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); } - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - haloexchange_unpack<<>>(buffer, list, var, len); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (halo_packing_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); buffer += len; } } @@ -103,34 +120,47 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - 
Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=] __device__ (Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + res.wait(); } - res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=] __device__ (Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -140,13 +170,13 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_PACKING, Cuda) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/comm/HALO_PACKING-Hip.cpp similarity index 54% rename from src/apps/HALOEXCHANGE-Hip.cpp rename to src/comm/HALO_PACKING-Hip.cpp index 9831b6a69..583804396 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/comm/HALO_PACKING-Hip.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
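
The CUDA variant above launches one thread per index-list element, and HALO_PACK_BODY expands to a gather through the list. A self-contained sketch of that kernel shape and its ceiling-division launch math; the names are hypothetical, and a raw cudaStream_t stands in for the suite's resource object and RPlaunchCudaKernel wrapper:

#include <cuda_runtime.h>

// Gather one element of the halo through the index list.
template <int block_size>
__launch_bounds__(block_size)
__global__ void pack_kernel(double* buffer, const int* list,
                            const double* var, int len)
{
  int i = threadIdx.x + blockIdx.x * block_size;
  if (i < len) {
    buffer[i] = var[list[i]];  // the HALO_PACK_BODY shape
  }
}

void launch_pack(double* buffer, const int* list, const double* var,
                 int len, cudaStream_t stream)
{
  constexpr int block_size = 256;
  int nblocks = (len + block_size - 1) / block_size;  // ceiling division
  pack_kernel<block_size><<<nblocks, block_size, 0, stream>>>(
      buffer, list, var, len);
}
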
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,42 +18,42 @@ namespace rajaperf { -namespace apps +namespace comm { template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE::runHipVariantImpl(VariantID vid) +void HALO_PACKING::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_HIP ) { @@ -61,34 +61,49 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), - buffer, list, var, len); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (halo_packing_pack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), - buffer, list, var, len); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (halo_packing_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); buffer += len; } } @@ -105,34 +120,47 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - 
Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=] __device__ (Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + res.wait(); } - res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=] __device__ (Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -142,13 +170,13 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_PACKING, Hip) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_HIP diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/comm/HALO_PACKING-OMP.cpp similarity index 55% rename from src/apps/HALOEXCHANGE-OMP.cpp rename to src/comm/HALO_PACKING-OMP.cpp index 050046479..bb760f479 100644 --- a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/comm/HALO_PACKING-OMP.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
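
The OpenMP variant of HALO_PACKING that follows parallelizes each (neighbor, variable) pair's loop with a flat parallel-for; the pack body gathers through the index list and the unpack body scatters back. A minimal sketch of those two loop shapes as hypothetical free functions:

// Gather halo values into a contiguous buffer (the HALO_PACK_BODY shape).
void pack_one(double* buffer, const int* list, const double* var, int len)
{
  #pragma omp parallel for
  for (int i = 0; i < len; i++) {
    buffer[i] = var[list[i]];
  }
}

// Scatter received values back into the variable (the HALO_UNPACK_BODY shape).
void unpack_one(const double* buffer, const int* list, double* var, int len)
{
  #pragma omp parallel for
  for (int i = 0; i < len; i++) {
    var[list[i]] = buffer[i];
  }
}
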
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -14,17 +14,17 @@ namespace rajaperf { -namespace apps +namespace comm { -void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; switch ( vid ) { @@ -34,28 +34,40 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -73,34 +85,46 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); + halo_packing_pack_base_lam(i); } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - 
haloexchange_unpack_base_lam(i); + halo_packing_unpack_base_lam(i); } buffer += len; } @@ -120,33 +144,45 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -158,7 +194,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } default : { - getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown variant id = " << vid << std::endl; } } @@ -168,5 +204,5 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu #endif } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/comm/HALO_PACKING-OMPTarget.cpp similarity index 60% rename from src/apps/HALOEXCHANGE-OMPTarget.cpp rename to src/comm/HALO_PACKING-OMPTarget.cpp index e4f0f561e..d25f4f747 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/comm/HALO_PACKING-OMPTarget.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
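
The OpenMP target variant that follows offloads the same loops; is_device_ptr tells the runtime the pointers are already device-resident so it must not remap them, and device(did) selects the target device. A sketch of that offload shape, assuming the device allocations happen elsewhere:

// Offload one pack loop; buffer/list/var are assumed device-allocated.
void pack_one_target(double* buffer, const int* list, const double* var,
                     int len, int did)
{
  #pragma omp target is_device_ptr(buffer, list, var) device(did)
  #pragma omp teams distribute parallel for schedule(static, 1)
  for (int i = 0; i < len; i++) {
    buffer[i] = var[list[i]];
  }
}
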
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { // @@ -27,11 +27,11 @@ namespace apps const size_t threads_per_team = 256; -void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -39,30 +39,42 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp target is_device_ptr(buffer, list, var) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp target is_device_ptr(buffer, list, var) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -79,33 +91,45 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto 
haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -114,11 +138,11 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown OMP Target variant id = " << vid << std::endl; } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/comm/HALO_PACKING-Seq.cpp similarity index 53% rename from src/apps/HALOEXCHANGE-Seq.cpp rename to src/comm/HALO_PACKING-Seq.cpp index fa9eb591f..066116433 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/comm/HALO_PACKING-Seq.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -14,15 +14,15 @@ namespace rajaperf { -namespace apps +namespace comm { -void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; switch ( vid ) { @@ -31,27 +31,40 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -70,32 +83,44 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len 
= pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); + halo_packing_pack_base_lam(i); } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; for (Index_type i = 0; i < len; i++) { - haloexchange_unpack_base_lam(i); + halo_packing_unpack_base_lam(i); } buffer += len; } @@ -115,33 +140,45 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -154,12 +191,12 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown variant id = " << vid << std::endl; } } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp new file mode 100644 index 000000000..f1569d3aa --- /dev/null +++ b/src/comm/HALO_PACKING.cpp @@ -0,0 +1,163 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 
2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_PACKING.hpp" + +#include "RAJA/RAJA.hpp" + +namespace rajaperf +{ +namespace comm +{ + +HALO_PACKING::HALO_PACKING(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_PACKING, params) +{ + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Forall); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +HALO_PACKING::~HALO_PACKING() +{ +} + +void HALO_PACKING::setUp(VariantID vid, size_t tune_idx) +{ + int my_mpi_rank = 0; + const int mpi_dims[3] = {1,1,1}; + setUp_base(my_mpi_rank, mpi_dims, vid, tune_idx); + + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void HALO_PACKING::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; 
++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += calcChecksum(DataSpace::Host, m_send_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_send_buffers[l], buffer_len, vid); + } + } +} + +void HALO_PACKING::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf diff --git a/src/comm/HALO_PACKING.hpp b/src/comm/HALO_PACKING.hpp new file mode 100644 index 000000000..7b4531e74 --- /dev/null +++ b/src/comm/HALO_PACKING.hpp @@ -0,0 +1,118 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HALO_PACKING kernel reference implementation: +/// +/// // pack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = pack_buffers[l]; +/// Int_ptr list = pack_index_lists[l]; +/// Index_type len = pack_index_list_lengths[l]; +/// // pack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// buffer[i] = var[list[i]]; +/// } +/// buffer += len; +/// } +/// } +/// +/// // unpack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = unpack_buffers[l]; +/// Int_ptr list = unpack_index_lists[l]; +/// Index_type len = unpack_index_list_lengths[l]; +/// // unpack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// var[list[i]] = buffer[i]; +/// } +/// buffer += len; +/// } +/// } +/// + +#ifndef RAJAPerf_Comm_HALO_PACKING_HPP +#define RAJAPerf_Comm_HALO_PACKING_HPP + +#define HALO_PACKING_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ + \ + Index_type num_vars = m_num_vars; \ + std::vector vars = m_vars; \ + \ + const DataSpace dataSpace = getDataSpace(vid); \ + \ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \ + \ + std::vector pack_buffers = m_pack_buffers; \ + std::vector unpack_buffers = m_unpack_buffers; \ + \ + std::vector send_buffers = m_send_buffers; \ + std::vector recv_buffers = m_recv_buffers; + + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + +class HALO_PACKING : public HALO_base +{ +public: + + 
HALO_PACKING(const RunParams& params); + + ~HALO_PACKING(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_vars; + + std::vector m_pack_buffers; + std::vector m_unpack_buffers; + + std::vector m_send_buffers; + std::vector m_recv_buffers; +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_PACKING_FUSED-Cuda.cpp similarity index 54% rename from src/apps/HALOEXCHANGE_FUSED-Cuda.cpp rename to src/comm/HALO_PACKING_FUSED-Cuda.cpp index 791742b72..7541a30ef 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALO_PACKING_FUSED-Cuda.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
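
The fused GPU variants that follow collapse every (neighbor, variable) pack loop into a single launch: pointer and length arrays are staged in pinned memory, blockIdx.y selects the fused work item, and the x dimension grid-strides so the block count can be sized for the average list length rather than the longest one. A self-contained sketch of that kernel shape with hypothetical names:

#include <cuda_runtime.h>

// One launch covers all fused work items: blockIdx.y picks the item,
// the x dimension grid-strides over that item's elements.
template <int block_size>
__launch_bounds__(block_size)
__global__ void fused_pack(double* const* buffers, const int* const* lists,
                           const double* const* vars, const int* lens)
{
  int j = blockIdx.y;            // which fused (neighbor, variable) item
  double* buffer = buffers[j];
  const int* list = lists[j];
  const double* var = vars[j];
  int len = lens[j];

  for (int i = threadIdx.x + blockIdx.x * block_size;
       i < len;
       i += block_size * gridDim.x) {  // grid-stride: short lists finish
    buffer[i] = var[list[i]];          // early, long lists keep looping
  }
}

// Launch: x blocks sized for the average length, y = number of work items.
void launch_fused(double* const* buffers, const int* const* lists,
                  const double* const* vars, const int* lens,
                  int num_items, int len_ave, cudaStream_t stream)
{
  constexpr int block_size = 1024;
  dim3 nblocks((len_ave + block_size - 1) / block_size, num_items);
  fused_pack<block_size><<<nblocks, block_size, 0, stream>>>(
      buffers, lists, vars, lens);
}

Sizing the x dimension for the average length keeps short lists from wasting blocks while the grid-stride loop still covers the longest ones.
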
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -18,10 +18,10 @@ namespace rajaperf { -namespace apps +namespace comm { -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace apps allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ @@ -51,8 +51,10 @@ namespace apps template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, - Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) +__global__ void halo_packing_fused_pack(Real_ptr* pack_buffer_ptrs, + Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, + Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -64,14 +66,16 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; } } template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, - Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) +__global__ void halo_packing_fused_unpack(Real_ptr* unpack_buffer_ptrs, + Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, + Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -83,23 +87,23 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) +void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; if ( vid == Base_CUDA ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -110,9 +114,9 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_buffer_ptrs[pack_index] = buffer; @@ -127,18 +131,36 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - haloexchange_fused_pack<<>>( - 
pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (halo_packing_fused_pack), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs ); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); Index_type unpack_index = 0; Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_buffer_ptrs[unpack_index] = buffer; @@ -150,30 +172,57 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) buffer += len; } } - Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / + unpack_index; dim3 unpack_nthreads_per_block(block_size); - dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - haloexchange_fused_unpack<<>>( - unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); - cudaErrchk( cudaGetLastError() ); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, + unpack_index); + RPlaunchCudaKernel( (halo_packing_fused_unpack), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n HALO_PACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename dispatch_helper > +void HALO_PACKING_FUSED::runCudaVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALO_PACKING_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { using AllocatorHolder = RAJAPoolAllocatorHolder; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::cuda_work_async, RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -199,36 +248,40 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto 
using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -199,36 +248,40 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment<Index_type>(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment<Index_type>(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -240,13 +293,102 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Cuda) +void HALO_PACKING_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantDirect<block_size>(vid); + + } + + t += 1; + + } + + }); + + } + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runCudaVariantWorkGroup<block_size, decltype(dispatch_helper)>(vid); + + } + + t += 1; + + }); + + } + + }); + + } + +} + +void HALO_PACKING_FUSED::setCudaTuningDefinitions(VariantID vid) +{ + if (vid == Base_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + } + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); + + } + + }); + + } + +} -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA
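// runCudaVariant and setCudaTuningDefinitions above walk identical seq_for
// nests, so tune_idx enumerates tunings in exactly the order their names are
// registered. A standalone illustration of that enumeration, with hypothetical
// block sizes and dispatch-helper names (the real lists come from RunParams
// and the workgroup dispatch helpers):
#include <cstdio>
#include <string>
#include <vector>
int main() {
  const std::vector<int> block_sizes = {256, 1024};
  const std::vector<std::string> helpers = {"direct", "funcptr"};
  size_t t = 0;
  for (int bs : block_sizes) {        // outer seq_for over block sizes
    for (const auto& h : helpers) {   // inner seq_for over dispatch helpers
      std::printf("tune_idx %zu -> %s_%d\n", t++, h.c_str(), bs);
    }
  }
  return 0;
}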
diff --git a/src/comm/HALO_PACKING_FUSED-Hip.cpp b/src/comm/HALO_PACKING_FUSED-Hip.cpp new file mode 100644 index 000000000..7b4d9b064 --- /dev/null +++ b/src/comm/HALO_PACKING_FUSED-Hip.cpp @@ -0,0 +1,391 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_PACKING_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace comm +{ + +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP \ + Real_ptr* pack_buffer_ptrs; \ + Int_ptr* pack_list_ptrs; \ + Real_ptr* pack_var_ptrs; \ + Index_type* pack_len_ptrs; \ + allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs; \ + Int_ptr* unpack_list_ptrs; \ + Real_ptr* unpack_var_ptrs; \ + Index_type* unpack_len_ptrs; \ + allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ + deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_packing_fused_pack(Real_ptr* pack_buffer_ptrs, + Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, + Index_type* pack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_packing_fused_unpack(Real_ptr* unpack_buffer_ptrs, + Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, + Index_type* unpack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_UNPACK_BODY; + } +} + +
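// The argument tables set up above live in pinned, coarse-grained host memory
// (DataSpace::HipPinnedCoarse) so the host can refill them every repetition
// while the kernels read them directly. Roughly what that data space maps to,
// as a sketch -- the suite's allocData/deallocData wrap this, and the exact
// flag choice here is an assumption:
#include <hip/hip_runtime.h>
int main() {
  double** pack_buffer_ptrs = nullptr;
  // Non-coherent ("coarse-grained") pinned allocation, synchronized at
  // stream/event boundaries rather than per access.
  hipHostMalloc(reinterpret_cast<void**>(&pack_buffer_ptrs),
                26 * 3 * sizeof(double*),   // e.g. num_neighbors * num_vars
                hipHostMallocNonCoherent);
  hipHostFree(pack_buffer_ptrs);
  return 0;
}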
+template < size_t block_size > +void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_PACKING_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + constexpr size_t shmem = 0; + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_buffer_ptrs[pack_index] = buffer; + pack_list_ptrs[pack_index] = list; + pack_var_ptrs[pack_index] = var; + pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + dim3 pack_nthreads_per_block(block_size); + dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); + RPlaunchHipKernel( (halo_packing_fused_pack<block_size>), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs ); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_buffer_ptrs[unpack_index] = buffer; + unpack_list_ptrs[unpack_index] = list; + unpack_var_ptrs[unpack_index] = var; + unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / + unpack_index; + dim3 unpack_nthreads_per_block(block_size); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, + unpack_index); + RPlaunchHipKernel( (halo_packing_fused_unpack<block_size>), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + + } else { + getCout() << "\n HALO_PACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename dispatch_helper > +void HALO_PACKING_FUSED::runHipVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_PACKING_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder<RAJA::hip::device_mempool_type>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment<Index_type>; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list<range_segment, Packer>, + camp::list<range_segment, UnPacker>>; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async<block_size>, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; +
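// The pool objects constructed next follow a reserve/enqueue/instantiate/run
// lifecycle: many small loops are recorded, then replayed as one fused launch.
// A toy, RAJA-free model of that flow (std::function stands in for the pool's
// stored loop bodies; the real pool stores them contiguously and runs them in
// a single kernel):
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>
int main() {
  std::vector<std::pair<int, std::function<void(int)>>> pool;  // (len, body)
  double buf[4] = {0}, var[4] = {10, 11, 12, 13};
  int list[4] = {3, 2, 1, 0};
  pool.emplace_back(4, [&](int i) { buf[i] = var[list[i]]; }); // cf. enqueue
  for (auto& [len, body] : pool) {       // cf. instantiate() + run()
    for (int i = 0; i < len; ++i) { body(i); }
  }
  pool.clear();                          // cf. the pool emptied by instantiate()
  std::printf("buf[0] = %.0f\n", buf[0]);  // gathers var[list[0]] = 13
  return 0;
}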
+ workpool pool_pack (allocatorHolder.template getAllocator<char>()); + workpool pool_unpack(allocatorHolder.template getAllocator<char>()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + res.wait(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_PACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +void HALO_PACKING_FUSED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantDirect<block_size>(vid); + + } + + t += 1; + + } + + }); + + } + + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runHipVariantWorkGroup<block_size, decltype(dispatch_helper)>(vid); + + } + + t += 1; + + }); + + } + + }); + + } +} + +void HALO_PACKING_FUSED::setHipTuningDefinitions(VariantID vid) +{ + if (vid == Base_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + } + + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); + + } + + }); + + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_PACKING_FUSED-OMP.cpp similarity index 64% rename from src/apps/HALOEXCHANGE_FUSED-OMP.cpp rename to src/comm/HALO_PACKING_FUSED-OMP.cpp index 6f228a8f6..143a65501 100644 ---
a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMP.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -14,23 +14,23 @@ namespace rajaperf { -namespace apps +namespace comm { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runOpenMPVariantDirect(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; switch ( vid ) { case Base_OpenMP : { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -38,9 +38,9 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; @@ -61,7 +61,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; } } } @@ -73,17 +73,31 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; } } #endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; @@ -104,7 +118,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } } @@ -116,7 +130,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - 
HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } #endif @@ -124,14 +138,14 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN; break; } case Lambda_OpenMP : { - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -139,9 +153,9 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); @@ -174,13 +188,27 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } } #endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); @@ -217,11 +245,33 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } + default : { + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +template < typename dispatch_helper > +void HALO_PACKING_FUSED::runOpenMPVariantWorkGroup(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_PACKING_FUSED_DATA_SETUP; + + switch ( vid ) { + case RAJA_OpenMP : { using AllocatorHolder = RAJAPoolAllocatorHolder< @@ -230,10 +280,17 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_work, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -259,35 +316,39 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = 
pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -301,7 +362,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; } } @@ -311,5 +372,57 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ #endif } -} // end namespace apps +void HALO_PACKING_FUSED::runOpenMPVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { + + if (tune_idx == t) { + + runOpenMPVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_OpenMP) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runOpenMPVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_PACKING_FUSED::setOpenMPTuningDefinitions(VariantID vid) +{ + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { + + addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_OpenMP) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + + } +} + +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp similarity index 68% rename from src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp rename to src/comm/HALO_PACKING_FUSED-OMPTarget.cpp index 4dd2dad31..ab0b075b4 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
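// Throughout these variants the packing target depends on the MPI data space:
// when it is DataSpace::Copy, the pack/unpack buffers and the send/recv
// buffers handed to MPI are distinct allocations and copyData stages between
// them; otherwise they alias. A toy model of that aliasing decision (memcpy
// standing in for the suite's copyData):
#include <cstdio>
#include <cstring>
int main() {
  const bool separate_buffers = true;   // cf. getMPIDataSpace(vid) == DataSpace::Copy
  double pack_buffer[4] = {1, 2, 3, 4}; // filled by the pack kernel
  double staging[4] = {0};
  double* send_buffer = separate_buffers ? staging : pack_buffer;
  if (separate_buffers) {
    std::memcpy(send_buffer, pack_buffer, sizeof(pack_buffer)); // cf. copyData
  }
  std::printf("send_buffer[3] = %.0f\n", send_buffer[3]);
  return 0;
}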
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { // @@ -26,7 +26,7 @@ namespace apps // //const size_t threads_per_team = 256; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ void** pack_ptrs; \ allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ Real_ptr* pack_buffer_ptrs = reinterpret_cast<Real_ptr*>(pack_ptrs) + 0 * num_neighbors * num_vars; \ @@ -50,28 +50,28 @@ namespace apps Real_ptr* h_unpack_var_ptrs = reinterpret_cast<Real_ptr*>(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ Index_type* h_unpack_len_ptrs = reinterpret_cast<Index_type*>(h_unpack_ptrs) + 3 * num_neighbors * num_vars; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ deallocData(DataSpace::OmpTarget, pack_ptrs); \ delete[] h_pack_ptrs; \ deallocData(DataSpace::OmpTarget, unpack_ptrs); \ delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runOpenMPTargetVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -80,7 +80,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -94,7 +94,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U buffer += len; } } - HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -107,18 +107,32 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = pack_len_ptrs[j]; for (Index_type i = ii; i < len; i += pack_len_ave) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; } } } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; +
Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; h_unpack_buffer_ptrs[unpack_index] = buffer; @@ -130,7 +144,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U buffer += len; } } - HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -143,7 +157,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = unpack_len_ptrs[j]; for (Index_type i = ii; i < len; i += unpack_len_ave) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } } @@ -151,9 +165,21 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n HALO_PACKING_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +template < typename dispatch_helper > +void HALO_PACKING_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); - } else if ( vid == RAJA_OpenMPTarget ) { + HALO_PACKING_FUSED_DATA_SETUP; + + if ( vid == RAJA_OpenMPTarget ) { using AllocatorHolder = RAJAPoolAllocatorHolder< RAJA::basic_mempool::MemPool>; @@ -161,10 +187,17 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_target_work /**/, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -190,35 +223,39 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = 
unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -229,11 +266,63 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +void HALO_PACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_OpenMPTarget) { + + if (tune_idx == t) { + + runOpenMPTargetVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_OpenMPTarget) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runOpenMPTargetVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_PACKING_FUSED::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + if (vid == Base_OpenMPTarget) { + + addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_OpenMPTarget) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_PACKING_FUSED-Seq.cpp similarity index 58% rename from src/apps/HALOEXCHANGE_FUSED-Seq.cpp rename to src/comm/HALO_PACKING_FUSED-Seq.cpp index e6aa5fdbe..f7c16e253 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALO_PACKING_FUSED-Seq.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
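// The Base_OpenMPTarget fuser above pairs a collapse(2) loop over (fused loop
// j, chunk offset ii) with an inner stride of the average length -- the same
// shape the GPU kernels get from their 2D grids. A host-only sketch of that
// iteration pattern (same arithmetic, no offload needed to follow it):
#include <cstdio>
int main() {
  const int lens[] = {5, 13};          // two fused loops
  const int num_fused = 2;
  const int len_ave = (5 + 13 + num_fused - 1) / num_fused;   // ceiling mean: 9
  for (int j = 0; j < num_fused; ++j) {       // collapse(2), dimension 1
    for (int ii = 0; ii < len_ave; ++ii) {    // collapse(2), dimension 2
      for (int i = ii; i < lens[j]; i += len_ave) {
        std::printf("fused loop %d: index %d (chunk %d)\n", j, i, ii);
      }
    }
  }
  return 0;
}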
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -14,21 +14,21 @@ namespace rajaperf { -namespace apps +namespace comm { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runSeqVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; switch ( vid ) { case Base_Seq : { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -36,9 +36,9 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; @@ -53,16 +53,30 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); } } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; @@ -77,14 +91,14 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN; break; } @@ -92,7 +106,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -100,9 +114,9 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); @@ -118,13 +132,27 @@ void 
HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG pack_lambda(i); } } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); @@ -144,23 +172,49 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +template < typename dispatch_helper > +void HALO_PACKING_FUSED::runSeqVariantWorkGroup(VariantID vid) +{ + switch ( vid ) { case RAJA_Seq : { +#if defined(RUN_RAJA_SEQ) + const Index_type run_reps = getRunReps(); + + HALO_PACKING_FUSED_DATA_SETUP; + using AllocatorHolder = RAJAPoolAllocatorHolder< RAJA::basic_mempool::MemPool>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::seq_work, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -186,35 +240,39 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 
0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -223,18 +281,70 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); +#endif // RUN_RAJA_SEQ break; } -#endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; } } } -} // end namespace apps +void HALO_PACKING_FUSED::runSeqVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_Seq || vid == Lambda_Seq) { + + if (tune_idx == t) { + + runSeqVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_Seq) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runSeqVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_PACKING_FUSED::setSeqTuningDefinitions(VariantID vid) +{ + if (vid == Base_Seq || vid == Lambda_Seq) { + + addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_Seq) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + + } +} + +} // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp new file mode 100644 index 000000000..93d29dfbc --- /dev/null +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -0,0 +1,163 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
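// The manual "lambda fuser" used by the Seq and OpenMP variants stores the
// pack/unpack lambdas by value: lambdas are not default-constructible, so the
// setup macros malloc raw storage and placement-new each lambda into a slot.
// A minimal, self-contained sketch of that idiom (make_lambda is hypothetical;
// the real macros appear earlier in this diff):
#include <cstdio>
#include <cstdlib>
#include <new>
int main() {
  auto make_lambda = [](double* v) { return [=](int i) { v[i] += 1.0; }; };
  using lambda_t = decltype(make_lambda(nullptr));
  const int n = 3;
  lambda_t* slots = static_cast<lambda_t*>(std::malloc(sizeof(lambda_t) * n));
  double data[3] = {0, 0, 0};
  for (int j = 0; j < n; ++j) {
    new (&slots[j]) lambda_t(make_lambda(data));    // placement new into slot j
  }
  for (int j = 0; j < n; ++j) { slots[j](j); }      // run the fused batch
  for (int j = 0; j < n; ++j) { slots[j].~lambda_t(); }
  std::free(slots);
  std::printf("%.0f %.0f %.0f\n", data[0], data[1], data[2]);  // 1 1 1
  return 0;
}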
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_PACKING_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +namespace rajaperf +{ +namespace comm +{ + +HALO_PACKING_FUSED::HALO_PACKING_FUSED(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_PACKING_FUSED, params) +{ + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 ); + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Workgroup); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +HALO_PACKING_FUSED::~HALO_PACKING_FUSED() +{ +} + +void HALO_PACKING_FUSED::setUp(VariantID vid, size_t tune_idx) +{ + int my_mpi_rank = 0; + const int mpi_dims[3] = {1,1,1}; + setUp_base(my_mpi_rank, mpi_dims, vid, tune_idx); + + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void HALO_PACKING_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += 
calcChecksum(DataSpace::Host, m_send_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_send_buffers[l], buffer_len, vid); + } + } +} + +void HALO_PACKING_FUSED::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp similarity index 50% rename from src/apps/HALOEXCHANGE_FUSED.hpp rename to src/comm/HALO_PACKING_FUSED.hpp index b0af7e60e..065c0be3a 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALO_PACKING_FUSED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -7,56 +7,59 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOEXCHANGE_FUSED kernel reference implementation: +/// HALO_PACKING_FUSED kernel reference implementation: /// -/// // pack message for each neighbor +/// // pack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; +/// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_FUSED_PACK_BODY; +/// buffer[i] = var[list[i]]; /// } /// buffer += len; /// } -/// // send message to neighbor /// } /// -/// // unpack messages for each neighbor +/// // unpack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; +/// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_FUSED_UNPACK_BODY; +/// var[list[i]] = buffer[i]; /// } /// buffer += len; /// } /// } /// -#ifndef RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP -#define RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP +#ifndef RAJAPerf_Comm_HALO_PACKING_FUSED_HPP +#define RAJAPerf_Comm_HALO_PACKING_FUSED_HPP -#define HALOEXCHANGE_FUSED_DATA_SETUP \ - std::vector vars = m_vars; \ - std::vector 
<Real_ptr> buffers = m_buffers; \ -\ - Index_type num_neighbors = s_num_neighbors; \ +#define HALO_PACKING_FUSED_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ + \ Index_type num_vars = m_num_vars; \ - std::vector<Int_ptr> pack_index_lists = m_pack_index_lists; \ - std::vector<Index_type> pack_index_list_lengths = m_pack_index_list_lengths; \ - std::vector<Int_ptr> unpack_index_lists = m_unpack_index_lists; \ - std::vector<Index_type> unpack_index_list_lengths = m_unpack_index_list_lengths; - -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ + std::vector<Real_ptr> vars = m_vars; \ + \ + const DataSpace dataSpace = getDataSpace(vid); \ + \ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \ + \ + std::vector<Real_ptr> pack_buffers = m_pack_buffers; \ + std::vector<Real_ptr> unpack_buffers = m_unpack_buffers; \ + \ + std::vector<Real_ptr> send_buffers = m_send_buffers; \ + std::vector<Real_ptr> recv_buffers = m_recv_buffers; + +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ Real_ptr buffer; \ Int_ptr list; \ @@ -67,23 +70,17 @@ ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN \ delete[] pack_ptr_holders; \ delete[] pack_lens; \ delete[] unpack_ptr_holders; \ delete[] unpack_lens; -#define HALOEXCHANGE_FUSED_PACK_BODY \ - buffer[i] = var[list[i]]; -#define HALOEXCHANGE_FUSED_UNPACK_BODY \ - var[list[i]] = buffer[i]; - - -#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ +#define HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_FUSED_PACK_BODY; \ + HALO_PACK_BODY; \ }; \ }; \ using pack_lambda_type = decltype(make_pack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -92,7 +89,7 @@ Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \ auto make_unpack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_FUSED_UNPACK_BODY; \ + HALO_UNPACK_BODY; \ }; \ }; \ using unpack_lambda_type = decltype(make_unpack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -100,14 +97,14 @@ malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ +#define HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ free(pack_lambdas); \ delete[] pack_lens; \ free(unpack_lambdas); \ delete[] unpack_lens; -#include "common/KernelBase.hpp" +#include "HALO_base.hpp" #include "RAJA/RAJA.hpp" @@ -117,16 +114,16 @@ namespace rajaperf { class RunParams; -namespace apps +namespace comm { -class HALOEXCHANGE_FUSED : public KernelBase +class HALO_PACKING_FUSED : public HALO_base { public: - HALOEXCHANGE_FUSED(const RunParams& params); + HALO_PACKING_FUSED(const RunParams& params); - ~HALOEXCHANGE_FUSED(); + ~HALO_PACKING_FUSED(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); @@ -138,58 +135,48 @@ class HALOEXCHANGE_FUSED : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + void runSeqVariantDirect(VariantID vid);
+ void runOpenMPVariantDirect(VariantID vid); + void runOpenMPTargetVariantDirect(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantDirect(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runHipVariantDirect(VariantID vid); + + template < typename dispatch_helper > + void runSeqVariantWorkGroup(VariantID vid); + template < typename dispatch_helper > + void runOpenMPVariantWorkGroup(VariantID vid); + template < typename dispatch_helper > + void runOpenMPTargetVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_helper > + void runCudaVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_helper > + void runHipVariantWorkGroup(VariantID vid); private: static const size_t default_gpu_block_size = 1024; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; - static const int s_num_neighbors = 26; - - Index_type m_grid_dims[3]; - Index_type m_halo_width; Index_type m_num_vars; - - Index_type m_grid_dims_default[3]; - Index_type m_halo_width_default; - Index_type m_num_vars_default; - - Index_type m_grid_plus_halo_dims[3]; Index_type m_var_size; - Index_type m_var_halo_size; std::vector<Real_ptr> m_vars; - std::vector<Real_ptr> m_buffers; - - std::vector<Int_ptr> m_pack_index_lists; - std::vector<Index_type> m_pack_index_list_lengths; - std::vector<Int_ptr> m_unpack_index_lists; - std::vector<Index_type> m_unpack_index_list_lengths; - - void create_pack_lists(std::vector<Int_ptr>& pack_index_lists, - std::vector<Index_type>& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_pack_lists(std::vector<Int_ptr>& pack_index_lists, - const Index_type num_neighbors, - VariantID vid); - void create_unpack_lists(std::vector<Int_ptr>& unpack_index_lists, - std::vector<Index_type>& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_unpack_lists(std::vector<Int_ptr>& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid); + + std::vector<Real_ptr> m_pack_buffers; + std::vector<Real_ptr> m_unpack_buffers; + + std::vector<Real_ptr> m_send_buffers; + std::vector<Real_ptr> m_recv_buffers; }; -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // closing endif for header file include guard diff --git a/src/comm/HALO_SENDRECV-Cuda.cpp b/src/comm/HALO_SENDRECV-Cuda.cpp new file mode 100644 index 000000000..6d8d1bf56 --- /dev/null +++ b/src/comm/HALO_SENDRECV-Cuda.cpp @@ -0,0 +1,63 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_SENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_SENDRECV : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/comm/HALO_SENDRECV-Hip.cpp b/src/comm/HALO_SENDRECV-Hip.cpp new file mode 100644 index 000000000..7db6baf83 --- /dev/null +++ b/src/comm/HALO_SENDRECV-Hip.cpp @@ -0,0 +1,63 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_SENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_SENDRECV : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/comm/HALO_SENDRECV-OMP.cpp b/src/comm/HALO_SENDRECV-OMP.cpp new file mode 100644 index 000000000..347756d81 --- /dev/null +++ b/src/comm/HALO_SENDRECV-OMP.cpp @@ -0,0 +1,74 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_SENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALO_SENDRECV : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_SENDRECV-OMPTarget.cpp b/src/comm/HALO_SENDRECV-OMPTarget.cpp new file mode 100644 index 000000000..42f62289f --- /dev/null +++ b/src/comm/HALO_SENDRECV-OMPTarget.cpp @@ -0,0 +1,68 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void HALO_SENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_SENDRECV : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/comm/HALO_SENDRECV-Seq.cpp b/src/comm/HALO_SENDRECV-Seq.cpp new file mode 100644 index 000000000..ab64c9415 --- /dev/null +++ b/src/comm/HALO_SENDRECV-Seq.cpp @@ -0,0 +1,69 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_SENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALO_SENDRECV : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_SENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp new file mode 100644 index 000000000..0c57b2c3a --- /dev/null +++ b/src/comm/HALO_SENDRECV.cpp @@ -0,0 +1,128 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +namespace rajaperf +{ +namespace comm +{ + +HALO_SENDRECV::HALO_SENDRECV(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_SENDRECV, params) +{ + m_mpi_size = params.getMPISize(); + m_my_mpi_rank = params.getMPIRank(); + m_mpi_dims = params.getMPI3DDivision(); + + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 0 ); + setBytesReadPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // send + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // recv + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(MPI); + + if (params.validMPI3DDivision()) { + setVariantDefined( Base_Seq ); + + setVariantDefined( Base_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + + setVariantDefined( Base_HIP ); + } +} + +HALO_SENDRECV::~HALO_SENDRECV() +{ +} + +void HALO_SENDRECV::setUp(VariantID vid, size_t tune_idx) +{ + setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_send_buffers[l], buffer_len); + } + } + + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_recv_buffers[l], buffer_len); + } + } +} + +void HALO_SENDRECV::updateChecksum(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += calcChecksum(DataSpace::Host, m_recv_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_recv_buffers[l], buffer_len, vid); + } + } +} + +void HALO_SENDRECV::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_recv_buffers[l]); + } + } + m_recv_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_send_buffers[l]); + } + } + m_send_buffers.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_SENDRECV.hpp b/src/comm/HALO_SENDRECV.hpp new file mode 100644 index 000000000..da2a1d1cc --- /dev/null +++ b/src/comm/HALO_SENDRECV.hpp @@ 
-0,0 +1,124 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HALO_SENDRECV kernel reference implementation: +/// +/// // post a recv for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Index_type len = unpack_index_list_lengths[l]; +/// MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); +/// } +/// +/// // pack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = pack_buffers[l]; +/// Int_ptr list = pack_index_lists[l]; +/// Index_type len = pack_index_list_lengths[l]; +/// // pack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// buffer[i] = var[list[i]]; +/// } +/// buffer += len; +/// } +/// // send buffer to neighbor +/// MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); +/// } +/// +/// // unpack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// // receive buffer from neighbor +/// MPI_Wait(&unpack_mpi_requests[l], MPI_STATUS_IGNORE); +/// Real_ptr buffer = unpack_buffers[l]; +/// Int_ptr list = unpack_index_lists[l]; +/// Index_type len = unpack_index_list_lengths[l]; +/// // unpack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// var[list[i]] = buffer[i]; +/// } +/// buffer += len; +/// } +/// } +/// +/// // wait for all sends to complete +/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); +/// + + +#ifndef RAJAPerf_Comm_HALO_SENDRECV_HPP +#define RAJAPerf_Comm_HALO_SENDRECV_HPP + +#define HALO_SENDRECV_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ + \ + Index_type num_vars = m_num_vars; \ + \ + std::vector mpi_ranks = m_mpi_ranks; \ + \ + std::vector pack_mpi_requests(num_neighbors); \ + std::vector unpack_mpi_requests(num_neighbors); \ + \ + std::vector send_buffers = m_send_buffers; \ + std::vector recv_buffers = m_recv_buffers; + + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include +#include + +namespace rajaperf +{ +namespace comm +{ + +class HALO_SENDRECV : public HALO_base +{ +public: + + HALO_SENDRECV(const RunParams& params); + + ~HALO_SENDRECV(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + +private: + int m_mpi_size = -1; + int m_my_mpi_rank = -1; + std::array m_mpi_dims = {-1, -1, -1}; + + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_send_buffers; + std::vector m_recv_buffers; +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif +#endif // closing endif for header file 
include guard diff --git a/src/comm/HALO_base.cpp b/src/comm/HALO_base.cpp new file mode 100644 index 000000000..84845114c --- /dev/null +++ b/src/comm/HALO_base.cpp @@ -0,0 +1,311 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include +#include + +namespace rajaperf +{ +namespace comm +{ + +Index_type HALO_base::s_grid_dims_default[3] {100, 100, 100}; + +HALO_base::HALO_base(KernelID kid, const RunParams& params) + : KernelBase(kid, params) +{ + setDefaultProblemSize( s_grid_dims_default[0] * + s_grid_dims_default[1] * + s_grid_dims_default[2] ); + + double cbrt_run_size = std::cbrt(getTargetProblemSize()) + std::cbrt(3)-1; + + m_grid_dims[0] = cbrt_run_size; + m_grid_dims[1] = cbrt_run_size; + m_grid_dims[2] = cbrt_run_size; + m_halo_width = params.getHaloWidth(); + + m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; + m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; + m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; + m_grid_plus_halo_size = m_grid_plus_halo_dims[0] * + m_grid_plus_halo_dims[1] * + m_grid_plus_halo_dims[2] ; + + setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); +} + +HALO_base::~HALO_base() +{ +} + +void HALO_base::setUp_base(const int my_mpi_rank, const int* mpi_dims, + VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + m_mpi_ranks.resize(s_num_neighbors, -1); + m_send_tags.resize(s_num_neighbors, -1); + m_pack_index_lists.resize(s_num_neighbors, nullptr); + m_pack_index_list_lengths.resize(s_num_neighbors, 0); + m_recv_tags.resize(s_num_neighbors, -1); + m_unpack_index_lists.resize(s_num_neighbors, nullptr); + m_unpack_index_list_lengths.resize(s_num_neighbors, 0); + create_lists(my_mpi_rank, mpi_dims, m_mpi_ranks, + m_send_tags, m_pack_index_lists, m_pack_index_list_lengths, + m_recv_tags, m_unpack_index_lists, m_unpack_index_list_lengths, + m_halo_width, m_grid_dims, + s_num_neighbors, vid); +} + +void HALO_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + destroy_lists(m_pack_index_lists, m_unpack_index_lists, s_num_neighbors, vid); + m_unpack_index_list_lengths.clear(); + m_unpack_index_lists.clear(); + m_recv_tags.clear(); + m_pack_index_list_lengths.clear(); + m_pack_index_lists.clear(); + m_send_tags.clear(); + m_mpi_ranks.clear(); +} + + +const int HALO_base::s_boundary_offsets[HALO_base::s_num_neighbors][3]{ + + // faces + {-1, 0, 0}, + { 1, 0, 0}, + { 0, -1, 0}, + { 0, 1, 0}, + { 0, 0, -1}, + { 0, 0, 1}, + + // edges + {-1, -1, 0}, + {-1, 1, 0}, + { 1, -1, 0}, + { 1, 1, 0}, + {-1, 0, -1}, + {-1, 0, 1}, + { 1, 0, -1}, + { 1, 0, 1}, + { 0, -1, -1}, + { 0, -1, 1}, + { 0, 1, -1}, + { 0, 1, 1}, + + // corners + {-1, -1, -1}, + {-1, -1, 1}, + {-1, 1, -1}, + {-1, 1, 1}, + { 1, -1, -1}, + { 1, -1, 1}, + { 1, 1, -1}, + { 1, 1, 1} + +}; + +HALO_base::Extent HALO_base::make_boundary_extent( + const HALO_base::message_type msg_type, + const int (&boundary_offset)[3], + const Index_type halo_width, const Index_type* grid_dims) +{ + if (msg_type != message_type::send && + msg_type != message_type::recv) { + throw std::runtime_error("make_boundary_extent: Invalid message type"); + } + 
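+  // Per-dimension bounds follow the neighbor offset: offset 0 spans the
+  // interior [halo_width, halo_width + dim_size); for offsets -1/+1 a send
+  // extent covers the outermost halo_width layer of owned cells, while a
+  // recv extent covers the ghost layer just outside it. For example, with
+  // halo_width = 2 and offset -1: send = [2, 4), recv = [0, 2).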
auto get_bounds = [&](int offset, Index_type dim_size) { + std::pair bounds; + switch (offset) { + case -1: + if (msg_type == message_type::send) { + bounds.first = halo_width; + bounds.second = halo_width + halo_width; + } else { // (msg_type == message_type::recv) + bounds.first = 0; + bounds.second = halo_width; + } + break; + case 0: + bounds.first = halo_width; + bounds.second = halo_width + dim_size; + break; + case 1: + if (msg_type == message_type::send) { + bounds.first = halo_width + dim_size - halo_width; + bounds.second = halo_width + dim_size; + } else { // (msg_type == message_type::recv) + bounds.first = halo_width + dim_size; + bounds.second = halo_width + dim_size + halo_width; + } + break; + default: + throw std::runtime_error("make_extent: Invalid location"); + } + return bounds; + }; + auto x_bounds = get_bounds(boundary_offset[0], grid_dims[0]); + auto y_bounds = get_bounds(boundary_offset[1], grid_dims[1]); + auto z_bounds = get_bounds(boundary_offset[2], grid_dims[2]); + return {x_bounds.first, x_bounds.second, + y_bounds.first, y_bounds.second, + z_bounds.first, z_bounds.second}; +} + + +// +// Function to generate mpi decomposition and index lists for packing and unpacking. +// +void HALO_base::create_lists( + int my_mpi_rank, + const int* mpi_dims, + std::vector& mpi_ranks, + std::vector& send_tags, + std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + std::vector& recv_tags, + std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid) +{ + int my_mpi_idx[3]{-1,-1,-1}; + my_mpi_idx[2] = my_mpi_rank / (mpi_dims[0]*mpi_dims[1]); + my_mpi_idx[1] = (my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1])) / mpi_dims[0]; + my_mpi_idx[0] = my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1]) - my_mpi_idx[1]*mpi_dims[0]; + + auto get_boundary_idx = [&](const int (&boundary_offset)[3]) { + return (boundary_offset[0]+1) + 3*(boundary_offset[1]+1) + 9*(boundary_offset[2]+1); + }; + + std::map boundary_idx_to_tag; + for (Index_type l = 0; l < num_neighbors; ++l) { + boundary_idx_to_tag[get_boundary_idx(s_boundary_offsets[l])] = l; + } + + const Index_type grid_i_stride = 1; + const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; + const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + + for (Index_type l = 0; l < num_neighbors; ++l) { + + const int (&boundary_offset)[3] = s_boundary_offsets[l]; + + int neighbor_boundary_offset[3]{-1, -1, -1}; + for (int dim = 0; dim < 3; ++dim) { + neighbor_boundary_offset[dim] = -boundary_offset[dim]; + } + + int neighbor_mpi_idx[3] = {my_mpi_idx[0]+boundary_offset[0], + my_mpi_idx[1]+boundary_offset[1], + my_mpi_idx[2]+boundary_offset[2]}; + + // fix neighbor mpi index on periodic boundaries + for (int dim = 0; dim < 3; ++dim) { + if (neighbor_mpi_idx[dim] >= mpi_dims[dim]) { + neighbor_mpi_idx[dim] = 0; + } else if (neighbor_mpi_idx[dim] < 0) { + neighbor_mpi_idx[dim] = mpi_dims[dim]-1; + } + } + + mpi_ranks[l] = neighbor_mpi_idx[0] + mpi_dims[0]*(neighbor_mpi_idx[1] + mpi_dims[1]*neighbor_mpi_idx[2]); + + { + // pack and send + send_tags[l] = boundary_idx_to_tag[get_boundary_idx(boundary_offset)]; + Extent extent = make_boundary_extent(message_type::send, + boundary_offset, + halo_width, grid_dims); + + pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + 
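+      // The list length is the volume of the boundary extent: with halo
+      // width w on a g x g x g grid, a face neighbor packs w*g*g indices,
+      // an edge neighbor w*w*g, and a corner neighbor w*w*w.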
allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); + + Int_ptr pack_list = pack_index_lists[l]; + + Index_type list_idx = 0; + for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { + for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { + for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { + + Index_type pack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + pack_list[list_idx] = pack_idx; + + list_idx += 1; + } + } + } + } + + { + // receive and unpack + recv_tags[l] = boundary_idx_to_tag[get_boundary_idx(neighbor_boundary_offset)]; + Extent extent = make_boundary_extent(message_type::recv, + boundary_offset, + halo_width, grid_dims); + + unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + + Int_ptr unpack_list = unpack_index_lists[l]; + + Index_type list_idx = 0; + for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { + for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { + for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { + + Index_type unpack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + unpack_list[list_idx] = unpack_idx; + + list_idx += 1; + } + } + } + } + } +} + +// +// Function to destroy packing and unpacking index lists. +// +void HALO_base::destroy_lists( + std::vector& pack_index_lists, + std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid) +{ + for (Index_type l = 0; l < num_neighbors; ++l) { + deallocData(pack_index_lists[l], vid); + } + for (Index_type l = 0; l < num_neighbors; ++l) { + deallocData(unpack_index_lists[l], vid); + } +} + +} // end namespace comm +} // end namespace rajaperf diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp new file mode 100644 index 000000000..fea021a87 --- /dev/null +++ b/src/comm/HALO_base.hpp @@ -0,0 +1,176 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HALO_base provides a common starting point for the other HALO_ classes. +/// + +#ifndef RAJAPerf_Comm_HALO_BASE_HPP +#define RAJAPerf_Comm_HALO_BASE_HPP + +#define HALO_BASE_DATA_SETUP \ + Index_type num_neighbors = s_num_neighbors; \ + std::vector send_tags = m_send_tags; \ + std::vector pack_index_lists = m_pack_index_lists; \ + std::vector pack_index_list_lengths = m_pack_index_list_lengths; \ + std::vector recv_tags = m_recv_tags; \ + std::vector unpack_index_lists = m_unpack_index_lists; \ + std::vector unpack_index_list_lengths = m_unpack_index_list_lengths; + +#define HALO_PACK_BODY \ + buffer[i] = var[list[i]]; + +#define HALO_UNPACK_BODY \ + var[list[i]] = buffer[i]; + + +#include "common/KernelBase.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +class RunParams; + +struct direct_dispatch_helper +{ + template < typename... 
Ts > + using dispatch_policy = RAJA::direct_dispatch; + static std::string get_name() { return "direct"; } +}; + +struct indirect_function_call_dispatch_helper +{ + template < typename... Ts > + using dispatch_policy = RAJA::indirect_function_call_dispatch; + static std::string get_name() { return "funcptr"; } +}; + +struct indirect_virtual_function_dispatch_helper +{ + template < typename... Ts > + using dispatch_policy = RAJA::indirect_virtual_function_dispatch; + static std::string get_name() { return "virtfunc"; } +}; + +using workgroup_dispatch_helpers = camp::list< + direct_dispatch_helper, + indirect_function_call_dispatch_helper, + indirect_virtual_function_dispatch_helper >; + +using hip_workgroup_dispatch_helpers = camp::list< + direct_dispatch_helper +#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL + ,indirect_function_call_dispatch_helper + ,indirect_virtual_function_dispatch_helper +#endif + >; + +namespace comm +{ + +class HALO_base : public KernelBase +{ +public: + + HALO_base(KernelID kid, const RunParams& params); + + ~HALO_base(); + + void setUp_base(const int my_mpi_rank, const int* mpi_dims, + VariantID vid, size_t tune_idx); + void tearDown_base(VariantID vid, size_t tune_idx); + + struct Packer { + Real_ptr buffer; + Real_ptr var; + Int_ptr list; + RAJA_HOST_DEVICE void operator()(Index_type i) const { + HALO_PACK_BODY; + } + }; + + struct UnPacker { + Real_ptr buffer; + Real_ptr var; + Int_ptr list; + RAJA_HOST_DEVICE void operator()(Index_type i) const { + HALO_UNPACK_BODY; + } + }; + +protected: + enum struct message_type : int + { + send, + recv + }; + + struct Extent + { + Index_type i_min; + Index_type i_max; + Index_type j_min; + Index_type j_max; + Index_type k_min; + Index_type k_max; + }; + + static const int s_num_neighbors = 26; + static const int s_boundary_offsets[s_num_neighbors][3]; + + static Index_type s_grid_dims_default[3]; + + Index_type m_grid_dims[3]; + Index_type m_halo_width; + + Index_type m_grid_plus_halo_dims[3]; + Index_type m_grid_plus_halo_size; + + std::vector m_mpi_ranks; + + std::vector m_send_tags; + std::vector m_pack_index_lists; + std::vector m_pack_index_list_lengths; + + std::vector m_recv_tags; + std::vector m_unpack_index_lists; + std::vector m_unpack_index_list_lengths; + + Extent make_boundary_extent( + const message_type msg_type, + const int (&boundary_offset)[3], + const Index_type halo_width, const Index_type* grid_dims); + + void create_lists( + int my_mpi_rank, + const int* mpi_dims, + std::vector& mpi_ranks, + std::vector& send_tags, + std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + std::vector& recv_tags, + std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid); + + void destroy_lists( + std::vector& pack_index_lists, + std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid); +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 9dff522bd..f14076398 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. 
# See the RAJAPerf/LICENSE file for details. # diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 0c3504d69..dea54acf2 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -42,6 +42,33 @@ __device__ __forceinline__ unsigned long long device_timer() return global_timer; } +/*! + * \brief Method for launching a CUDA kernel with given configuration. + * + * Note: method checks whether number of args and their types in + * kernel signature matches args passed to this method. + */ +template +void RPlaunchCudaKernel(void (*kernel)(KernArgs...), + const dim3& numBlocks, const dim3& dimBlocks, + std::uint32_t sharedMemBytes, cudaStream_t stream, + Args const&... args) +{ + static_assert(sizeof...(KernArgs) == sizeof...(Args), + "Number of kernel args doesn't match what's passed to method"); + + static_assert(conjunction, std::decay_t>...>::value, + "Kernel arg types don't match what's passed to method"); + + constexpr size_t count = sizeof...(Args); + void* arg_arr[count]{(void*)&args...}; + + auto k = reinterpret_cast(kernel); + cudaErrchk( cudaLaunchKernel(k, numBlocks, dimBlocks, + arg_arr, + sharedMemBytes, stream) ); +} + /*! * \brief Simple forall cuda kernel that runs a lambda. */ @@ -84,19 +111,56 @@ __global__ void lambda_cuda(Lambda body) namespace detail { +/*! + * \brief Get current cuda device. + */ +inline int getCudaDevice() +{ + int device = -1; + cudaErrchk( cudaGetDevice( &device ) ); + return device; +} + +/*! + * \brief Get properties of the current cuda device. + */ +inline cudaDeviceProp getCudaDeviceProp() +{ + cudaDeviceProp prop; + cudaErrchk(cudaGetDeviceProperties(&prop, getCudaDevice())); + return prop; +} + +/*! + * \brief Get max occupancy in blocks for the given kernel for the current + * cuda device. + */ +template < typename Func > +RAJA_INLINE +int getCudaOccupancyMaxBlocks(Func&& func, int num_threads, size_t shmem_size) +{ + int max_blocks = -1; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks, func, num_threads, shmem_size)); + + size_t multiProcessorCount = getCudaDeviceProp().multiProcessorCount; + + return max_blocks * multiProcessorCount; +} + /* * Copy memory len bytes from src to dst. */ -inline void copyCudaData(void* dst_ptr, const void* src_ptr, size_t len) +inline void copyCudaData(void* dst_ptr, const void* src_ptr, Size_type len) { cudaErrchk( cudaMemcpy( dst_ptr, src_ptr, len, cudaMemcpyDefault ) ); } /*! - * \brief Allocate CUDA device data array (dptr). + * \brief Allocate CUDA device data array. */ -inline void* allocCudaDeviceData(size_t len) +inline void* allocCudaDeviceData(Size_type len) { void* dptr = nullptr; cudaErrchk( cudaMalloc( &dptr, len ) ); @@ -104,19 +168,65 @@ inline void* allocCudaDeviceData(size_t len) } /*! - * \brief Allocate CUDA managed data array (dptr). + * \brief Allocate CUDA managed data array. + */ +inline void* allocCudaManagedData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA managed host preferred data array. 
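+ *
+ * Note: per standard CUDA managed-memory semantics, the
+ * cudaMemAdviseSetPreferredLocation / cudaCpuDeviceId advice below keeps
+ * pages resident on the host while they remain accessible from the device.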
+ */ +inline void* allocCudaManagedHostPreferredData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA managed device preferred data array. */ -inline void* allocCudaManagedData(size_t len) +inline void* allocCudaManagedDevicePreferredData(Size_type len) { void* mptr = nullptr; cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, getCudaDevice() ) ); return mptr; } /*! - * \brief Allocate CUDA pinned data array (pptr). + * \brief Allocate CUDA managed host preferred host accessed data array. */ -inline void* allocCudaPinnedData(size_t len) +inline void* allocCudaManagedHostPreferredDeviceAccessedData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetAccessedBy, getCudaDevice() ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA managed device preferred host accessed data array. + */ +inline void* allocCudaManagedDevicePreferredHostAccessedData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, getCudaDevice() ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA pinned data array. + */ +inline void* allocCudaPinnedData(Size_type len) { void* pptr = nullptr; cudaErrchk( cudaHostAlloc( &pptr, len, cudaHostAllocMapped ) ); @@ -125,7 +235,7 @@ inline void* allocCudaPinnedData(size_t len) /*! - * \brief Free device data array. + * \brief Free CUDA device data array. */ inline void deallocCudaDeviceData(void* dptr) { @@ -133,7 +243,7 @@ inline void deallocCudaDeviceData(void* dptr) } /*! - * \brief Free managed data array. + * \brief Free CUDA managed data array. */ inline void deallocCudaManagedData(void* mptr) { @@ -141,7 +251,39 @@ inline void deallocCudaManagedData(void* mptr) } /*! - * \brief Free pinned data array. + * \brief Free CUDA managed host preferred data array. + */ +inline void deallocCudaManagedHostPreferredData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed device preferred data array. + */ +inline void deallocCudaManagedDevicePreferredData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed host preferred host accessed data array. + */ +inline void deallocCudaManagedHostPreferredDeviceAccessedData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed device preferred host accessed data array. + */ +inline void deallocCudaManagedDevicePreferredHostAccessedData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA pinned data array. 
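+ * (Pinned memory obtained with cudaHostAlloc must be released with
+ * cudaFreeHost.)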
 */
 inline void deallocCudaPinnedData(void* pptr)
 {
diff --git a/src/common/CudaGridScan.hpp b/src/common/CudaGridScan.hpp
new file mode 100644
index 000000000..f2c8f2cd1
--- /dev/null
+++ b/src/common/CudaGridScan.hpp
@@ -0,0 +1,245 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#if defined(RAJA_ENABLE_CUDA)
+
+#include <cub/block/block_scan.cuh>
+#include <cub/block/block_exchange.cuh>
+#include <cub/warp/warp_reduce.cuh>
+#include <type_traits>
+
+namespace rajaperf
+{
+namespace detail
+{
+namespace cuda
+{
+
+//
+// Define magic numbers for CUDA execution
+//
+const size_t warp_size = 32;
+const size_t max_static_shmem = 49152;
+
+
+// Perform a grid scan on val and return the results at each thread
+// in exclusive and inclusive; note that val is used as scratch space.
+template < typename DataType, size_t block_size, size_t items_per_thread >
+struct GridScan
+{
+  using BlockScan = cub::BlockScan<DataType, block_size>; //, cub::BLOCK_SCAN_WARP_SCANS>;
+  using BlockExchange = cub::BlockExchange<DataType, block_size, items_per_thread>;
+  using WarpReduce = cub::WarpReduce<DataType>;
+
+  union SharedStorage {
+    typename BlockScan::TempStorage block_scan_storage;
+    typename BlockExchange::TempStorage block_exchange_storage;
+    typename WarpReduce::TempStorage warp_reduce_storage;
+    volatile DataType prev_grid_count;
+  };
+
+  static constexpr size_t shmem_size = sizeof(SharedStorage);
+
+  __device__
+  static void grid_scan(const int block_id,
+                        DataType (&val)[items_per_thread],
+                        DataType (&exclusive)[items_per_thread],
+                        DataType (&inclusive)[items_per_thread],
+                        DataType* block_counts,
+                        DataType* grid_counts,
+                        unsigned* block_readys)
+  {
+    const bool first_block = (block_id == 0);
+    const bool last_block = (block_id == gridDim.x-1);
+    const bool last_thread = (threadIdx.x == block_size-1);
+    const bool last_warp = (threadIdx.x >= block_size - warp_size);
+    const int warp_index = (threadIdx.x % warp_size);
+    const unsigned warp_index_mask = (1u << warp_index);
+    const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u);
+
+    __shared__ SharedStorage s_temp_storage;
+
+
+    BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val);
+    __syncthreads();
+
+
+    BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive);
+    __syncthreads();
+
+    for (size_t ti = 0; ti < items_per_thread; ++ti) {
+      inclusive[ti] = exclusive[ti] + val[ti];
+    }
+
+    BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive);
+    __syncthreads();
+    BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive);
+    __syncthreads();
+    if (first_block) {
+
+      if (!last_block && last_thread) {
+        block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block
+        grid_counts[block_id] = inclusive[items_per_thread-1];  // write inclusive scan result for grid through block
+        __threadfence();                         // ensure block_counts, grid_counts ready (release)
+        atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready
+      }
+
+    } else {
+
+      if (!last_block && last_thread) {
+        block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block
+        __threadfence();                         // ensure block_counts ready (release)
+        atomicExch(&block_readys[block_id], 1u); // write block_counts is ready
+      }
+
+      // get prev_grid_count using last warp in block
+      if (last_warp) {
+
+        DataType prev_grid_count = 0;
+
+        // accumulate previous block counts into registers of warp
+
+        int prev_block_base_id = block_id - warp_size;
+
+        unsigned prev_block_ready = 0u;
+        unsigned prev_blocks_ready_ballot = 0u;
+        unsigned prev_grids_ready_ballot = 0u;
+
+        // accumulate full warp worths of block counts;
+        // stop if we run out of full warps or a grid count is ready
+        while (prev_block_base_id >= 0) {
+
+          const int prev_block_id = prev_block_base_id + warp_index;
+
+          // ensure previous block_counts are ready
+          do {
+            prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u);
+
+            prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u);
+
+          } while (prev_blocks_ready_ballot != 0xffffffffu);
+
+          prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u);
+
+          if (prev_grids_ready_ballot != 0u) {
+            break;
+          }
+
+          __threadfence(); // ensure block_counts or grid_counts ready (acquire)
+
+          // accumulate block_counts for prev_block_id
+          prev_grid_count += block_counts[prev_block_id];
+
+          prev_block_ready = 0u;
+
+          prev_block_base_id -= warp_size;
+        }
+
+        const int prev_block_id = prev_block_base_id + warp_index;
+
+        // ensure previous block_counts are ready;
+        // this checks that block_counts are ready for all blocks above
+        // the highest grid count that is ready
+        while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) {
+
+          if (prev_block_id >= 0) {
+            prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u);
+          }
+
+          prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u);
+          prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u);
+        }
+        __threadfence(); // ensure block_counts or grid_counts ready (acquire)
+
+        // read one grid_count from a block with id grid_count_ready_id
+        // and read the block_counts from blocks with higher ids.
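+        // prev_grids_ready_ballot has one bit per lane; a set bit means that
+        // lane's block published a full grid count. Lanes strictly above the
+        // highest set bit add their block's own count, the lane at the
+        // highest set bit adds that block's grid count (the whole prefix
+        // through it), and lower lanes add nothing, so the warp reduction
+        // below yields this block's scan prefix.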
+ if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } + + + prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); + prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_temp_storage.prev_grid_count = prev_grid_count; + } + } + + __syncthreads(); + DataType prev_grid_count = s_temp_storage.prev_grid_count; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } + } + } + +}; + + +namespace detail +{ + +template < typename T, size_t block_size, size_t max_items_per_thread > +struct grid_scan_max_items_per_thread + : std::conditional_t< (GridScan::shmem_size <= max_static_shmem), + grid_scan_max_items_per_thread, + std::integral_constant > +{ +}; + +} + +template < typename T, size_t block_size > +struct grid_scan_max_items_per_thread + : detail::grid_scan_max_items_per_thread +{ +}; + + +// tune grid scan to maximize throughput while minimizing items_per_thread + +// default tuning for unknown DataType or cuda_arch +template < typename DataType, size_t block_size, size_t cuda_arch, typename enable = void > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + grid_scan_max_items_per_thread::value / 2; +}; + +// tuning for sm_70 +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 700, std::enable_if_t > +{ + static constexpr size_t value = + (block_size <= 64) ? 13 : + (block_size <= 128) ? 9 : + (block_size <= 256) ? 6 : + (block_size <= 512) ? 5 : + (block_size <= 1024) ? 5 : 1; +}; + +} // end namespace cuda +} // end namespace detail +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index f1831cc1f..607f5aa00 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -10,7 +10,9 @@ #include "CudaDataUtils.hpp" #include "HipDataUtils.hpp" #include "OpenMPTargetDataUtils.hpp" +#include "SyclDataUtils.hpp" +#include "KernelBase.hpp" #include "RAJA/internal/MemUtils_CPU.hpp" @@ -72,6 +74,10 @@ bool isCudaDataSpace(DataSpace dataSpace) switch (dataSpace) { case DataSpace::CudaPinned: case DataSpace::CudaManaged: + case DataSpace::CudaManagedHostPreferred: + case DataSpace::CudaManagedDevicePreferred: + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + case DataSpace::CudaManagedDevicePreferredHostAccessed: case DataSpace::CudaDevice: return true; default: @@ -101,6 +107,21 @@ bool isHipDataSpace(DataSpace dataSpace) } } +/*! + * \brief Get if the data space is a sycl DataSpace. + */ +bool isSyclDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::SyclPinned: + case DataSpace::SyclManaged: + case DataSpace::SyclDevice: + return true; + default: + return false; + } +} + static int data_init_count = 0; @@ -123,7 +144,7 @@ void incDataInitCount() /* * Copy memory len bytes from src to dst. */ -void copyHostData(void* dst_ptr, const void* src_ptr, size_t len) +void copyHostData(void* dst_ptr, const void* src_ptr, Size_type len) { std::memcpy(dst_ptr, src_ptr, len); } @@ -132,7 +153,7 @@ void copyHostData(void* dst_ptr, const void* src_ptr, size_t len) /* * Allocate data arrays of given type. */ -void* allocHostData(size_t len, size_t align) +void* allocHostData(Size_type len, Size_type align) { return RAJA::allocate_aligned_type( align, len); @@ -153,7 +174,7 @@ void deallocHostData(void* ptr) /* * Allocate data arrays of given dataSpace. */ -void* allocData(DataSpace dataSpace, int nbytes, int align) +void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align) { void* ptr = nullptr; @@ -186,6 +207,22 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) { ptr = detail::allocCudaManagedData(nbytes); } break; + case DataSpace::CudaManagedHostPreferred: + { + ptr = detail::allocCudaManagedHostPreferredData(nbytes); + } break; + case DataSpace::CudaManagedDevicePreferred: + { + ptr = detail::allocCudaManagedDevicePreferredData(nbytes); + } break; + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + { + ptr = detail::allocCudaManagedHostPreferredDeviceAccessedData(nbytes); + } break; + case DataSpace::CudaManagedDevicePreferredHostAccessed: + { + ptr = detail::allocCudaManagedDevicePreferredHostAccessedData(nbytes); + } break; case DataSpace::CudaDevice: { ptr = detail::allocCudaDeviceData(nbytes); @@ -243,6 +280,25 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) } break; #endif +#if defined(RAJA_ENABLE_SYCL) + case DataSpace::SyclPinned: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclPinnedData(nbytes, qu); + } break; + case DataSpace::SyclManaged: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclManagedData(nbytes, qu); + } break; + case DataSpace::SyclDevice: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclDeviceData(nbytes, qu); + } break; +#endif + + default: { throw std::invalid_argument("allocData : Unknown data space"); @@ -257,10 +313,10 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) */ void copyData(DataSpace dst_dataSpace, void* dst_ptr, DataSpace src_dataSpace, const void* src_ptr, - size_t nbytes) + Size_type nbytes) { - if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && - hostAccessibleDataSpace(src_dataSpace) == 
src_dataSpace) { + if (hostCopyDataSpace(dst_dataSpace) == dst_dataSpace && + hostCopyDataSpace(src_dataSpace) == src_dataSpace) { detail::copyHostData(dst_ptr, src_ptr, nbytes); } @@ -290,6 +346,14 @@ void copyData(DataSpace dst_dataSpace, void* dst_ptr, } #endif +#if defined(RAJA_ENABLE_SYCL) + else if (isSyclDataSpace(dst_dataSpace) || + isSyclDataSpace(src_dataSpace)) { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::copySyclData(dst_ptr, src_ptr, nbytes, qu); + } +#endif + else { throw std::invalid_argument("copyData : Unknown data space"); } @@ -329,6 +393,22 @@ void deallocData(DataSpace dataSpace, void* ptr) { detail::deallocCudaManagedData(ptr); } break; + case DataSpace::CudaManagedHostPreferred: + { + detail::deallocCudaManagedHostPreferredData(ptr); + } break; + case DataSpace::CudaManagedDevicePreferred: + { + detail::deallocCudaManagedDevicePreferredData(ptr); + } break; + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + { + detail::deallocCudaManagedHostPreferredDeviceAccessedData(ptr); + } break; + case DataSpace::CudaManagedDevicePreferredHostAccessed: + { + detail::deallocCudaManagedDevicePreferredHostAccessedData(ptr); + } break; case DataSpace::CudaDevice: { detail::deallocCudaDeviceData(ptr); @@ -357,6 +437,26 @@ void deallocData(DataSpace dataSpace, void* ptr) } break; #endif +#if defined(RAJA_ENABLE_SYCL) + case DataSpace::SyclPinned: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclPinnedData(ptr, qu); + } break; + case DataSpace::SyclManaged: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclManagedData(ptr, qu); + } break; + case DataSpace::SyclDevice: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclDeviceData(ptr, qu); + } break; +#endif + + + default: { throw std::invalid_argument("deallocData : Unknown data space"); @@ -369,23 +469,23 @@ void deallocData(DataSpace dataSpace, void* ptr) * \brief Initialize Int_type data array to * randomly signed positive and negative values. */ -void initData(Int_ptr& ptr, int len) +void initData(Int_ptr& ptr, Size_type len) { srand(4793); Real_type signfact = 0.0; - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { signfact = Real_type(rand())/RAND_MAX; ptr[i] = ( signfact < 0.5 ? -1 : 1 ); }; signfact = Real_type(rand())/RAND_MAX; - Int_type ilo = len * signfact; + Size_type ilo = len * signfact; ptr[ilo] = -58; signfact = Real_type(rand())/RAND_MAX; - Int_type ihi = len * signfact; + Size_type ihi = len * signfact; ptr[ihi] = 19; incDataInitCount(); @@ -396,11 +496,11 @@ void initData(Int_ptr& ptr, int len) * positive values (0.0, 1.0) based on their array position * (index) and the order in which this method is called. */ -void initData(Real_ptr& ptr, int len) +void initData(Real_ptr& ptr, Size_type len) { Real_type factor = ( data_init_count % 2 ? 0.1 : 0.2 ); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); } @@ -410,9 +510,9 @@ void initData(Real_ptr& ptr, int len) /* * Initialize Real_type data array to constant values. */ -void initDataConst(Real_ptr& ptr, int len, Real_type val) +void initDataConst(Real_ptr& ptr, Size_type len, Real_type val) { - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = val; }; @@ -422,9 +522,9 @@ void initDataConst(Real_ptr& ptr, int len, Real_type val) /* * Initialize Index_type data array to constant values. 
*/ -void initDataConst(Index_type*& ptr, int len, Index_type val) +void initDataConst(Index_type*& ptr, Size_type len, Index_type val) { - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = val; }; @@ -434,13 +534,13 @@ void initDataConst(Index_type*& ptr, int len, Index_type val) /* * Initialize Real_type data array with random sign. */ -void initDataRandSign(Real_ptr& ptr, int len) +void initDataRandSign(Real_ptr& ptr, Size_type len) { Real_type factor = ( data_init_count % 2 ? 0.1 : 0.2 ); srand(4793); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { Real_type signfact = Real_type(rand())/RAND_MAX; signfact = ( signfact < 0.5 ? -1.0 : 1.0 ); ptr[i] = signfact*factor*(i + 1.1)/(i + 1.12345); @@ -452,11 +552,11 @@ void initDataRandSign(Real_ptr& ptr, int len) /* * Initialize Real_type data array with random values. */ -void initDataRandValue(Real_ptr& ptr, int len) +void initDataRandValue(Real_ptr& ptr, Size_type len) { srand(4793); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = Real_type(rand())/RAND_MAX; }; @@ -466,12 +566,12 @@ void initDataRandValue(Real_ptr& ptr, int len) /* * Initialize Complex_type data array. */ -void initData(Complex_ptr& ptr, int len) +void initData(Complex_ptr& ptr, Size_type len) { Complex_type factor = ( data_init_count % 2 ? Complex_type(0.1,0.2) : Complex_type(0.2,0.3) ); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); } @@ -492,13 +592,14 @@ void initData(Real_type& d) /* * Calculate and return checksum for data arrays. */ -long double calcChecksum(Int_ptr ptr, int len, - Real_type scale_factor) +template < typename Data_getter > +long double calcChecksumImpl(Data_getter data, Size_type len, + Real_type scale_factor) { long double tchk = 0.0; long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; + for (Size_type j = 0; j < len; ++j) { + long double x = (std::abs(std::sin(j+1.0))+0.5) * data(j); long double y = x - ckahan; volatile long double t = tchk + y; volatile long double z = t - tchk; @@ -514,84 +615,138 @@ long double calcChecksum(Int_ptr ptr, int len, return tchk; } -long double calcChecksum(Real_ptr ptr, int len, +long double calcChecksum(Int_ptr ptr, Size_type len, Real_type scale_factor) { - long double tchk = 0.0; - long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; - long double y = x - ckahan; - volatile long double t = tchk + y; - volatile long double z = t - tchk; - ckahan = z - y; - tchk = t; -#if 0 // RDH DEBUG - if ( (j % 100) == 0 ) { - getCout() << "j : tchk = " << j << " : " << tchk << std::endl; - } -#endif - } - tchk *= scale_factor; - return tchk; + return calcChecksumImpl([=](Size_type j) { + return static_cast(ptr[j]); + }, len, scale_factor); } -long double calcChecksum(Complex_ptr ptr, int len, +long double calcChecksum(unsigned long long* ptr, Size_type len, Real_type scale_factor) { - long double tchk = 0.0; - long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * (real(ptr[j])+imag(ptr[j])); - long double y = x - ckahan; - volatile long double t = tchk + y; - volatile long double z = t - tchk; - ckahan = z - y; - tchk = t; -#if 0 // RDH DEBUG - if ( (j % 100) == 0 ) { - getCout() << "j : tchk = " << j << " : " << tchk << std::endl; - } -#endif - } - tchk *= 
scale_factor;
-  return tchk;
+  return calcChecksumImpl([=](Size_type j) {
+    return static_cast<long double>(ptr[j]);
+  }, len, scale_factor);
+}
+
+long double calcChecksum(Real_ptr ptr, Size_type len,
+                         Real_type scale_factor)
+{
+  return calcChecksumImpl([=](Size_type j) {
+    return static_cast<long double>(ptr[j]);
+  }, len, scale_factor);
+}
+
+long double calcChecksum(Complex_ptr ptr, Size_type len,
+                         Real_type scale_factor)
+{
+  return calcChecksumImpl([=](Size_type j) {
+    return static_cast<long double>(real(ptr[j])+imag(ptr[j]));
+  }, len, scale_factor);
 }
 
 } // closing brace for detail namespace
 
 /*!
- * \brief Get an host accessible data space for this dataSpace.
+ * \brief Get a host data space to use when making a host copy of data in the given
+ *        dataSpace.
+ *
+ * The returned host data space should reside in memory attached to the host.
+ *
+ * The intention is to get a data space with high performance on the host.
+ * Return the given data space if it's already performant and fall back on a
+ * host data space that performs well in explicit copy operations with the
+ * given space.
  */
-DataSpace hostAccessibleDataSpace(DataSpace dataSpace)
+DataSpace hostCopyDataSpace(DataSpace dataSpace)
 {
   switch (dataSpace) {
     case DataSpace::Host:
     case DataSpace::Omp:
     case DataSpace::CudaPinned:
+    case DataSpace::CudaManagedHostPreferred:
+    case DataSpace::CudaManagedHostPreferredDeviceAccessed:
     case DataSpace::HipHostAdviseFine:
     case DataSpace::HipHostAdviseCoarse:
     case DataSpace::HipPinned:
     case DataSpace::HipPinnedFine:
     case DataSpace::HipPinnedCoarse:
+    case DataSpace::HipManaged:
+    case DataSpace::HipManagedAdviseFine:
+    case DataSpace::HipManagedAdviseCoarse:
+    case DataSpace::SyclPinned:
       return dataSpace;
 
     case DataSpace::OmpTarget:
       return DataSpace::Host;
 
     case DataSpace::CudaManaged:
+    case DataSpace::CudaManagedDevicePreferred:
+    case DataSpace::CudaManagedDevicePreferredHostAccessed:
     case DataSpace::CudaDevice:
       return DataSpace::CudaPinned;
 
+    case DataSpace::HipDevice:
+    case DataSpace::HipDeviceFine:
+      return DataSpace::HipPinned;
+
+    case DataSpace::SyclManaged:
+    case DataSpace::SyclDevice:
+      return DataSpace::SyclPinned;
+
+    default:
+    {
+      throw std::invalid_argument("hostCopyDataSpace : Unknown data space");
+    } break;
+  }
+}
+
+/*!
+ * \brief Get a data space accessible to the host for the given dataSpace.
+ *
+ * The returned host data space may reside in memory attached to another device.
+ *
+ * The intention is to get a data space accessible on the host even if it is not
+ * performant. Return the given data space if it's already accessible and fall
+ * back on a space that is host accessible and performs well in explicit copy
+ * operations with the given space.
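+ * For example, DataSpace::CudaDevice falls back to DataSpace::CudaPinned and
+ * DataSpace::SyclDevice to DataSpace::SyclPinned below, while pinned and
+ * managed spaces are returned unchanged.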
+ */
+DataSpace hostAccessibleDataSpace(DataSpace dataSpace)
+{
+  switch (dataSpace) {
+
+    case DataSpace::Host:
+    case DataSpace::Omp:
+    case DataSpace::CudaPinned:
+    case DataSpace::CudaManaged:
+    case DataSpace::CudaManagedHostPreferred:
+    case DataSpace::CudaManagedHostPreferredDeviceAccessed:
+    case DataSpace::CudaManagedDevicePreferred:
+    case DataSpace::CudaManagedDevicePreferredHostAccessed:
+    case DataSpace::HipHostAdviseFine:
+    case DataSpace::HipHostAdviseCoarse:
+    case DataSpace::HipPinned:
+    case DataSpace::HipPinnedFine:
+    case DataSpace::HipPinnedCoarse:
     case DataSpace::HipManaged:
     case DataSpace::HipManagedAdviseFine:
     case DataSpace::HipManagedAdviseCoarse:
-      return dataSpace;
-
-    case DataSpace::HipDevice:
     case DataSpace::HipDeviceFine:
-      return DataSpace::HipPinned;
+    case DataSpace::SyclPinned:
+    case DataSpace::SyclManaged:
+      return dataSpace;
+
+    case DataSpace::OmpTarget:
+      return DataSpace::Host;
+
+    case DataSpace::CudaDevice:
+      return DataSpace::CudaPinned;
+
+    case DataSpace::SyclDevice:
+      return DataSpace::SyclPinned;

     default:
     {
diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp
index 1b233e574..b2fd990af 100644
--- a/src/common/DataUtils.hpp
+++ b/src/common/DataUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -27,6 +27,9 @@
 #if defined(RAJA_ENABLE_HIP)
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #endif
+#if defined(RAJA_ENABLE_SYCL)
+#include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
+#endif

 namespace rajaperf
 {
@@ -44,12 +47,12 @@ void resetDataInitCount();
  */
 void incDataInitCount();

-void copyHostData(void* dst_ptr, const void* src_ptr, size_t len);
+void copyHostData(void* dst_ptr, const void* src_ptr, Size_type len);

 /*!
  * \brief Allocate data arrays.
  */
-void* allocHostData(size_t len, size_t align);
+void* allocHostData(Size_type len, Size_type align);

 /*!
  * \brief Free data arrays.
  */
@@ -60,14 +63,14 @@ void deallocHostData(void* ptr);
 /*!
  * \brief Allocate data array in dataSpace.
  */
-void* allocData(DataSpace dataSpace, int nbytes, int align);
+void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align);

 /*!
  * \brief Copy data from one dataSpace to another.
  */
 void copyData(DataSpace dst_dataSpace, void* dst_ptr,
               DataSpace src_dataSpace, const void* src_ptr,
-              size_t nbytes);
+              Size_type nbytes);

 /*!
  * \brief Free data arrays in dataSpace.
  */
@@ -82,7 +85,7 @@ void deallocData(DataSpace dataSpace, void* ptr);
  * Then, two randomly-chosen entries are reset, one to
  * a value > 1, one to a value < -1.
  */
-void initData(Int_ptr& ptr, int len);
+void initData(Int_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Real_type data array.
  *
 * Array entries are initialized (non-randomly) positive values
 * in the interval (0.0, 1.0) based on their array position (index)
 * and the order in which this method is called.
 */
-void initData(Real_ptr& ptr, int len);
+void initData(Real_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Real_type data array.
  *
  * Array entries are set to given constant value.
  */
-void initDataConst(Real_ptr& ptr, int len, Real_type val);
+void initDataConst(Real_ptr& ptr, Size_type len, Real_type val);

 /*!
  * \brief Initialize Index_type data array.
  *
  * Array entries are set to given constant value.
 */
-void initDataConst(Index_type*& ptr, int len, Index_type val);
+void initDataConst(Index_type*& ptr, Size_type len, Index_type val);

 /*!
  * \brief Initialize Real_type data array with random sign.
@@ -113,14 +116,14 @@ void initDataConst(Index_type*& ptr, int len, Index_type val);
  * Array entries are initialized in the same way as the method
  * initData(Real_ptr& ptr...) above, but with random sign.
  */
-void initDataRandSign(Real_ptr& ptr, int len);
+void initDataRandSign(Real_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Real_type data array with random values.
  *
  * Array entries are initialized with random values in the interval [0.0, 1.0].
  */
-void initDataRandValue(Real_ptr& ptr, int len);
+void initDataRandValue(Real_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Complex_type data array.
@@ -128,7 +131,7 @@ void initDataRandValue(Real_ptr& ptr, int len);
  * Real and imaginary array entries are initialized in the same way as the
  * method allocAndInitData(Real_ptr& ptr...) above.
  */
-void initData(Complex_ptr& ptr, int len);
+void initData(Complex_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Real_type scalar data.
@@ -147,23 +150,43 @@ void initData(Real_type& d);
  *
  * Checksum is multiplied by given scale factor.
  */
-long double calcChecksum(Int_ptr d, int len,
+long double calcChecksum(Int_ptr d, Size_type len,
                          Real_type scale_factor);
 ///
-long double calcChecksum(Real_ptr d, int len,
+long double calcChecksum(unsigned long long* d, Size_type len,
                          Real_type scale_factor);
 ///
-long double calcChecksum(Complex_ptr d, int len,
+long double calcChecksum(Real_ptr d, Size_type len,
+                         Real_type scale_factor);
+///
+long double calcChecksum(Complex_ptr d, Size_type len,
                          Real_type scale_factor);

 } // closing brace for detail namespace

 /*!
- * \brief Get an host accessible data space for this dataSpace.
+ * \brief Get a host data space to use when making a host copy of data in the
+ *        given dataSpace.
+ *
+ * The returned host data space should reside in memory attached to the host.
+ *
+ * The intention is to get a data space with high performance on the host.
+ * Return the given data space if it's already performant and fall back on a
+ * host data space that performs well in explicit copy operations with the
+ * given space.
+ */
+DataSpace hostCopyDataSpace(DataSpace dataSpace);
+
+/*!
+ * \brief Get a data space accessible to the host for the given dataSpace.
+ *
+ * The returned host data space may reside in memory attached to another device.
  *
- * Intended to be a space that is quick to copy to from the given space if
- * the given space is not accessible on the Host.
+ * The intention is to get a data space accessible on the host even if it is not
+ * performant. Return the given data space if it's already accessible and fall
+ * back on a space that is host accessible and performs well in explicit copy
+ * operations with the given space.
  */
 DataSpace hostAccessibleDataSpace(DataSpace dataSpace);

@@ -171,16 +194,16 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace);
  * \brief Allocate data array (ptr).
 */
template <typename T>
-inline void allocData(DataSpace dataSpace, T*& ptr_ref, int len, int align)
+inline void allocData(DataSpace dataSpace, T*& ptr_ref, Size_type len, Size_type align)
 {
-  size_t nbytes = len*sizeof(T);
+  Size_type nbytes = len*sizeof(T);
   T* ptr = static_cast<T*>(detail::allocData(dataSpace, nbytes, align));

 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
   if (dataSpace == DataSpace::Omp) {
     // perform first touch on Omp Data
     #pragma omp parallel for
-    for (int i = 0; i < len; ++i) {
+    for (Size_type i = 0; i < len; ++i) {
       ptr[i] = T{};
     };
   }
@@ -205,9 +228,9 @@ inline void deallocData(DataSpace dataSpace, T*& ptr)
 template <typename T>
 inline void copyData(DataSpace dst_dataSpace, T* dst_ptr,
                      DataSpace src_dataSpace, const T* src_ptr,
-                     int len)
+                     Size_type len)
 {
-  size_t nbytes = len*sizeof(T);
+  Size_type nbytes = len*sizeof(T);
   detail::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, nbytes);
 }

@@ -216,7 +239,7 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr,
  */
 template <typename T>
 inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace,
-                     T*& ptr, int len, int align)
+                     T*& ptr, Size_type len, Size_type align)
 {
   if (new_dataSpace != old_dataSpace) {

@@ -237,7 +260,7 @@ template <typename T>
 struct AutoDataMover
 {
   AutoDataMover(DataSpace new_dataSpace, DataSpace old_dataSpace,
-                T*& ptr, int len, int align)
+                T*& ptr, Size_type len, Size_type align)
     : m_ptr(&ptr)
     , m_new_dataSpace(new_dataSpace)
     , m_old_dataSpace(old_dataSpace)
@@ -284,17 +307,17 @@ struct AutoDataMover
   T** m_ptr;
   DataSpace m_new_dataSpace;
   DataSpace m_old_dataSpace;
-  int m_len;
-  int m_align;
+  Size_type m_len;
+  Size_type m_align;
 };

 /*!
  * \brief Allocate and initialize data array.
  */
 template <typename T>
-inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align)
+inline void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align)
 {
-  DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace init_dataSpace = hostCopyDataSpace(dataSpace);

   allocData(init_dataSpace, ptr, len, align);

@@ -310,10 +333,10 @@ inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align)
  * Array entries are initialized using the method initDataConst.
  */
 template <typename T>
-inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int align,
+inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align,
                                   T val)
 {
-  DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace init_dataSpace = hostCopyDataSpace(dataSpace);

   allocData(init_dataSpace, ptr, len, align);

@@ -328,9 +351,9 @@ inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int ali
  * Array is initialized using method initDataRandSign.
  */
 template <typename T>
-inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int align)
+inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align)
 {
-  DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace init_dataSpace = hostCopyDataSpace(dataSpace);

   allocData(init_dataSpace, ptr, len, align);

@@ -346,9 +369,9 @@ inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int
  * Array is initialized using method initDataRandValue.
 */
template <typename T>
-inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int align)
+inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align)
 {
-  DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace init_dataSpace = hostCopyDataSpace(dataSpace);

   allocData(init_dataSpace, ptr, len, align);

@@ -361,13 +384,13 @@ inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int
  * Calculate and return checksum for arrays.
  */
 template <typename T>
-inline long double calcChecksum(DataSpace dataSpace, T* ptr, int len, int align,
+inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size_type align,
                                 Real_type scale_factor)
 {
   T* check_ptr = ptr;
   T* copied_ptr = nullptr;

-  DataSpace check_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace check_dataSpace = hostCopyDataSpace(dataSpace);
   if (check_dataSpace != dataSpace) {
     allocData(check_dataSpace, copied_ptr, len, align);

@@ -428,9 +451,9 @@ struct RAJAPoolAllocatorHolder
     }

     /*[[nodiscard]]*/
-    value_type* allocate(size_t num)
+    value_type* allocate(Size_type num)
     {
-      if (num > std::numeric_limits<size_t>::max() / sizeof(value_type)) {
+      if (num > std::numeric_limits<Size_type>::max() / sizeof(value_type)) {
        throw std::bad_alloc();
      }

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index d730fb21d..6a951334a 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
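A minimal sketch of how the Size_type-based DataUtils API above is driven end
to end; the length, alignment, and HIP data space here are illustrative
assumptions, not suite defaults (real kernels take these from RunParams):

    // Sketch only: exercises the allocation/init/checksum entry points above.
    #include "common/DataUtils.hpp"

    void dataUtilsSketch()
    {
      using namespace rajaperf;

      Real_ptr a = nullptr;
      Size_type len = 1024;   // illustrative problem size
      Size_type align = 64;   // illustrative; suite uses RunParams::getDataAlignment()

      // Initialization runs in hostCopyDataSpace(DataSpace::HipDevice),
      // i.e. HipPinned, before the array lands in the device space.
      allocAndInitDataConst(DataSpace::HipDevice, a, len, align, Real_type(0.0));

      // ... run a kernel variant that updates a ...

      // calcChecksum copies back through the same host copy space and then
      // applies the Kahan-summed checksum of calcChecksumImpl.
      long double cksum = calcChecksum(DataSpace::HipDevice, a, len, align,
                                       Real_type(1.0));
      (void) cksum;

      deallocData(DataSpace::HipDevice, a);
    }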
// @@ -24,7 +24,10 @@ #include "basic/REDUCE3_INT.hpp" #include "basic/INDEXLIST_3LOOP.hpp" #include "algorithm/SORT.hpp" -#include "apps/HALOEXCHANGE_FUSED.hpp" +#include "comm/HALO_PACKING_FUSED.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "comm/HALO_EXCHANGE_FUSED.hpp" +#endif #include #include @@ -121,12 +124,13 @@ Executor::Executor(int argc, char** argv) { #if defined(RAJA_PERFSUITE_USE_CALIPER) configuration cc; - adiak::init(NULL); - adiak::user(); - adiak::launchdate(); - adiak::libraries(); - adiak::cmdline(); - adiak::clustername(); + #if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI_Comm adiak_comm = MPI_COMM_WORLD; + adiak::init(&adiak_comm); + #else + adiak::init(nullptr); + #endif + adiak::collect_all(); adiak::value("perfsuite_version", cc.adiak_perfsuite_version); adiak::value("raja_version", cc.adiak_raja_version); adiak::value("cmake_build_type", cc.adiak_cmake_build_type); @@ -165,8 +169,20 @@ Executor::Executor(int argc, char** argv) if (strlen(cc.adiak_cmake_hip_architectures) > 0) { adiak::value("cmake_hip_architectures", cc.adiak_cmake_hip_architectures); } - if (cc.adiak_gpu_targets_block_sizes.size() > 0) { - adiak::value("gpu_targets_block_sizes", cc.adiak_gpu_targets_block_sizes); + if (strlen(cc.adiak_tuning_cuda_arch) > 0) { + adiak::value("tuning_cuda_arch", cc.adiak_tuning_cuda_arch); + } + if (strlen(cc.adiak_tuning_hip_arch) > 0) { + adiak::value("tuning_hip_arch", cc.adiak_tuning_hip_arch); + } + if (cc.adiak_gpu_block_sizes.size() > 0) { + adiak::value("gpu_block_sizes", cc.adiak_gpu_block_sizes); + } + if (cc.adiak_atomic_replications.size() > 0) { + adiak::value("atomic_replications", cc.adiak_atomic_replications); + } + if (cc.adiak_gpu_items_per_thread.size() > 0) { + adiak::value("gpu_items_per_thread", cc.adiak_gpu_items_per_thread); } if (cc.adiak_raja_hipcc_flags.size() > 0) { adiak::value("raja_hipcc_flags", cc.adiak_raja_hipcc_flags); @@ -316,7 +332,8 @@ void Executor::setupSuite() KernelBase::setCaliperMgrVariantTuning(vid, tstr, run_params.getOutputDirName(), - run_params.getAddToSpotConfig()); + run_params.getAddToSpotConfig(), + run_params.getAddToCaliperConfig()); #endif } @@ -392,6 +409,13 @@ void Executor::reportRunSummary(ostream& str) const str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; str << "\t Output files will be named " << ofiles << endl; +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + str << "\nRunning with " << run_params.getMPISize() << " MPI procs" << endl; + auto div3d = run_params.getMPI3DDivision(); + const char* valid3d = run_params.validMPI3DDivision() ? 
"" : "invalid"; + str << "\t 3D division = " << div3d[0] << " x " << div3d[1] << " x " << div3d[2] << " " << valid3d << endl; +#endif + str << "\nThe following kernels and variants (when available for a kernel) will be run:" << endl; str << "\nData Spaces" @@ -409,11 +433,55 @@ void Executor::reportRunSummary(ostream& str) const if (isVariantAvailable(VariantID::Base_HIP)) { str << "\nHip - " << getDataSpaceName(run_params.getHipDataSpace()); } + if (isVariantAvailable(VariantID::Base_SYCL)) { + str << "\nSycl - " << getDataSpaceName(run_params.getSyclDataSpace()); + } if (isVariantAvailable(VariantID::Kokkos_Lambda)) { str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosDataSpace()); } str << endl; + str << "\nReduction Data Spaces" + << "\n--------"; + str << "\nSeq - " << getDataSpaceName(run_params.getSeqReductionDataSpace()); + if (isVariantAvailable(VariantID::Base_OpenMP)) { + str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_OpenMPTarget)) { + str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_CUDA)) { + str << "\nCuda - " << getDataSpaceName(run_params.getCudaReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_HIP)) { + str << "\nHip - " << getDataSpaceName(run_params.getHipReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Kokkos_Lambda)) { + str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosReductionDataSpace()); + } + str << endl; + + str << "\nMPI Data Spaces" + << "\n--------"; + str << "\nSeq - " << getDataSpaceName(run_params.getSeqMPIDataSpace()); + if (isVariantAvailable(VariantID::Base_OpenMP)) { + str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_OpenMPTarget)) { + str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_CUDA)) { + str << "\nCuda - " << getDataSpaceName(run_params.getCudaMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_HIP)) { + str << "\nHip - " << getDataSpaceName(run_params.getHipMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Kokkos_Lambda)) { + str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosMPIDataSpace()); + } + str << endl; + + str << "\nVariants and Tunings" << "\n--------\n"; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { @@ -456,15 +524,21 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const Index_type itsrep_width = 0; Index_type bytesrep_width = 0; Index_type flopsrep_width = 0; + Index_type bytesReadrep_width = 0; + Index_type bytesWrittenrep_width = 0; + Index_type bytesAtomicModifyWrittenrep_width = 0; Index_type dash_width = 0; for (size_t ik = 0; ik < kernels.size(); ++ik) { kercol_width = max(kercol_width, kernels[ik]->getName().size()); psize_width = max(psize_width, kernels[ik]->getActualProblemSize()); reps_width = max(reps_width, kernels[ik]->getRunReps()); - itsrep_width = max(reps_width, kernels[ik]->getItsPerRep()); + itsrep_width = max(itsrep_width, kernels[ik]->getItsPerRep()); bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep()); - flopsrep_width = max(bytesrep_width, kernels[ik]->getFLOPsPerRep()); + flopsrep_width = max(flopsrep_width, kernels[ik]->getFLOPsPerRep()); + bytesReadrep_width = max(bytesReadrep_width, kernels[ik]->getBytesReadPerRep()); + bytesWrittenrep_width = 
max(bytesWrittenrep_width, kernels[ik]->getBytesWrittenPerRep());
+    bytesAtomicModifyWrittenrep_width =
+      max(bytesAtomicModifyWrittenrep_width, kernels[ik]->getBytesAtomicModifyWrittenPerRep());
   }

   const string sepchr(" , ");
@@ -508,6 +582,24 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
                           static_cast<Index_type>(frsize) ) + 3;
   dash_width += flopsrep_width + static_cast<Index_type>(sepchr.size());

+  double brrsize = log10( static_cast<double>(bytesReadrep_width) );
+  string bytesReadrep_head("BytesRead/rep");
+  bytesReadrep_width = max( static_cast<Index_type>(bytesReadrep_head.size()),
+                            static_cast<Index_type>(brrsize) ) + 3;
+  dash_width += bytesReadrep_width + static_cast<Index_type>(sepchr.size());
+
+  double bwrsize = log10( static_cast<double>(bytesWrittenrep_width) );
+  string bytesWrittenrep_head("BytesWritten/rep");
+  bytesWrittenrep_width = max( static_cast<Index_type>(bytesWrittenrep_head.size()),
+                               static_cast<Index_type>(bwrsize) ) + 3;
+  dash_width += bytesWrittenrep_width + static_cast<Index_type>(sepchr.size());
+
+  double bamrrsize = log10( static_cast<double>(bytesAtomicModifyWrittenrep_width) );
+  string bytesAtomicModifyWrittenrep_head("BytesAtomicModifyWritten/rep");
+  bytesAtomicModifyWrittenrep_width = max( static_cast<Index_type>(bytesAtomicModifyWrittenrep_head.size()),
+                                           static_cast<Index_type>(bamrrsize) ) + 3;
+  dash_width += bytesAtomicModifyWrittenrep_width + static_cast<Index_type>(sepchr.size());
+
   str <<setw(kernsrep_width) << kernels[ik]->getKernelsPerRep()
       << sepchr <<setw(bytesrep_width) << kernels[ik]->getBytesPerRep()
       << sepchr <<setw(flopsrep_width) << kernels[ik]->getFLOPsPerRep()
+      << sepchr <<setw(bytesReadrep_width) << kernels[ik]->getBytesReadPerRep()
+      << sepchr <<setw(bytesWrittenrep_width) << kernels[ik]->getBytesWrittenPerRep()
+      << sepchr <<setw(bytesAtomicModifyWrittenrep_width) << kernels[ik]->getBytesAtomicModifyWrittenPerRep()
       << endl;
 }

@@ -632,59 +730,77 @@ void Executor::runWarmupKernels()
   getCout() << "\n\nRun warmup kernels...\n";

   //
-  // For kernels to be run, assemble a set of feature IDs
+  // Get warmup kernels to run from input
   //
-  std::set<FeatureID> feature_ids;
-  for (size_t ik = 0; ik < kernels.size(); ++ik) {
-    KernelBase* kernel = kernels[ik];
+  std::set<KernelID> kernel_ids = run_params.getWarmupKernelIDsToRun();
+
+  if ( kernel_ids.empty() ) {
+
+    //
+    // If no warmup kernels were given, choose a warmup kernel for each feature
+    //
+
+    //
+    // For kernels to be run, assemble a set of feature IDs
+    //
+    std::set<FeatureID> feature_ids;
+    for (size_t ik = 0; ik < kernels.size(); ++ik) {
+      KernelBase* kernel = kernels[ik];

-    for (size_t fid = 0; fid < NumFeatures; ++fid) {
-      FeatureID tfid = static_cast<FeatureID>(fid);
-      if (kernel->usesFeature(tfid) ) {
-        feature_ids.insert( tfid );
+      for (size_t fid = 0; fid < NumFeatures; ++fid) {
+        FeatureID tfid = static_cast<FeatureID>(fid);
+        if (kernel->usesFeature(tfid) ) {
+          feature_ids.insert( tfid );
+        }
       }
-    }
-
-  } // iterate over kernels

-  //
-  // Map feature IDs to set of warmup kernel IDs
-  //
-  std::set<KernelID> kernel_ids;
-  for ( auto fid = feature_ids.begin(); fid != feature_ids.end(); ++ fid ) {
+    } // iterate over kernels

-    switch (*fid) {
+    //
+    // Map feature IDs to set of warmup kernel IDs
+    //
+    for ( auto fid = feature_ids.begin(); fid != feature_ids.end(); ++ fid ) {

-      case Forall:
-      case Kernel:
-      case Launch:
-        kernel_ids.insert(Basic_DAXPY); break;
+      switch (*fid) {

-      case Sort:
-        kernel_ids.insert(Algorithm_SORT); break;
-
-      case Scan:
-        kernel_ids.insert(Basic_INDEXLIST_3LOOP); break;
+      case Forall:
+      case Kernel:
+      case Launch:
+        kernel_ids.insert(Basic_DAXPY); break;

-      case Workgroup:
-        kernel_ids.insert(Apps_HALOEXCHANGE_FUSED); break;
+      case Sort:
+        kernel_ids.insert(Algorithm_SORT); break;

-      case Reduction:
-        kernel_ids.insert(Basic_REDUCE3_INT); break;
+      case Scan:
+        kernel_ids.insert(Basic_INDEXLIST_3LOOP); break;

-      case Atomic:
-        kernel_ids.insert(Basic_PI_ATOMIC); break;
+      case Workgroup:
+        kernel_ids.insert(Comm_HALO_PACKING_FUSED); break;

-      case View:
-        break;
-
-      default:
-        break;
+      case Reduction:
+        kernel_ids.insert(Basic_REDUCE3_INT); break;
+
+      case Atomic:
+        kernel_ids.insert(Basic_PI_ATOMIC); break;
+
+      case View:
+        break;
+
+      #ifdef RAJA_PERFSUITE_ENABLE_MPI
+      case MPI:
+        kernel_ids.insert(Comm_HALO_EXCHANGE_FUSED); break;
+      #endif
+
+      default:
+        break;
+
+      }
     }
   }

+
   //
   // Run warmup kernels
   //
diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp
index c16286700..348fb44b7 100644
--- a/src/common/Executor.hpp
+++ b/src/common/Executor.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index 8d6012a6d..dcf309ec9 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -16,10 +16,12 @@

 #include "rajaperf_config.hpp"

+#include
+
 namespace rajaperf
 {

-namespace gpu_block_size
+namespace integer
 {

 namespace detail
 {

@@ -44,50 +46,40 @@ constexpr size_t lesser_of_squarest_factor_pair_helper(size_t n, size_t guess)
       : lesser_of_squarest_factor_pair_helper(n, guess - 1); // continue searching
 }

-// class to get the size of a camp::int_seq
-template < typename IntSeq >
-struct SizeOfIntSeq;
-///
-template < size_t... Is >
-struct SizeOfIntSeq<camp::int_seq<size_t, Is...>>
-{
-  static const size_t size = sizeof...(Is);
-};
-
 // class to help prepend integers to a list
-// this is used for the false case where I is not prepended to IntSeq
-template < bool B, size_t I, typename IntSeq >
+// this is used for the false case where T is not prepended to List
+template < bool B, typename T, typename List >
 struct conditional_prepend
 {
-  using type = IntSeq;
+  using type = List;
 };
-/// this is used for the true case where I is prepended to IntSeq
-template < size_t I, size_t... Is >
-struct conditional_prepend<true, I, camp::int_seq<size_t, Is...>>
+/// this is used for the true case where T is prepended to List
+template < typename T, typename... Ts >
+struct conditional_prepend<true, T, camp::list<Ts...>>
 {
-  using type = camp::int_seq<size_t, I, Is...>;
+  using type = camp::list<T, Ts...>;
 };

-// class to help create a sequence that is only the valid values in IntSeq
-template < typename validity_checker, typename IntSeq >
+// class to help create a sequence that is only the valid values in List
+template < typename validity_checker, typename List >
 struct remove_invalid;

 // base case where the list is empty, use the empty list
 template < typename validity_checker >
-struct remove_invalid<validity_checker, camp::int_seq<size_t>>
+struct remove_invalid<validity_checker, camp::list<>>
 {
-  using type = camp::int_seq<size_t>;
+  using type = camp::list<>;
 };

 // check validity of T and conditionally prepend T to a recursively generated
 // list of valid values
-template < typename validity_checker, size_t I, size_t... Is >
-struct remove_invalid<validity_checker, camp::int_seq<size_t, I, Is...>>
+template < typename validity_checker, typename T, typename... Ts >
+struct remove_invalid<validity_checker, camp::list<T, Ts...>>
 {
   using type = typename conditional_prepend<
-      validity_checker::template valid<I>(),
-      I,
-      typename remove_invalid<validity_checker, camp::int_seq<size_t, Is...>>::type
+      validity_checker::valid(T{}),
+      T,
+      typename remove_invalid<validity_checker, camp::list<Ts...>>::type
     >::type;
 };

@@ -119,55 +111,236 @@ constexpr size_t greater_of_squarest_factor_pair(size_t n)
 // always true
 struct AllowAny
 {
-  template < size_t I >
-  static constexpr bool valid() { return true; }
+  static constexpr bool valid(size_t RAJAPERF_UNUSED_ARG(i)) { return true; }
+};
+
+// true only if i > 0
+struct PositiveOnly
+{
+  static constexpr bool valid(size_t i) { return i > 0; }
 };

-// true if I is a multiple of N, false otherwise
+// true if i is a multiple of N, false otherwise
 template < size_t N >
 struct MultipleOf
 {
-  template < size_t I >
-  static constexpr bool valid() { return (I/N)*N == I; }
+  static constexpr bool valid(size_t i) { return (i/N)*N == i; }
 };

-// true if the sqrt of I is representable as a size_t, false otherwise
+// true if the sqrt of i is representable as a size_t, false otherwise
 struct ExactSqrt
 {
-  template < size_t I >
-  static constexpr bool valid() { return sqrt(I)*sqrt(I) == I; }
+  static constexpr bool valid(size_t i) { return sqrt(i)*sqrt(i) == i; }
 };

-template < size_t... block_sizes >
-using list_type = camp::int_seq<size_t, block_sizes...>;
+template < size_t N >
+struct LessEqual
+{
+  static constexpr bool valid(size_t i) { return i <= N; }
+};

-// A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes
-// if rajaperf::configuration::gpu_block_sizes is not empty
-// and a camp::int_seq of default_block_size otherwise
-// with invalid entries removed according to validity_checker
-template < size_t default_block_size, typename validity_checker = AllowAny >
-using make_list_type =
+// A camp::list of camp::integral_constant<size_t, I> types.
+// If gpu_block_sizes from the configuration is not empty it is those gpu_block_sizes,
+// otherwise it is a list containing just default_block_size.
+// Invalid entries are removed according to validity_checker in either case.
+template < size_t default_block_size, typename validity_checker = PositiveOnly >
+using make_gpu_block_size_list_type =
   typename detail::remove_invalid<validity_checker,
-    typename std::conditional< (SizeOfIntSeq<rajaperf::configuration::gpu_block_sizes>::size > 0),
+    typename std::conditional< (camp::size<rajaperf::configuration::gpu_block_sizes>::value > 0),
       rajaperf::configuration::gpu_block_sizes,
       list_type<default_block_size>
     >::type
   >::type;

-} // closing brace for gpu_block_size namespace
+// A camp::list of camp::integral_constant<size_t, I> types.
+// If atomic_replications from the configuration is not empty it is those atomic_replications,
+// otherwise it is a list containing just default_atomic_replication.
+// Invalid entries are removed according to validity_checker in either case.
+template < size_t default_atomic_replication, typename validity_checker = PositiveOnly >
+using make_atomic_replication_list_type =
+  typename detail::remove_invalid<validity_checker,
+    typename std::conditional< (camp::size<rajaperf::configuration::atomic_replications>::value > 0),
+      rajaperf::configuration::atomic_replications,
+      list_type<default_atomic_replication>
+    >::type
+  >::type;
+
+// A camp::list of camp::integral_constant<size_t, I> types.
+// If gpu_items_per_thread from the configuration is not empty it is those gpu_items_per_thread,
+// otherwise it is a list containing just default_gpu_items_per_thread.
+// Invalid entries are removed according to validity_checker in either case.
+template < size_t default_gpu_items_per_thread, typename validity_checker = PositiveOnly >
+using make_gpu_items_per_thread_list_type =
+  typename detail::remove_invalid<validity_checker,
+    typename std::conditional< (camp::size<rajaperf::configuration::gpu_items_per_thread>::value > 0),
+      rajaperf::configuration::gpu_items_per_thread,
+      list_type<default_gpu_items_per_thread>
+    >::type
+  >::type;
+
+} // closing brace for integer namespace

-//compile time loop over an integer sequence
-//this allows for creating a loop over a compile time constant variable
-template <typename IndexType, IndexType... Is, typename Func>
-inline void seq_for(camp::int_seq<IndexType, Is...> const&, Func&& func)
+namespace gpu_algorithm {
+
+struct block_atomic_helper
 {
-  // braced init lists are evaluated in order
-  int seq_unused_array[] = {(func(camp::integral_constant<IndexType, Is>{}), 0)...};
-  RAJAPERF_UNUSED_VAR(seq_unused_array);
-}
+  static constexpr bool atomic = true;
+  static std::string get_name() { return "blkatm"; }
+};
+
+struct block_device_helper
+{
+  static constexpr bool atomic = false;
+  static std::string get_name() { return "blkdev"; }
+};
+
+struct block_host_helper
+{
+  static constexpr bool atomic = false;
+  static std::string get_name() { return "blkhst"; }
+};
+
+using reducer_helpers = camp::list<
+                          block_atomic_helper,
+                          block_device_helper >;
+
+} // closing brace for gpu_algorithm namespace
+
+namespace gpu_mapping {
+
+struct global_direct_helper
+{
+  static constexpr bool direct = true;
+  static std::string get_name() { return "direct"; }
+};
+
+struct global_loop_occupancy_grid_stride_helper
+{
+  static constexpr bool direct = false;
+  static std::string get_name() { return "occgs"; }
+};
+
+using reducer_helpers = camp::list<
+                          global_direct_helper,
+                          global_loop_occupancy_grid_stride_helper >;
+
+} // closing brace for gpu_mapping namespace

 } // closing brace for rajaperf namespace

+// Get the max number of blocks to launch with the given MappingHelper
+// for kernel func with the given block_size and shmem.
+// This will use the occupancy calculator if MappingHelper::direct is false
+#define RAJAPERF_CUDA_GET_MAX_BLOCKS(MappingHelper, func, block_size, shmem) \
+  MappingHelper::direct \
+      ? std::numeric_limits<size_t>::max() \
+      : detail::getCudaOccupancyMaxBlocks( \
+            (func), (block_size), (shmem));
+///
+#define RAJAPERF_HIP_GET_MAX_BLOCKS(MappingHelper, func, block_size, shmem) \
+  MappingHelper::direct \
+      ? std::numeric_limits<size_t>::max() \
+      : detail::getHipOccupancyMaxBlocks( \
+            (func), (block_size), (shmem));
+
+// allocate pointer of pointer_type with length
+// device_ptr_name gets memory in the reduction data space for the current variant
+// host_ptr_name is set to either device_ptr_name if the reduction data space is
+// host accessible or a new allocation in a host accessible data space otherwise
+#define RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) \
+  DataSpace reduction_data_space = getReductionDataSpace(vid); \
+  DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); \
+  \
+  pointer_type device_ptr_name; \
+  allocData(reduction_data_space, device_ptr_name, (length)*(replication)); \
+  pointer_type host_ptr_name = device_ptr_name; \
+  if (reduction_data_space != host_data_space) { \
+    allocData(host_data_space, host_ptr_name, (length)*(replication)); \
+  }
+
+// deallocate device_ptr_name and host_ptr_name
+// must be in the same scope as RAJAPERF_GPU_REDUCER_SETUP_IMPL
+#define RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) \
+  deallocData(reduction_data_space, device_ptr_name); \
+  if (reduction_data_space != host_data_space) { \
+    deallocData(host_data_space, host_ptr_name); \
+  }
+
+// Initialize device_ptr_name with length copies of init_value
+// host_ptr_name will be used as an intermediary with an explicit copy
+// if the reduction data space is not host accessible
+#define RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(gpu_type, init_value, device_ptr_name, host_ptr_name, length, replication) \
+  if (device_ptr_name != host_ptr_name) { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+      for (size_t r = 0; r < static_cast<size_t>(replication); ++r) { \
+        host_ptr_name[i*(replication) + r] = (init_value); \
+      } \
+    } \
+    gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \
+        (length)*(replication)*sizeof(device_ptr_name[0]), \
+        gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \
+  } else { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+      for (size_t r = 0; r < static_cast<size_t>(replication); ++r) { \
+        device_ptr_name[i*(replication) + r] = (init_value); \
+      } \
+    } \
+  }
+
+// Initialize device_ptr_name with values in init_ptr
+// host_ptr_name will be used as an intermediary with an explicit copy
+// if the reduction data space is not host accessible
+#define RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(gpu_type, init_ptr, device_ptr_name, host_ptr_name, length, replication) \
+  if (device_ptr_name != host_ptr_name) { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+      for (size_t r = 0; r < static_cast<size_t>(replication); ++r) { \
+        host_ptr_name[i*(replication) + r] = (init_ptr)[i]; \
+      } \
+    } \
+    gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \
+        (length)*(replication)*sizeof(device_ptr_name[0]), \
+        gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \
+  } else { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+      for (size_t r = 0; r < static_cast<size_t>(replication); ++r) { \
+        device_ptr_name[i*(replication) + r] = (init_ptr)[i]; \
+      } \
+    } \
+  }
+
+// Copy back data from device_ptr_name into host_ptr_name
+// if the reduction data space is not host accessible
+#define RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(gpu_type, device_ptr_name, host_ptr_name, length, replication) \
+  if (device_ptr_name != host_ptr_name) { \
+    gpu_type##Errchk( gpu_type##MemcpyAsync( host_ptr_name, device_ptr_name, \
(length)*(replication)*sizeof(device_ptr_name[0]), \ + gpu_type##MemcpyDeviceToHost, res.get_stream() ) ); \ + } \ + gpu_type##Errchk( gpu_type##StreamSynchronize( res.get_stream() ) ); + +#define RAJAPERF_CUDA_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_CUDA_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ + RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) +#define RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(cuda, init_value, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_CUDA_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(cuda, init_ptr, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_CUDA_REDUCER_COPY_BACK(device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, device_ptr_name, host_ptr_name, length, replication) + +#define RAJAPERF_HIP_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_HIP_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ + RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) +#define RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(hip, init_value, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_HIP_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(hip, init_ptr, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_HIP_REDUCER_COPY_BACK(device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, device_ptr_name, host_ptr_name, length, replication) + + // #define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(kernel, variant) \ void kernel::run##variant##Variant(VariantID vid, size_t tune_idx) \ @@ -190,7 +363,11 @@ inline void seq_for(camp::int_seq const&, Func&& func) seq_for(gpu_block_sizes_type{}, [&](auto block_size) { \ if (run_params.numValidGPUBlockSize() == 0u || \ run_params.validGPUBlockSize(block_size)) { \ - addVariantTuningName(vid, "block_"+std::to_string(block_size)); \ + if (block_size == 0u) { \ + addVariantTuningName(vid, "block_auto"); \ + } else { \ + addVariantTuningName(vid, "block_"+std::to_string(block_size)); \ + } \ } \ }); \ } diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 8046fe785..14c1b7381 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,6 +29,33 @@ namespace rajaperf { +/*! + * \brief Method for launching a HIP kernel with given configuration. + * + * Note: method checks whether number of args and their types in + * kernel signature matches args passed to this method. 
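+ *
+ * For example, given a kernel declared as
+ *   __global__ void axpy(Real_ptr y, Real_ptr x, Real_type a, Index_type n);
+ * a launch through this helper would look like
+ *   RPlaunchHipKernel( axpy, dim3(nblocks), dim3(block_size),
+ *                      shmem, res.get_stream(),
+ *                      y, x, a, n );
+ * (axpy and its arguments are illustrative, not part of the suite).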
+ */
+template <typename... KernArgs, typename... Args>
+void RPlaunchHipKernel(void (*kernel)(KernArgs...),
+                       const dim3& numBlocks, const dim3& dimBlocks,
+                       std::uint32_t sharedMemBytes, hipStream_t stream,
+                       Args const&... args)
+{
+  static_assert(sizeof...(KernArgs) == sizeof...(Args),
+                "Number of kernel args doesn't match what's passed to method");
+
+  static_assert(conjunction<std::is_same<std::decay_t<KernArgs>, std::decay_t<Args>>...>::value,
+                "Kernel arg types don't match what's passed to method");
+
+  constexpr size_t count = sizeof...(Args);
+  void* arg_arr[count]{(void*)&args...};
+
+  auto k = reinterpret_cast<const void*>(kernel);
+  hipErrchk( hipLaunchKernel(k, numBlocks, dimBlocks,
+                             arg_arr,
+                             sharedMemBytes, stream) );
+}
+
 /*!
  * \brief Simple forall hip kernel that runs a lambda.
  */
@@ -81,10 +108,37 @@ inline int getHipDevice()
   return device;
 }

+/*!
+ * \brief Get properties of the current hip device.
+ */
+inline hipDeviceProp_t getHipDeviceProp()
+{
+  hipDeviceProp_t prop;
+  hipErrchk(hipGetDeviceProperties(&prop, getHipDevice()));
+  return prop;
+}
+
+/*!
+ * \brief Get max occupancy in blocks for the given kernel for the current
+ *        hip device.
+ */
+template < typename Func >
+RAJA_INLINE
+int getHipOccupancyMaxBlocks(Func&& func, int num_threads, size_t shmem_size)
+{
+  int max_blocks = -1;
+  hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &max_blocks, func, num_threads, shmem_size));
+
+  size_t multiProcessorCount = getHipDeviceProp().multiProcessorCount;
+
+  return max_blocks * multiProcessorCount;
+}
+
 /*
  * Copy memory len bytes from src to dst.
  */
-inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len)
+inline void copyHipData(void* dst_ptr, const void* src_ptr, Size_type len)
 {
   hipErrchk( hipMemcpy( dst_ptr, src_ptr, len, hipMemcpyDefault ) );

@@ -93,7 +147,7 @@ inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len)
 /*!
  * \brief Allocate HIP device data array (dptr).
  */
-inline void* allocHipDeviceData(size_t len)
+inline void* allocHipDeviceData(Size_type len)
 {
   void* dptr = nullptr;
   hipErrchk( hipMalloc( &dptr, len ) );
@@ -103,7 +157,7 @@ inline void* allocHipDeviceData(size_t len)
 /*!
  * \brief Allocate HIP fine-grained device data array (dfptr).
  */
-inline void* allocHipDeviceFineData(size_t len)
+inline void* allocHipDeviceFineData(Size_type len)
 {
   void* dfptr = nullptr;
   hipErrchk( hipExtMallocWithFlags( &dfptr, len,
@@ -114,7 +168,7 @@ inline void* allocHipDeviceFineData(size_t len)
 /*!
  * \brief Allocate HIP managed data array (mptr).
  */
-inline void* allocHipManagedData(size_t len)
+inline void* allocHipManagedData(Size_type len)
 {
   void* mptr = nullptr;
   hipErrchk( hipMallocManaged( &mptr, len,
@@ -125,7 +179,7 @@ inline void* allocHipManagedData(size_t len)
 /*!
  * \brief Allocate HIP pinned data array (pptr).
  */
-inline void* allocHipPinnedData(size_t len)
+inline void* allocHipPinnedData(Size_type len)
 {
   void* pptr = nullptr;
   hipErrchk( hipHostMalloc( &pptr, len,
@@ -136,7 +190,7 @@ inline void* allocHipPinnedData(size_t len)
 /*!
  * \brief Allocate HIP fine-grained pinned data array (pfptr).
  */
-inline void* allocHipPinnedFineData(size_t len)
+inline void* allocHipPinnedFineData(Size_type len)
 {
   void* pfptr = nullptr;
   hipErrchk( hipHostMalloc( &pfptr, len,
@@ -147,7 +201,7 @@ inline void* allocHipPinnedFineData(size_t len)
 /*!
  * \brief Allocate HIP coarse-grained pinned data array (pcptr).
 */
-inline void* allocHipPinnedCoarseData(size_t len)
+inline void* allocHipPinnedCoarseData(Size_type len)
 {
   void* pcptr = nullptr;
   hipErrchk( hipHostMalloc( &pcptr, len,
@@ -158,7 +212,7 @@ inline void* allocHipPinnedCoarseData(size_t len)
 /*!
  * \brief Apply mem advice to HIP data array (ptr).
  */
-inline void adviseHipData(void* ptr, int len, hipMemoryAdvise advice, int device)
+inline void adviseHipData(void* ptr, size_t len, hipMemoryAdvise advice, int device)
 {
   hipErrchk( hipMemAdvise( ptr, len, advice, device ) );
 }
diff --git a/src/common/HipGridScan.hpp b/src/common/HipGridScan.hpp
new file mode 100644
index 000000000..c8c0c6e8b
--- /dev/null
+++ b/src/common/HipGridScan.hpp
@@ -0,0 +1,260 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/COPYRIGHT file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#if defined(RAJA_ENABLE_HIP)
+
+#include
+#include
+#include
+#include
+
+#include
+
+namespace rajaperf
+{
+namespace detail
+{
+namespace hip
+{
+
+//
+// Define magic numbers for HIP execution
+//
+const size_t warp_size = 64;
+const size_t max_static_shmem = 65536;
+
+
+// performs a grid scan on val and returns the results at each thread
+// in exclusive and inclusive; note that val is used as scratch space
+template < typename DataType, size_t block_size, size_t items_per_thread >
+struct GridScan
+{
+  using BlockScan = rocprim::block_scan<DataType, block_size>; //, rocprim::block_scan_algorithm::reduce_then_scan>;
+  using BlockExchange = rocprim::block_exchange<DataType, block_size, items_per_thread>;
+  using WarpReduce = rocprim::warp_reduce<DataType, warp_size>;
+
+  union SharedStorage {
+    typename BlockScan::storage_type block_scan_storage;
+    typename BlockExchange::storage_type block_exchange_storage;
+    typename WarpReduce::storage_type warp_reduce_storage;
+    volatile DataType prev_grid_count;
+  };
+
+  static constexpr size_t shmem_size = sizeof(SharedStorage);
+
+  __device__
+  static void grid_scan(const int block_id,
+                        DataType (&val)[items_per_thread],
+                        DataType (&exclusive)[items_per_thread],
+                        DataType (&inclusive)[items_per_thread],
+                        DataType* block_counts,
+                        DataType* grid_counts,
+                        unsigned* block_readys)
+  {
+    const bool first_block = (block_id == 0);
+    const bool last_block = (block_id == static_cast<int>(gridDim.x-1));
+    const bool last_thread = (threadIdx.x == block_size-1);
+    const bool last_warp = (threadIdx.x >= block_size - warp_size);
+    const int warp_index = (threadIdx.x % warp_size);
+    const unsigned long long warp_index_mask = (1ull << warp_index);
+    const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull);
+
+    __shared__ SharedStorage s_temp_storage;
+
+
+    BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage);
+    __syncthreads();
+
+
+    BlockScan().exclusive_scan(val, exclusive, DataType{0}, s_temp_storage.block_scan_storage);
+    __syncthreads();
+
+    for (size_t ti = 0; ti < items_per_thread; ++ti) {
+      inclusive[ti] = exclusive[ti] + val[ti];
+    }
+
+    BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage);
+    __syncthreads();
+    BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage);
+    __syncthreads();
+    if (first_block) {
+
+      if (!last_block && last_thread) {
+        block_counts[block_id] = inclusive[items_per_thread-1]; // write
inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } + + } else { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } + + // get prev_grid_count using last warp in block + if (last_warp) { + + DataType prev_grid_count = 0; + + // accumulate previous block counts into registers of warp + + int prev_block_base_id = block_id - warp_size; + + unsigned prev_block_ready = 0u; + unsigned long long prev_blocks_ready_ballot = 0ull; + unsigned long long prev_grids_ready_ballot = 0ull; + + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + do { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + + } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); + + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + + if (prev_grids_ready_ballot != 0ull) { + break; + } + + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + + prev_block_ready = 0u; + + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + + if (prev_block_id >= 0) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } + + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + } + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. 
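+      // At this point prev_grids_ready_ballot has a bit set for each lane in
+      // this (possibly partial) window whose block has published grid_counts
+      // (ready flag == 2u), and prev_blocks_ready_ballot marks lanes whose
+      // block_counts are ready. In this final window, lanes strictly above
+      // the highest grid-ready lane add that block's block_counts, the
+      // highest grid-ready lane itself adds grid_counts (the inclusive total
+      // through its block), and lower lanes add nothing; the WarpReduce
+      // below then combines these partial sums into prev_grid_count.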
+ if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } + + + WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); + prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_temp_storage.prev_grid_count = prev_grid_count; + } + } + + __syncthreads(); + DataType prev_grid_count = s_temp_storage.prev_grid_count; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } + } + } + +}; + + +namespace detail +{ + +template < typename T, size_t block_size, size_t max_items_per_thread > +struct grid_scan_max_items_per_thread + : std::conditional_t< (GridScan::shmem_size <= max_static_shmem), + grid_scan_max_items_per_thread, + std::integral_constant > +{ +}; + +} + +template < typename T, size_t block_size > +struct grid_scan_max_items_per_thread + : detail::grid_scan_max_items_per_thread +{ +}; + + +// tune grid scan to maximize throughput while minimizing items_per_thread + +// default tuning for unknown DataType or hip_arch +template < typename DataType, size_t block_size, size_t hip_arch, typename enable = void > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + grid_scan_max_items_per_thread::value / 2; +}; + +// tuning for gfx90a +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 910, std::enable_if_t > +{ + static constexpr size_t value = + (block_size <= 64) ? 6 : + (block_size <= 128) ? 4 : + (block_size <= 256) ? 4 : + (block_size <= 512) ? 4 : + (block_size <= 1024) ? 2 : 1; +}; + +// tuning for gfx942 +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 942, std::enable_if_t> +{ + static constexpr size_t value = + (block_size <= 64) ? 22 : + (block_size <= 128) ? 22 : + (block_size <= 256) ? 19 : + (block_size <= 512) ? 13 : + (block_size <= 1024) ? 7 : 1; +}; + +} // end namespace hip +} // end namespace detail +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 646c9bd8d..679001df5 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
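The gfx90a (910) and gfx942 tunings above are resolved at compile time. A
minimal sketch of how a caller might pick items_per_thread and sanity-check
the shared-memory budget; the double/256 combination is an illustrative
assumption, and the selected value depends on which tuning specialization
applies:

    // Sketch only: compile-time use of the HIP grid-scan tuning machinery.
    #include <cstddef>
    #include "common/HipGridScan.hpp"

    namespace sketch
    {
      constexpr size_t block_size = 256;
      constexpr size_t hip_arch = 910;   // gfx90a, as in the tuning above

      constexpr size_t items_per_thread =
        rajaperf::detail::hip::grid_scan_default_items_per_thread<
            double, block_size, hip_arch>::value;

      using grid_scan = rajaperf::detail::hip::GridScan<
          double, block_size, items_per_thread>;

      static_assert(grid_scan::shmem_size
                      <= rajaperf::detail::hip::max_static_shmem,
                    "grid scan shared storage must fit in static shared memory");
    }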
 //
@@ -11,6 +11,7 @@
 #include "RunParams.hpp"
 #include "OpenMPTargetDataUtils.hpp"
+#include "RAJA/RAJA.hpp"
 #include
 #include
 #include
@@ -37,7 +38,9 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)

   its_per_rep = -1;
   kernels_per_rep = -1;
-  bytes_per_rep = -1;
+  bytes_read_per_rep = -1;
+  bytes_written_per_rep = -1;
+  bytes_atomic_modify_written_per_rep = -1;
   FLOPs_per_rep = -1;

   running_variant = NumVariants;
@@ -68,6 +71,18 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                      CALI_ATTR_ASVALUE |
                                      CALI_ATTR_AGGREGATABLE |
                                      CALI_ATTR_SKIP_EVENTS);
+  Bytes_Read_Rep_attr = cali_create_attribute("BytesRead/Rep", CALI_TYPE_DOUBLE,
+                                     CALI_ATTR_ASVALUE |
+                                     CALI_ATTR_AGGREGATABLE |
+                                     CALI_ATTR_SKIP_EVENTS);
+  Bytes_Written_Rep_attr = cali_create_attribute("BytesWritten/Rep", CALI_TYPE_DOUBLE,
+                                     CALI_ATTR_ASVALUE |
+                                     CALI_ATTR_AGGREGATABLE |
+                                     CALI_ATTR_SKIP_EVENTS);
+  Bytes_AtomicModifyWritten_Rep_attr = cali_create_attribute("BytesAtomicModifyWritten/Rep", CALI_TYPE_DOUBLE,
+                                     CALI_ATTR_ASVALUE |
+                                     CALI_ATTR_AGGREGATABLE |
+                                     CALI_ATTR_SKIP_EVENTS);
   Flops_Rep_attr = cali_create_attribute("Flops/Rep", CALI_TYPE_DOUBLE,
                                      CALI_ATTR_ASVALUE |
                                      CALI_ATTR_AGGREGATABLE |
@@ -76,6 +91,14 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                      CALI_ATTR_ASVALUE |
                                      CALI_ATTR_AGGREGATABLE |
                                      CALI_ATTR_SKIP_EVENTS);
+  for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) {
+    FeatureID fid = static_cast<FeatureID>(i);
+    std::string feature = getFeatureName(fid);
+    Feature_attrs[feature] = cali_create_attribute(feature.c_str(), CALI_TYPE_INT,
+                                     CALI_ATTR_ASVALUE |
+                                     CALI_ATTR_AGGREGATABLE |
+                                     CALI_ATTR_SKIP_EVENTS);
+  }
 #endif
 }

@@ -167,13 +190,23 @@ void KernelBase::setVariantDefined(VariantID vid)
 #endif
       break;
     }
+
+    case Base_SYCL:
+    case RAJA_SYCL:
+    {
+#if defined(RAJA_ENABLE_SYCL)
+      setSyclTuningDefinitions(vid);
+#endif
+      break;
+    }
+
     // Required for running Kokkos
     case Kokkos_Lambda :
     {
 #if defined(RUN_KOKKOS)
-    setKokkosTuningDefinitions(vid);
+      setKokkosTuningDefinitions(vid);
 #endif
-    break;
+      break;
     }

     default : {
@@ -194,7 +227,7 @@ void KernelBase::setVariantDefined(VariantID vid)
 #endif
 }

-int KernelBase::getDataAlignment() const
+Size_type KernelBase::getDataAlignment() const
 {
   return run_params.getDataAlignment();
 }

@@ -227,6 +260,10 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const
     case RAJA_HIP :
       return run_params.getHipDataSpace();

+    case Base_SYCL :
+    case RAJA_SYCL :
+      return run_params.getSyclDataSpace();
+
     case Kokkos_Lambda :
       return run_params.getKokkosDataSpace();

@@ -235,9 +272,84 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const
   }
 }

-DataSpace KernelBase::getHostAccessibleDataSpace(VariantID vid) const
+DataSpace KernelBase::getMPIDataSpace(VariantID vid) const
+{
+  switch ( vid ) {
+
+    case Base_Seq :
+    case Lambda_Seq :
+    case RAJA_Seq :
+      return run_params.getSeqMPIDataSpace();
+
+    case Base_OpenMP :
+    case Lambda_OpenMP :
+    case RAJA_OpenMP :
+      return run_params.getOmpMPIDataSpace();
+
+    case Base_OpenMPTarget :
+    case RAJA_OpenMPTarget :
+      return run_params.getOmpTargetMPIDataSpace();
+
+    case Base_CUDA :
+    case Lambda_CUDA :
+    case RAJA_CUDA :
+      return run_params.getCudaMPIDataSpace();
+
+    case Base_HIP :
+    case Lambda_HIP :
+    case RAJA_HIP :
+      return run_params.getHipMPIDataSpace();
+
+    case Base_SYCL :
+    case RAJA_SYCL :
+      return run_params.getSyclMPIDataSpace();
+
+    case Kokkos_Lambda :
+      return run_params.getKokkosMPIDataSpace();
+
+    default:
+      throw std::invalid_argument("getMPIDataSpace : Unknown variant id");
+  }
+}
+
+DataSpace
KernelBase::getReductionDataSpace(VariantID vid) const
 {
-  return hostAccessibleDataSpace(getDataSpace(vid));
+  switch ( vid ) {
+
+    case Base_Seq :
+    case Lambda_Seq :
+    case RAJA_Seq :
+      return run_params.getSeqReductionDataSpace();
+
+    case Base_OpenMP :
+    case Lambda_OpenMP :
+    case RAJA_OpenMP :
+      return run_params.getOmpReductionDataSpace();
+
+    case Base_OpenMPTarget :
+    case RAJA_OpenMPTarget :
+      return run_params.getOmpTargetReductionDataSpace();
+
+    case Base_CUDA :
+    case Lambda_CUDA :
+    case RAJA_CUDA :
+      return run_params.getCudaReductionDataSpace();
+
+    case Base_HIP :
+    case Lambda_HIP :
+    case RAJA_HIP :
+      return run_params.getHipReductionDataSpace();
+
+    case Base_SYCL :
+    case RAJA_SYCL :
+      return run_params.getSyclReductionDataSpace();
+
+    case Kokkos_Lambda :
+      return run_params.getKokkosReductionDataSpace();
+
+    default:
+      throw std::invalid_argument("getReductionDataSpace : Unknown variant id");
+  }
 }

 void KernelBase::execute(VariantID vid, size_t tune_idx)
@@ -339,11 +451,22 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
 #endif
       break;
     }
+
+    case Base_SYCL:
+    case RAJA_SYCL:
+    {
+#if defined(RAJA_ENABLE_SYCL)
+      runSyclVariant(vid, tune_idx);
+#endif
+      break;
+    }
+
     case Kokkos_Lambda :
     {
 #if defined(RUN_KOKKOS)
       runKokkosVariant(vid, tune_idx);
 #endif
+      break;
     }

     default : {
@@ -384,7 +507,9 @@ void KernelBase::print(std::ostream& os) const
   }
   os << "\t\t\t its_per_rep = " << its_per_rep << std::endl;
   os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl;
-  os << "\t\t\t bytes_per_rep = " << bytes_per_rep << std::endl;
+  os << "\t\t\t bytes_read_per_rep = " << bytes_read_per_rep << std::endl;
+  os << "\t\t\t bytes_written_per_rep = " << bytes_written_per_rep << std::endl;
+  os << "\t\t\t bytes_atomic_modify_written_per_rep = " << bytes_atomic_modify_written_per_rep << std::endl;
   os << "\t\t\t FLOPs_per_rep = " << FLOPs_per_rep << std::endl;
   os << "\t\t\t num_exec: " << std::endl;
   for (unsigned j = 0; j < NumVariants; ++j) {
@@ -439,8 +564,16 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
     cali_set_double(Iters_Rep_attr,(double)getItsPerRep());
     cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
     cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
+    cali_set_double(Bytes_Read_Rep_attr,(double)getBytesReadPerRep());
+    cali_set_double(Bytes_Written_Rep_attr,(double)getBytesWrittenPerRep());
+    cali_set_double(Bytes_AtomicModifyWritten_Rep_attr,(double)getBytesAtomicModifyWrittenPerRep());
     cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
     cali_set_double(BlockSize_attr, getBlockSize());
+    for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) {
+      FeatureID fid = static_cast<FeatureID>(i);
+      std::string feature = getFeatureName(fid);
+      cali_set_int(Feature_attrs[feature], usesFeature(fid));
+    }
   }
 }

@@ -454,7 +587,8 @@ void KernelBase::doOnceCaliMetaEnd(VariantID vid, size_t tune_idx)
 void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
                                        std::string tstr,
                                        const std::string& outdir,
-                                       const std::string& addToConfig)
+                                       const std::string& addToSpotConfig,
+                                       const std::string& addToCaliConfig)
 {
   static bool ran_spot_config_check = false;
   bool config_ok = true;
@@ -476,8 +610,21 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
         { "expr": "any(max#Iterations/Rep)", "as": "Iterations/Rep" },
         { "expr": "any(max#Kernels/Rep)", "as": "Kernels/Rep" },
         { "expr": "any(max#Bytes/Rep)", "as": "Bytes/Rep" },
+        { "expr": "any(max#BytesRead/Rep)", "as": "BytesRead/Rep" },
+        { "expr": "any(max#BytesWritten/Rep)", "as":
"BytesWritten/Rep" }, + { "expr": "any(max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" }, { "expr": "any(max#Flops/Rep)", "as": "Flops/Rep" }, - { "expr": "any(max#BlockSize)", "as": "BlockSize" } + { "expr": "any(max#BlockSize)", "as": "BlockSize" }, + { "expr": "any(max#Forall)", "as": "FeatureForall" }, + { "expr": "any(max#Kernel)", "as": "FeatureKernel" }, + { "expr": "any(max#Launch)", "as": "FeatureLaunch" }, + { "expr": "any(max#Sort)", "as": "FeatureSort" }, + { "expr": "any(max#Scan)", "as": "FeatureScan" }, + { "expr": "any(max#Workgroup)", "as": "FeatureWorkgroup" }, + { "expr": "any(max#Reduction)", "as": "FeatureReduction" }, + { "expr": "any(max#Atomic)", "as": "FeatureAtomic" }, + { "expr": "any(max#View)", "as": "FeatureView" }, + { "expr": "any(max#MPI)", "as": "FeatureMPI" } ] }, { @@ -489,21 +636,47 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, { "expr": "any(any#max#Iterations/Rep)", "as": "Iterations/Rep" }, { "expr": "any(any#max#Kernels/Rep)", "as": "Kernels/Rep" }, { "expr": "any(any#max#Bytes/Rep)", "as": "Bytes/Rep" }, + { "expr": "any(any#max#BytesRead/Rep)", "as": "BytesRead/Rep" }, + { "expr": "any(any#max#BytesWritten/Rep)", "as": "BytesWritten/Rep" }, + { "expr": "any(any#max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" }, { "expr": "any(any#max#Flops/Rep)", "as": "Flops/Rep" }, - { "expr": "any(any#max#BlockSize)", "as": "BlockSize" } + { "expr": "any(any#max#BlockSize)", "as": "BlockSize" }, + { "expr": "any(any#max#Forall)", "as": "FeatureForall" }, + { "expr": "any(any#max#Kernel)", "as": "FeatureKernel" }, + { "expr": "any(any#max#Launch)", "as": "FeatureLaunch" }, + { "expr": "any(any#max#Sort)", "as": "FeatureSort" }, + { "expr": "any(any#max#Scan)", "as": "FeatureScan" }, + { "expr": "any(any#max#Workgroup)", "as": "FeatureWorkgroup" }, + { "expr": "any(any#max#Reduction)", "as": "FeatureReduction" }, + { "expr": "any(any#max#Atomic)", "as": "FeatureAtomic" }, + { "expr": "any(any#max#View)", "as": "FeatureView" }, + { "expr": "any(any#max#MPI)", "as": "FeatureMPI" } ] } ] } )json"; - if(!ran_spot_config_check && (!addToConfig.empty())) { + // Skip check if both empty + if ((!addToSpotConfig.empty() || !addToCaliConfig.empty()) && !ran_spot_config_check) { cali::ConfigManager cm; - std::string check_profile = "spot()," + addToConfig; + std::string check_profile; + // If both not empty + if (!addToSpotConfig.empty() && !addToCaliConfig.empty()) { + check_profile = "spot(" + addToSpotConfig + ")," + addToCaliConfig; + } + else if (!addToSpotConfig.empty()) { + check_profile = "spot(" + addToSpotConfig + ")"; + } + // if !addToCaliConfig.empty() + else { + check_profile = addToCaliConfig; + } + std::string msg = cm.check(check_profile.c_str()); if(!msg.empty()) { std::cerr << "Problem with Cali Config: " << check_profile << "\n"; - std::cerr << "Check your command line argument: " << addToConfig << "\n"; + std::cerr << msg << "\n"; config_ok = false; exit(-1); } @@ -519,9 +692,13 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, od = outdir + "/"; } std::string vstr = getVariantName(vid); - std::string profile = "spot(output=" + od + vstr + "-" + tstr + ".cali)"; - if(!addToConfig.empty()) { - profile += "," + addToConfig; + std::string profile = "spot(output=" + od + vstr + "-" + tstr + ".cali"; + if(!addToSpotConfig.empty()) { + profile += "," + addToSpotConfig; + } + profile += ")"; + if (!addToCaliConfig.empty()) { + profile += "," + addToCaliConfig; } std::cout << 
"Profile: " << profile << std::endl; mgr[vid][tstr].add_option_spec(kernel_info_spec); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index b3ce7c3e3..8c7069f3c 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,6 +25,11 @@ #if defined(RAJA_ENABLE_HIP) #include "RAJA/policy/hip/raja_hiperrchk.hpp" #endif +#if defined(RAJA_ENABLE_SYCL) +#include +#endif + +#include "camp/resource.hpp" #include #include @@ -97,7 +102,9 @@ class KernelBase void setDefaultReps(Index_type reps) { default_reps = reps; } void setItsPerRep(Index_type its) { its_per_rep = its; }; void setKernelsPerRep(Index_type nkerns) { kernels_per_rep = nkerns; }; - void setBytesPerRep(Index_type bytes) { bytes_per_rep = bytes;} + void setBytesReadPerRep(Index_type bytes) { bytes_read_per_rep = bytes;} + void setBytesWrittenPerRep(Index_type bytes) { bytes_written_per_rep = bytes;} + void setBytesAtomicModifyWrittenPerRep(Index_type bytes) { bytes_atomic_modify_written_per_rep = bytes;} void setFLOPsPerRep(Index_type FLOPs) { FLOPs_per_rep = FLOPs; } void setBlockSize(Index_type size) { kernel_block_size = size; } @@ -134,6 +141,11 @@ class KernelBase virtual void setKokkosTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } #endif +#if defined(RAJA_ENABLE_SYCL) + virtual void setSyclTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, getDefaultTuningName()); } +#endif + // // Getter methods used to generate kernel execution summary @@ -145,7 +157,10 @@ class KernelBase Index_type getDefaultReps() const { return default_reps; } Index_type getItsPerRep() const { return its_per_rep; }; Index_type getKernelsPerRep() const { return kernels_per_rep; }; - Index_type getBytesPerRep() const { return bytes_per_rep; } + Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count atomic_modify_write operations as a read and a write to match previous counting + Index_type getBytesReadPerRep() const { return bytes_read_per_rep; } + Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep; } + Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; } Index_type getFLOPsPerRep() const { return FLOPs_per_rep; } double getBlockSize() const { return kernel_block_size; } @@ -239,6 +254,17 @@ class KernelBase return camp::resources::Hip::get_default(); } #endif +#if defined(RAJA_ENABLE_SYCL) + camp::resources::Sycl getSyclResource() + { + /* + if (run_params.getGPUStream() == 0) { + return camp::resources::Sycl::SyclFromStream(0); + } + */ + return camp::resources::Sycl::get_default(); + } +#endif void synchronize() { @@ -256,24 +282,54 @@ class KernelBase hipErrchk( hipDeviceSynchronize() ); } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( running_variant == Base_SYCL || + running_variant == RAJA_SYCL ) { + getSyclResource().get_queue()->wait(); + } +#endif + } - int getDataAlignment() const; + Size_type getDataAlignment() const; DataSpace getDataSpace(VariantID vid) const; - DataSpace getHostAccessibleDataSpace(VariantID vid) const; + DataSpace getReductionDataSpace(VariantID vid) const; + 
DataSpace getMPIDataSpace(VariantID vid) const;

   template <typename T>
-  void allocData(DataSpace dataSpace, T& ptr, int len)
+  void allocData(DataSpace dataSpace, T& ptr, Size_type len)
   {
     rajaperf::allocData(dataSpace, ptr, len, getDataAlignment());
   }
+  template <typename T>
+  void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len)
+  {
+    rajaperf::allocAndInitData(dataSpace,
+                               ptr, len, getDataAlignment());
+  }
+
+  template <typename T>
+  void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, T val)
+  {
+    rajaperf::allocAndInitDataConst(dataSpace,
+                                    ptr, len, getDataAlignment(), val);
+  }
+
+  template <typename T>
+  rajaperf::AutoDataMover<T> scopedMoveData(DataSpace dataSpace, T*& ptr, Size_type len)
+  {
+    DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace);
+    rajaperf::moveData(hds, dataSpace, ptr, len, getDataAlignment());
+    return {dataSpace, hds, ptr, len, getDataAlignment()};
+  }
+
   template <typename T>
   void copyData(DataSpace dst_dataSpace, T* dst_ptr,
                 DataSpace src_dataSpace, const T* src_ptr,
-                int len)
+                Size_type len)
   {
     rajaperf::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, len);
   }
@@ -285,46 +341,47 @@ class KernelBase
   }

   template <typename T>
-  void allocData(T*& ptr, int len, VariantID vid)
+  void allocData(T*& ptr, Size_type len, VariantID vid)
   {
     rajaperf::allocData(getDataSpace(vid), ptr, len, getDataAlignment());
   }
   template <typename T>
-  void allocAndInitData(T*& ptr, int len, VariantID vid)
+  void allocAndInitData(T*& ptr, Size_type len, VariantID vid)
   {
     rajaperf::allocAndInitData(getDataSpace(vid), ptr, len, getDataAlignment());
   }
   template <typename T>
-  void allocAndInitDataConst(T*& ptr, int len, T val, VariantID vid)
+  void allocAndInitDataConst(T*& ptr, Size_type len, T val, VariantID vid)
   {
     rajaperf::allocAndInitDataConst(getDataSpace(vid), ptr, len, getDataAlignment(), val);
   }
   template <typename T>
-  void allocAndInitDataRandSign(T*& ptr, int len, VariantID vid)
+  void allocAndInitDataRandSign(T*& ptr, Size_type len, VariantID vid)
   {
     rajaperf::allocAndInitDataRandSign(getDataSpace(vid), ptr, len, getDataAlignment());
   }
   template <typename T>
-  void allocAndInitDataRandValue(T*& ptr, int len, VariantID vid)
+  void allocAndInitDataRandValue(T*& ptr, Size_type len, VariantID vid)
   {
     rajaperf::allocAndInitDataRandValue(getDataSpace(vid), ptr, len, getDataAlignment());
   }
   template <typename T>
-  rajaperf::AutoDataMover<T> scopedMoveData(T*& ptr, int len, VariantID vid)
+  rajaperf::AutoDataMover<T> scopedMoveData(T*& ptr, Size_type len, VariantID vid)
   {
-    rajaperf::moveData(getHostAccessibleDataSpace(vid), getDataSpace(vid),
-                       ptr, len, getDataAlignment());
-    return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()};
+    DataSpace ds = getDataSpace(vid);
+    DataSpace hds = rajaperf::hostCopyDataSpace(ds);
+    rajaperf::moveData(hds, ds, ptr, len, getDataAlignment());
+    return {ds, hds, ptr, len, getDataAlignment()};
   }
   template <typename T>
@@ -341,14 +398,21 @@ class KernelBase
   }

   template <typename T>
-  long double calcChecksum(T* ptr, int len, VariantID vid)
+  long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, VariantID RAJAPERF_UNUSED_ARG(vid))
+  {
+    return rajaperf::calcChecksum(dataSpace,
+                                  ptr, len, getDataAlignment(), 1.0);
+  }
+
+  template <typename T>
+  long double calcChecksum(T* ptr, Size_type len, VariantID vid)
   {
     return rajaperf::calcChecksum(getDataSpace(vid),
                                   ptr, len, getDataAlignment(), 1.0);
   }

   template <typename T>
-  long double calcChecksum(T* ptr, int len, Real_type scale_factor, VariantID vid)
+  long double calcChecksum(T* ptr, Size_type len, Real_type scale_factor, VariantID vid)
   {
     return rajaperf::calcChecksum(getDataSpace(vid),
                                   ptr, len, getDataAlignment(),
                                   scale_factor);
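
The scopedMoveData overloads above return an AutoDataMover so the data migrates back when the caller's scope closes. A condensed, self-contained model of that RAII contract (types here are toy stand-ins, not the suite's):

#include <cstddef>
#include <functional>

// Toy stand-in for rajaperf::AutoDataMover: stage data to a host-copy space on
// construction, restore it to the original space on destruction.
class ScopedMover {
public:
  using MoveFn = std::function<void(int /*dst*/, int /*src*/, double*&, std::size_t)>;
  ScopedMover(int data_space, int host_space, double*& ptr, std::size_t len, MoveFn move)
    : ds_(data_space), hds_(host_space), ptr_(ptr), len_(len), move_(std::move(move))
  { move_(hds_, ds_, ptr_, len_); }                // like moveData(hds, ds, ptr, len, align)
  ~ScopedMover() { move_(ds_, hds_, ptr_, len_); } // move back when the scope closes
private:
  int ds_, hds_; double*& ptr_; std::size_t len_; MoveFn move_;
};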
@@ -406,6 +470,13 @@ class KernelBase
   virtual void runOpenMPTargetVariant(VariantID vid, size_t tune_idx) = 0;
 #endif

+#if defined(RAJA_ENABLE_SYCL)
+  virtual void runSyclVariant(VariantID vid, size_t tune_idx)
+  {
+    getCout() << "\n KernelBase: Unimplemented Sycl variant id = " << vid << std::endl;
+  }
+#endif
+
 #if defined(RUN_KOKKOS)
   virtual void runKokkosVariant(VariantID vid, size_t tune_idx)
   {
@@ -413,6 +484,7 @@ class KernelBase
   }
 #endif

+
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   void caliperOn() { doCaliperTiming = true; }
   void caliperOff() { doCaliperTiming = false; }
@@ -421,7 +493,8 @@ class KernelBase
   static void setCaliperMgrVariantTuning(VariantID vid,
                            std::string tstr,
                            const std::string& outdir,
-                           const std::string& addToConfig);
+                           const std::string& addToSpotConfig,
+                           const std::string& addToCaliConfig);
   static void setCaliperMgrStart(VariantID vid, std::string tstr) { mgr[vid][tstr].start(); }
   static void setCaliperMgrStop(VariantID vid, std::string tstr) { mgr[vid][tstr].stop(); }
@@ -482,7 +555,9 @@ class KernelBase
   //
   Index_type its_per_rep;
   Index_type kernels_per_rep;
-  Index_type bytes_per_rep;
+  Index_type bytes_read_per_rep;
+  Index_type bytes_written_per_rep;
+  Index_type bytes_atomic_modify_written_per_rep;
   Index_type FLOPs_per_rep;
   double kernel_block_size = nan(""); // Set default value for non GPU kernels
@@ -501,8 +576,12 @@ class KernelBase
   cali_id_t Iters_Rep_attr;
   cali_id_t Kernels_Rep_attr;
   cali_id_t Bytes_Rep_attr;
+  cali_id_t Bytes_Read_Rep_attr;
+  cali_id_t Bytes_Written_Rep_attr;
+  cali_id_t Bytes_AtomicModifyWritten_Rep_attr;
   cali_id_t Flops_Rep_attr;
   cali_id_t BlockSize_attr;
+  std::map<std::string, cali_id_t> Feature_attrs;

   // we need a Caliper Manager object per variant
diff --git a/src/common/KokkosViewUtils.hpp b/src/common/KokkosViewUtils.hpp
index 856fcb6f1..65a700030 100644
--- a/src/common/KokkosViewUtils.hpp
+++ b/src/common/KokkosViewUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp
index b5c98cb97..b6875c7f7 100644
--- a/src/common/OpenMPTargetDataUtils.hpp
+++ b/src/common/OpenMPTargetDataUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -47,7 +47,7 @@ namespace detail
 /*
  * Copy memory len bytes from src to dst.
  */
-inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len,
+inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, Size_type len,
                                  int dst_did, int src_did)
 {
   omp_target_memcpy( dst_ptr, const_cast<void*>(src_ptr), len,
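
A hedged usage sketch for the copy helper above and the alloc/init/get helpers that follow, written against the raw OpenMP runtime calls they wrap (byte counts are explicit here to avoid any element-versus-byte ambiguity; assumes an offloading toolchain):

#include <cstddef>
#include <omp.h>
#include <vector>

void roundTrip(std::vector<double>& host)
{
  int hid = omp_get_initial_device();
  int did = omp_get_default_device();
  std::size_t bytes = host.size() * sizeof(double);

  void* dptr = omp_target_alloc(bytes, did);                   // cf. allocOpenMPDeviceData
  omp_target_memcpy(dptr, host.data(), bytes, 0, 0, did, hid); // cf. initOpenMPDeviceData
  // ... run a target region that reads/writes dptr ...
  omp_target_memcpy(host.data(), dptr, bytes, 0, 0, hid, did); // cf. getOpenMPDeviceData
  omp_target_free(dptr, did);
}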
@@ -58,7 +58,7 @@ inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len,
 * \brief Allocate device data array (dptr) and copy given hptr (host)
 * data to device array.
 */
-inline void* allocOpenMPDeviceData(size_t len,
+inline void* allocOpenMPDeviceData(Size_type len,
                                    int did = getOpenMPTargetDevice())
 {
   return omp_target_alloc( len, did);
@@ -83,7 +83,7 @@ inline void deallocOpenMPDeviceData(void* dptr,
 * and of proper size for copy operation to succeed.
 */
 template <typename T>
-void initOpenMPDeviceData(T* dptr, const T* hptr, int len,
+void initOpenMPDeviceData(T* dptr, const T* hptr, Size_type len,
                           int did = getOpenMPTargetDevice(),
                           int hid = getOpenMPTargetHost())
 {
@@ -97,7 +97,7 @@ void initOpenMPDeviceData(T* dptr, const T* hptr, int len,
 * and of proper size for copy operation to succeed.
 */
 template <typename T>
-void getOpenMPDeviceData(T* hptr, const T* dptr, int len,
+void getOpenMPDeviceData(T* hptr, const T* dptr, Size_type len,
                          int hid = getOpenMPTargetHost(),
                          int did = getOpenMPTargetDevice())
 {
diff --git a/src/common/OutputUtils.cpp b/src/common/OutputUtils.cpp
index 87648a545..fbd7f3653 100644
--- a/src/common/OutputUtils.cpp
+++ b/src/common/OutputUtils.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/common/OutputUtils.hpp b/src/common/OutputUtils.hpp
index 5641401e9..197721133 100644
--- a/src/common/OutputUtils.hpp
+++ b/src/common/OutputUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp
index 085c058c4..c55dd83bd 100644
--- a/src/common/RAJAPerfSuite.cpp
+++ b/src/common/RAJAPerfSuite.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -35,6 +35,7 @@
 #include "basic/REDUCE3_INT.hpp"
 #include "basic/REDUCE_STRUCT.hpp"
 #include "basic/TRAP_INT.hpp"
+#include "basic/MULTI_REDUCE.hpp"

 //
 // Lcals kernels...
@@ -86,12 +87,11 @@
 #include "apps/EDGE3D.hpp"
 #include "apps/ENERGY.hpp"
 #include "apps/FIR.hpp"
-#include "apps/HALOEXCHANGE.hpp"
-#include "apps/HALOEXCHANGE_FUSED.hpp"
 #include "apps/LTIMES.hpp"
 #include "apps/LTIMES_NOVIEW.hpp"
 #include "apps/MASS3DEA.hpp"
 #include "apps/MASS3DPA.hpp"
+#include "apps/MATVEC_3D_STENCIL.hpp"
 #include "apps/NODAL_ACCUMULATION_3D.hpp"
 #include "apps/PRESSURE.hpp"
 #include "apps/VOL3D.hpp"
@@ -106,6 +106,19 @@
 #include "algorithm/REDUCE_SUM.hpp"
 #include "algorithm/MEMSET.hpp"
 #include "algorithm/MEMCPY.hpp"
+#include "algorithm/ATOMIC.hpp"
+#include "algorithm/HISTOGRAM.hpp"
+
+//
+// Comm kernels...
+// +#include "comm/HALO_PACKING.hpp" +#include "comm/HALO_PACKING_FUSED.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "comm/HALO_SENDRECV.hpp" +#include "comm/HALO_EXCHANGE.hpp" +#include "comm/HALO_EXCHANGE_FUSED.hpp" +#endif #include @@ -133,6 +146,7 @@ static const std::string GroupNames [] = std::string("Stream"), std::string("Apps"), std::string("Algorithm"), + std::string("Comm"), std::string("Unknown Group") // Keep this at the end and DO NOT remove.... @@ -175,6 +189,7 @@ static const std::string KernelNames [] = std::string("Basic_REDUCE3_INT"), std::string("Basic_REDUCE_STRUCT"), std::string("Basic_TRAP_INT"), + std::string("Basic_MULTI_REDUCE"), // // Lcals kernels... @@ -226,12 +241,11 @@ static const std::string KernelNames [] = std::string("Apps_EDGE3D"), std::string("Apps_ENERGY"), std::string("Apps_FIR"), - std::string("Apps_HALOEXCHANGE"), - std::string("Apps_HALOEXCHANGE_FUSED"), std::string("Apps_LTIMES"), std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DEA"), std::string("Apps_MASS3DPA"), + std::string("Apps_MATVEC_3D_STENCIL"), std::string("Apps_NODAL_ACCUMULATION_3D"), std::string("Apps_PRESSURE"), std::string("Apps_VOL3D"), @@ -246,6 +260,19 @@ static const std::string KernelNames [] = std::string("Algorithm_REDUCE_SUM"), std::string("Algorithm_MEMSET"), std::string("Algorithm_MEMCPY"), + std::string("Algorithm_ATOMIC"), + std::string("Algorithm_HISTOGRAM"), + +// +// Comm kernels... +// + std::string("Comm_HALO_PACKING"), + std::string("Comm_HALO_PACKING_FUSED"), +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + std::string("Comm_HALO_SENDRECV"), + std::string("Comm_HALO_EXCHANGE"), + std::string("Comm_HALO_EXCHANGE_FUSED"), +#endif std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... @@ -288,6 +315,9 @@ static const std::string VariantNames [] = std::string("Kokkos_Lambda"), + std::string("Base_SYCL"), + std::string("RAJA_SYCL"), + std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... }; // END VariantNames @@ -321,6 +351,10 @@ static const std::string FeatureNames [] = std::string("View"), +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + std::string("MPI"), +#endif + std::string("Unknown Feature") // Keep this at the end and DO NOT remove.... }; // END FeatureNames @@ -348,6 +382,10 @@ static const std::string DataSpaceNames [] = std::string("CudaPinned"), std::string("CudaManaged"), + std::string("CudaManagedHostPreferred"), + std::string("CudaManagedDevicePreferred"), + std::string("CudaManagedHostPreferredDeviceAccessed"), + std::string("CudaManagedDevicePreferredHostAccessed"), std::string("CudaDevice"), std::string("HipHostAdviseFine"), @@ -361,6 +399,14 @@ static const std::string DataSpaceNames [] = std::string("HipDevice"), std::string("HipDeviceFine"), + std::string("SyclPinned"), + std::string("SyclManaged"), + std::string("SyclDevice"), + + std::string("Unknown Memory"), // Keep this at the end and DO NOT remove.... + + std::string("Copy"), + std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... }; // END VariantNames @@ -478,6 +524,13 @@ bool isVariantAvailable(VariantID vid) } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( vid == Base_SYCL || + vid == RAJA_SYCL ) { + ret_val = true; + } +#endif + return ret_val; } @@ -539,6 +592,13 @@ bool isVariantGPU(VariantID vid) } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( vid == Base_SYCL || + vid == RAJA_SYCL ) { + ret_val = true; + } +#endif + return ret_val; } @@ -570,7 +630,7 @@ const std::string& getDataSpaceName(DataSpace ds) /*! 
 *******************************************************************************
 *
- * Return true if the allocate associated with DataSpace enum value is available.
+ * Return true if the allocator associated with DataSpace enum value is available.
 *
 *******************************************************************************
 */
@@ -579,24 +639,37 @@ bool isDataSpaceAvailable(DataSpace dataSpace)
 {
   bool ret_val = false;

   switch (dataSpace) {
-    case DataSpace::Host:
-      ret_val = true; break;
+
+    case DataSpace::Host: {
+      ret_val = true;
+      break;
+    }

 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
-    case DataSpace::Omp:
-      ret_val = true; break;
+    case DataSpace::Omp: {
+      ret_val = true;
+      break;
+    }
 #endif

 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-    case DataSpace::OmpTarget:
-      ret_val = true; break;
+    case DataSpace::OmpTarget: {
+      ret_val = true;
+      break;
+    }
 #endif

 #if defined(RAJA_ENABLE_CUDA)
     case DataSpace::CudaPinned:
     case DataSpace::CudaManaged:
-    case DataSpace::CudaDevice:
-      ret_val = true; break;
+    case DataSpace::CudaManagedHostPreferred:
+    case DataSpace::CudaManagedDevicePreferred:
+    case DataSpace::CudaManagedHostPreferredDeviceAccessed:
+    case DataSpace::CudaManagedDevicePreferredHostAccessed:
+    case DataSpace::CudaDevice: {
+      ret_val = true;
+      break;
+    }
 #endif

 #if defined(RAJA_ENABLE_HIP)
@@ -613,17 +686,57 @@ bool isDataSpaceAvailable(DataSpace dataSpace)
     case DataSpace::HipManagedAdviseCoarse:
 #endif
     case DataSpace::HipDevice:
-    case DataSpace::HipDeviceFine:
-      ret_val = true; break;
+    case DataSpace::HipDeviceFine: {
+      ret_val = true;
+      break;
+    }
 #endif

-    default:
-      ret_val = false; break;
-  }
+#if defined(RAJA_ENABLE_SYCL)
+    case DataSpace::SyclPinned:
+    case DataSpace::SyclManaged:
+    case DataSpace::SyclDevice: {
+      ret_val = true;
+      break;
+    }
+#endif
+
+    default: {
+      ret_val = false;
+      break;
+    }
+
+  } // close switch (dataSpace)

   return ret_val;
 }

+/*!
+ *******************************************************************************
+ *
+ * Return true if the DataSpace enum value is a pseudo DataSpace.
+ *
+ *******************************************************************************
+ */
+bool isPseudoDataSpace(DataSpace dataSpace)
+{
+  bool ret_val = false;
+
+  switch (dataSpace) {
+
+    case DataSpace::Copy: {
+      ret_val = true;
+      break;
+    }
+    default: {
+      ret_val = false;
+      break;
+    }
+
+  }
+
+  return ret_val;
+}

 /*
 *******************************************************************************
@@ -714,6 +827,10 @@ KernelBase* getKernelObject(KernelID kid,
       kernel = new basic::TRAP_INT(run_params);
       break;
    }
+    case Basic_MULTI_REDUCE : {
+      kernel = new basic::MULTI_REDUCE(run_params);
+      break;
+    }

 //
 // Lcals kernels...
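
A caller-side sketch combining the two predicates above; per the RunParams handling later in this diff, pseudo spaces such as Copy are accepted for MPI buffer options even though they are not directly allocatable (usableForMPIBuffers is illustrative, not a suite function; it assumes RAJAPerfSuite.hpp is included):

// A space is usable for an MPI buffer option if it is either a real,
// allocatable space in this build, or a pseudo space like DataSpace::Copy.
inline bool usableForMPIBuffers(DataSpace ds)
{
  return isDataSpaceAvailable(ds) || isPseudoDataSpace(ds);
}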
@@ -871,14 +988,6 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::FIR(run_params); break; } - case Apps_HALOEXCHANGE : { - kernel = new apps::HALOEXCHANGE(run_params); - break; - } - case Apps_HALOEXCHANGE_FUSED : { - kernel = new apps::HALOEXCHANGE_FUSED(run_params); - break; - } case Apps_LTIMES : { kernel = new apps::LTIMES(run_params); break; @@ -895,6 +1004,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::MASS3DPA(run_params); break; } + case Apps_MATVEC_3D_STENCIL : { + kernel = new apps::MATVEC_3D_STENCIL(run_params); + break; + } case Apps_NODAL_ACCUMULATION_3D : { kernel = new apps::NODAL_ACCUMULATION_3D(run_params); break; @@ -939,6 +1052,40 @@ KernelBase* getKernelObject(KernelID kid, kernel = new algorithm::MEMCPY(run_params); break; } + case Algorithm_ATOMIC: { + kernel = new algorithm::ATOMIC(run_params); + break; + } + case Algorithm_HISTOGRAM: { + kernel = new algorithm::HISTOGRAM(run_params); + break; + } + +// +// Comm kernels... +// + case Comm_HALO_PACKING : { + kernel = new comm::HALO_PACKING(run_params); + break; + } + case Comm_HALO_PACKING_FUSED : { + kernel = new comm::HALO_PACKING_FUSED(run_params); + break; + } +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + case Comm_HALO_SENDRECV : { + kernel = new comm::HALO_SENDRECV(run_params); + break; + } + case Comm_HALO_EXCHANGE : { + kernel = new comm::HALO_EXCHANGE(run_params); + break; + } + case Comm_HALO_EXCHANGE_FUSED : { + kernel = new comm::HALO_EXCHANGE_FUSED(run_params); + break; + } +#endif default: { getCout() << "\n Unknown Kernel ID = " << kid << std::endl; @@ -949,6 +1096,7 @@ KernelBase* getKernelObject(KernelID kid, return kernel; } + // subclass of streambuf that ignores overflow // never printing anything to the underlying stream struct NullStream : std::streambuf, std::ostream diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 3270a4090..35a400b32 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,6 +52,7 @@ enum GroupID { Stream, Apps, Algorithm, + Comm, NumGroups // Keep this one last and DO NOT remove (!!) @@ -94,6 +95,7 @@ enum KernelID { Basic_REDUCE3_INT, Basic_REDUCE_STRUCT, Basic_TRAP_INT, + Basic_MULTI_REDUCE, // // Lcals kernels... @@ -145,12 +147,11 @@ enum KernelID { Apps_EDGE3D, Apps_ENERGY, Apps_FIR, - Apps_HALOEXCHANGE, - Apps_HALOEXCHANGE_FUSED, Apps_LTIMES, Apps_LTIMES_NOVIEW, Apps_MASS3DEA, Apps_MASS3DPA, + Apps_MATVEC_3D_STENCIL, Apps_NODAL_ACCUMULATION_3D, Apps_PRESSURE, Apps_VOL3D, @@ -165,6 +166,19 @@ enum KernelID { Algorithm_REDUCE_SUM, Algorithm_MEMSET, Algorithm_MEMCPY, + Algorithm_ATOMIC, + Algorithm_HISTOGRAM, + +// +// Comm kernels... +// + Comm_HALO_PACKING, + Comm_HALO_PACKING_FUSED, +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + Comm_HALO_SENDRECV, + Comm_HALO_EXCHANGE, + Comm_HALO_EXCHANGE_FUSED, +#endif NumKernels // Keep this one last and NEVER comment out (!!) @@ -206,6 +220,9 @@ enum VariantID { Kokkos_Lambda, + Base_SYCL, + RAJA_SYCL, + NumVariants // Keep this one last and NEVER comment out (!!) 
}; @@ -238,6 +255,10 @@ enum FeatureID { View, +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI, +#endif + NumFeatures // Keep this one last and NEVER comment out (!!) }; @@ -266,6 +287,10 @@ enum struct DataSpace { CudaPinned, CudaManaged, + CudaManagedHostPreferred, + CudaManagedDevicePreferred, + CudaManagedHostPreferredDeviceAccessed, + CudaManagedDevicePreferredHostAccessed, CudaDevice, HipHostAdviseFine, @@ -279,7 +304,15 @@ enum struct DataSpace { HipDevice, HipDeviceFine, - NumSpaces // Keep this one last and NEVER comment out (!!) + SyclPinned, + SyclManaged, + SyclDevice, + + NumSpaces, // Keep this one here and NEVER comment out (!!) + + Copy, + + EndPseudoSpaces // Keep this one last and NEVER comment out (!!) }; @@ -365,12 +398,21 @@ const std::string& getDataSpaceName(DataSpace cd); /*! ******************************************************************************* * - * Return true if the allocate associated with DataSpace enum value is available. + * Return true if the allocator associated with DataSpace enum value is available. * ******************************************************************************* */ bool isDataSpaceAvailable(DataSpace dataSpace); +/*! + ******************************************************************************* + * + * Return true if the DataSpace enum value is a pseudo DataSpace. + * + ******************************************************************************* + */ +bool isPseudoDataSpace(DataSpace dataSpace); + /*! ******************************************************************************* * diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index b86f6b7b6..9ec2566eb 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -61,6 +61,16 @@ using Index_type = RAJA::Index_type; using Index_ptr = Index_type*; +/*! + ****************************************************************************** + * + * \brief Type used for sizing allocations. + * + ****************************************************************************** + */ +using Size_type = size_t; + + /*! ****************************************************************************** * @@ -95,10 +105,14 @@ using Checksum_type = long double; #if defined(RP_USE_DOUBLE) /// using Real_type = double; +/// +#define Real_MPI_type MPI_DOUBLE #elif defined(RP_USE_FLOAT) /// using Real_type = float; +/// +#define Real_MPI_type MPI_FLOAT #else #error Real_type is undefined! diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 96e6821a8..1665783a9 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -38,12 +38,27 @@ RunParams::RunParams(int argc, char** argv) size(0.0), size_factor(0.0), data_alignment(RAJA::DATA_ALIGN), + multi_reduce_num_bins(10), + multi_reduce_bin_assignment_algorithm(BinAssignmentAlgorithm::RunsRandomSizes), + ltimes_num_d(64), + ltimes_num_g(32), + ltimes_num_m(25), + array_of_ptrs_array_size(ARRAY_OF_PTRS_MAX_ARRAY_SIZE), + halo_width(1), + halo_num_vars(3), gpu_stream(1), gpu_block_sizes(), + atomic_replications(), + items_per_threads(), + mpi_size(1), + mpi_rank(0), + mpi_3d_division({-1, -1, -1}), pf_tol(0.1), checkrun_reps(1), reference_variant(), reference_vid(NumVariants), + warmup_kernel_input(), + invalid_warmup_kernel_input(), kernel_input(), invalid_kernel_input(), exclude_kernel_input(), @@ -115,11 +130,37 @@ void RunParams::print(std::ostream& str) const str << "\n size = " << size; str << "\n size_factor = " << size_factor; str << "\n data_alignment = " << data_alignment; + + str << "\n multi_reduce_num_bins = " << multi_reduce_num_bins; + str << "\n multi_reduce_bin_assignment_algorithm = " << BinAssignmentAlgorithmToStr(multi_reduce_bin_assignment_algorithm); + + str << "\n ltimes_num_d = " << ltimes_num_d; + str << "\n ltimes_num_g = " << ltimes_num_g; + str << "\n ltimes_num_m = " << ltimes_num_m; + + str << "\n array_of_ptrs_array_size = " << array_of_ptrs_array_size; + + str << "\n halo_width = " << halo_width; + str << "\n halo_num_vars = " << halo_num_vars; + str << "\n gpu stream = " << ((gpu_stream == 0) ? "0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { str << "\n\t" << gpu_block_sizes[j]; } + str << "\n atomic_replications = "; + for (size_t j = 0; j < atomic_replications.size(); ++j) { + str << "\n\t" << atomic_replications[j]; + } + str << "\n items_per_threads = "; + for (size_t j = 0; j < items_per_threads.size(); ++j) { + str << "\n\t" << items_per_threads[j]; + } + str << "\n mpi_size = " << mpi_size; + str << "\n mpi_3d_division = "; + for (size_t j = 0; j < 3; ++j) { + str << "\n\t" << mpi_3d_division[j]; + } str << "\n pf_tol = " << pf_tol; str << "\n checkrun_reps = " << checkrun_reps; str << "\n reference_variant = " << reference_variant; @@ -140,6 +181,30 @@ void RunParams::print(std::ostream& str) const str << "\n cuda data space = " << getDataSpaceName(cudaDataSpace); str << "\n hip data space = " << getDataSpaceName(hipDataSpace); str << "\n kokkos data space = " << getDataSpaceName(kokkosDataSpace); + str << "\n sycl data space = " << getDataSpaceName(syclDataSpace); + + str << "\n seq reduction data space = " << getDataSpaceName(seqReductionDataSpace); + str << "\n omp reduction data space = " << getDataSpaceName(ompReductionDataSpace); + str << "\n omp target reduction data space = " << getDataSpaceName(ompTargetReductionDataSpace); + str << "\n cuda reduction data space = " << getDataSpaceName(cudaReductionDataSpace); + str << "\n hip reduction data space = " << getDataSpaceName(hipReductionDataSpace); + str << "\n kokkos reduction data space = " << getDataSpaceName(kokkosReductionDataSpace); + + str << "\n seq MPI data space = " << getDataSpaceName(seqMPIDataSpace); + str << "\n omp MPI data space = " << getDataSpaceName(ompMPIDataSpace); + str << "\n omp target MPI data space = " << getDataSpaceName(ompTargetMPIDataSpace); + str << "\n cuda MPI data space = " << getDataSpaceName(cudaMPIDataSpace); + str << "\n hip MPI data space = " << getDataSpaceName(hipMPIDataSpace); + str << "\n kokkos MPI data space = " << getDataSpaceName(kokkosMPIDataSpace); + 
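
The per-backend reduction and MPI spaces printed above suggest RunParams stores one DataSpace member per backend behind a trivial getter; a hedged shape sketch (member names appear in the option parsing later in this diff, getter names in KernelBase::getReductionDataSpace, but this class body is inferred, not copied from RunParams.hpp):

// Hedged sketch of the accessor pattern; defaults follow the help text below
// (Host for sequential, CudaManagedDevicePreferredHostAccessed for CUDA).
class RunParamsLike {
public:
  DataSpace getSeqReductionDataSpace() const  { return seqReductionDataSpace; }
  DataSpace getCudaReductionDataSpace() const { return cudaReductionDataSpace; }
private:
  DataSpace seqReductionDataSpace  = DataSpace::Host;
  DataSpace cudaReductionDataSpace = DataSpace::CudaManagedDevicePreferredHostAccessed;
};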
+ str << "\n warmup_kernel_input = "; + for (size_t j = 0; j < warmup_kernel_input.size(); ++j) { + str << "\n\t" << warmup_kernel_input[j]; + } + str << "\n invalid_warmup_kernel_input = "; + for (size_t j = 0; j < invalid_warmup_kernel_input.size(); ++j) { + str << "\n\t" << invalid_warmup_kernel_input[j]; + } str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { @@ -232,6 +297,11 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) { getCout() << "\n\nReading command line input..." << std::endl; +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); +#endif + for (int i = 1; i < argc; ++i) { std::string opt(argv[i]); @@ -403,6 +473,192 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--multi_reduce_num_bins") ) { + + i++; + if ( i < argc ) { + long long num_bins = ::atoll( argv[i] ); + long long min_num_bins = 1; + if ( num_bins < min_num_bins ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num_bins + << std::endl; + input_state = BadInput; + } else { + multi_reduce_num_bins = num_bins; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--multi_reduce_bin_assignment_algorithm") ) { + + i++; + if ( i < argc ) { + + std::string bin_assignment_algorithm_name(argv[i]); + + if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Random)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::Random; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsRandomSizes)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::RunsRandomSizes; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsEvenSizes)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::RunsEvenSizes; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Single)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::Single; + } else { + getCout() << "\nBad input:" + << " must give " << opt << " one of the following values\n" + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Random) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsRandomSizes) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsEvenSizes) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Single) + << std::endl; + input_state = BadInput; + invalid_npasses_combiner_input.emplace_back(bin_assignment_algorithm_name); + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (string)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--ltimes_num_d") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_d = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--ltimes_num_g") ) { + + i++; + if ( i < argc ) { + long long num = 
::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_g = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--ltimes_num_m") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_m = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--array_of_ptrs_array_size") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + long long max_num = ARRAY_OF_PTRS_MAX_ARRAY_SIZE; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else if ( num > max_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at most " << max_num + << std::endl; + input_state = BadInput; + } else { + array_of_ptrs_array_size = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--halo_width") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + halo_width = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--halo_num_vars") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + halo_num_vars = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -438,6 +694,99 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--atomic_replication") ) { + + bool got_someting = false; + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + got_someting = true; + int atomic_replication = ::atoi( opt.c_str() ); + if ( atomic_replication <= 0 ) { + getCout() << "\nBad input:" + << " must give --atomic_replication POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else { + atomic_replications.push_back(atomic_replication); + } + ++i; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give --atomic_replication one or more values (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--items_per_thread") ) { + + bool got_someting = false; + bool done = 
false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + got_someting = true; + int items_per_thread = ::atoi( opt.c_str() ); + if ( items_per_thread <= 0 ) { + getCout() << "\nBad input:" + << " must give --items_per_thread POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else { + items_per_threads.push_back(items_per_thread); + } + ++i; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give --items_per_thread one or more values (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--mpi_3d_division") ) { + + int num_got = 0; + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + num_got += 1; + int number = ::atoi( opt.c_str() ); + if ( number <= 0 ) { + getCout() << "\nBad input:" + << " must give --mpi_3d_division POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else if (num_got <= 3) { + mpi_3d_division[num_got-1] = number; + } + ++i; + } + } + if (num_got != 3) { + getCout() << "\nBad input:" + << " must give --mpi_3d_division three values (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--pass-fail-tol") || opt == std::string("-pftol") ) { @@ -451,6 +800,22 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--warmup-kernels") || + opt == std::string("-wk") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + warmup_kernel_input.push_back(opt); + ++i; + } + } + } else if ( opt == std::string("--kernels") || opt == std::string("-k") ) { @@ -525,11 +890,28 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) opt == std::string("-cds") || opt == std::string("--hip-data-space") || opt == std::string("-hds") || + opt == std::string("--sycl-data-space") || + opt == std::string("-syds") || opt == std::string("--kokkos-data-space") || - opt == std::string("-kds") ) { + opt == std::string("-kds") || + opt == std::string("--seq-reduction-data-space") || + opt == std::string("--omp-reduction-data-space") || + opt == std::string("--omptarget-reduction-data-space") || + opt == std::string("--cuda-reduction-data-space") || + opt == std::string("--hip-reduction-data-space") || + opt == std::string("--sycl-reduction-data-space") || + opt == std::string("--kokkos-reduction-data-space") || + opt == std::string("--seq-mpi-data-space") || + opt == std::string("--omp-mpi-data-space") || + opt == std::string("--omptarget-mpi-data-space") || + opt == std::string("--cuda-mpi-data-space") || + opt == std::string("--hip-mpi-data-space") || + opt == std::string("--sycl-mpi-data-space") || + opt == std::string("--kokkos-mpi-data-space") ) { bool got_someting = false; bool got_something_available = false; + bool got_something_pseudo = false; i++; if ( i < argc ) { auto opt_name = std::move(opt); @@ -537,11 +919,12 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( opt.at(0) == '-' ) { i--; } else { - for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { + for (int ids = 0; ids < static_cast(DataSpace::EndPseudoSpaces); ++ids) { DataSpace ds = static_cast(ids); if (getDataSpaceName(ds) == opt) { got_someting = true; got_something_available = isDataSpaceAvailable(ds); + 
got_something_pseudo = isPseudoDataSpace(ds); if ( opt_name == std::string("--seq-data-space") || opt_name == std::string("-sds") ) { seqDataSpace = ds; @@ -557,9 +940,47 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt_name == std::string("--hip-data-space") || opt_name == std::string("-hds") ) { hipDataSpace = ds; + } else if ( opt_name == std::string("--sycl-data-space") || + opt_name == std::string("-syds") ) { + syclDataSpace = ds; } else if ( opt_name == std::string("--kokkos-data-space") || opt_name == std::string("-kds") ) { kokkosDataSpace = ds; + } else if ( opt_name == std::string("--seq-reduction-data-space") ) { + seqReductionDataSpace = ds; + } else if ( opt_name == std::string("--omp-reduction-data-space") ) { + ompReductionDataSpace = ds; + } else if ( opt_name == std::string("--omptarget-reduction-data-space") ) { + ompTargetReductionDataSpace = ds; + } else if ( opt_name == std::string("--cuda-reduction-data-space") ) { + cudaReductionDataSpace = ds; + } else if ( opt_name == std::string("--hip-reduction-data-space") ) { + hipReductionDataSpace = ds; + } else if ( opt_name == std::string("--sycl-reduction-data-space") ) { + syclReductionDataSpace = ds; + } else if ( opt_name == std::string("--kokkos-reduction-data-space") ) { + kokkosReductionDataSpace = ds; + } else if ( opt_name == std::string("--seq-mpi-data-space") ) { + seqMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--omp-mpi-data-space") ) { + ompMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--omptarget-mpi-data-space") ) { + ompTargetMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--cuda-mpi-data-space") ) { + cudaMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--hip-mpi-data-space") ) { + hipMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--sycl-mpi-data-space") ) { + syclMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--kokkos-mpi-data-space") ) { + kokkosMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; } else { got_someting = false; } @@ -580,6 +1001,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } } + } else if ( std::string(argv[i]) == std::string("--tunings") || std::string(argv[i]) == std::string("-t") ) { @@ -727,6 +1149,17 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) add_to_spot_config = std::string( argv[i] ); } } + } else if ( std::string(argv[i]) == std::string("--add-to-cali-config") || + std::string(argv[i]) == std::string("-atcc") ) { + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + add_to_cali_config = std::string( argv[i] ); + } + } #endif } else { @@ -739,6 +1172,10 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } + if (input_state == InfoRequest) { + break; + } + } // Default size and size_meaning if unset @@ -747,6 +1184,55 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) size_factor = 1.0; } +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + + // assumes number is >= 0 + // 
returns {0} if number is 0
+  //         {1} if number is 1
+  //         {prime factors in non-decreasing order} otherwise
+  auto factorize = [](int number) {
+    std::vector<int> prime_factors;
+    int factor = 2;
+    while (factor <= std::sqrt(number)) {
+      int quotient = number / factor;
+      if (quotient * factor == number) {
+        prime_factors.emplace_back(factor);
+        number = quotient;
+      } else {
+        factor++;
+      }
+    }
+    prime_factors.emplace_back(number);
+    return prime_factors;
+  };
+
+  // Uses prime factors to set division
+  // to a relatively square grid
+  auto set_division = [](int* division, const int dims,
+                         std::vector<int> const& prime_factors) {
+    for (int d = 0; d < dims; ++d) {
+      division[d] = 1;
+    }
+
+    for (int factor : prime_factors) {
+
+      int min_d = 0;
+      for (int d = 1; d < dims; ++d) {
+        if (division[d] < division[min_d]) {
+          min_d = d;
+        }
+      }
+
+      division[min_d] *= factor;
+    }
+  };
+
+  if (mpi_3d_division[0] == -1) {
+    std::vector<int> prime_factors = factorize(mpi_size);
+    set_division(mpi_3d_division.data(), 3, prime_factors);
+  }
+#endif
+
   processNpassesCombinerInput();

   processKernelInput();
@@ -755,12 +1241,12 @@
   processTuningInput();

-  if ( input_state != BadInput &&
+  if ( input_state != InfoRequest &&
+       input_state != BadInput &&
        input_state != DryRun &&
-       input_state != CheckRun ) {
+       input_state != CheckRun) {
     input_state = PerfRun;
   }
-
 }
@@ -829,6 +1315,15 @@ void RunParams::printHelpMessage(std::ostream& str) const
   str << "\t --disable-warmup (disable warmup kernels) [Default is run warmup kernels that are relevant to kernels selected to run]\n\n";

+  str << "\t --warmup-kernels, -wk [Default is run warmup kernels that are relevant to kernels selected to run]\n"
+      << "\t (names of individual kernels and/or groups of kernels to warmup)\n"
+      << "\t See '--print-kernels'/'-pk' option for list of valid kernel and group names.\n"
+      << "\t Kernel names are listed as <group name>_<kernel name>.\n";
+  str << "\t\t Examples...\n"
+      << "\t\t --warmup-kernels Polybench (warmup all kernels in Polybench group)\n"
+      << "\t\t -wk INIT3 MULADDSUB (warmup INIT3 and MULADDSUB kernels)\n"
+      << "\t\t -wk INIT3 Apps (warmup INIT3 kernel and all kernels in Apps group)\n\n";
+
   str << "\t --kernels, -k [Default is run all]\n"
      << "\t (names of individual kernels and/or groups of kernels to run)\n"
      << "\t See '--print-kernels'/'-pk' option for list of valid kernel and group names.\n"
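
A standalone check of the mpi_3d_division defaulting heuristic above (the factorize/set_division pair), with the lambdas inlined: 12 ranks factor into {2, 2, 3}, and greedily multiplying the smallest axis yields a 2x2x3 grid.

#include <cmath>
#include <iostream>
#include <vector>

int main()
{
  int number = 12;                       // stand-in for mpi_size
  std::vector<int> factors;
  int factor = 2;
  while (factor <= std::sqrt(number)) {  // same loop shape as factorize above
    if (number % factor == 0) { factors.push_back(factor); number /= factor; }
    else { ++factor; }
  }
  factors.push_back(number);             // factors == {2, 2, 3}

  int division[3] = {1, 1, 1};
  for (int f : factors) {                // same greedy spread as set_division
    int min_d = 0;
    for (int d = 1; d < 3; ++d) if (division[d] < division[min_d]) min_d = d;
    division[min_d] *= f;
  }
  std::cout << division[0] << "x" << division[1] << "x" << division[2] << "\n"; // 2x2x3
}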
<< "\t GPU kernels not supporting gpu_block_size option will be skipped.\n" - << "\t Behavior depends on kernel implementations and \n" - << "\t values give via CMake variable RAJA_PERFSUITE_GPU_BLOCKSIZES.\n"; + << "\t Behavior depends on individual kernel implementations and \n" + << "\t compile configuration values given via CMake variable \n" + << "\t RAJA_PERFSUITE_GPU_BLOCKSIZES.\n"; str << "\t\t Example...\n" << "\t\t --gpu_block_size 128 256 512 (runs kernels with gpu_block_size 128, 256, and 512)\n\n"; + str << "\t --atomic_replication [no default]\n" + << "\t (atomic replications to run for all GPU kernels)\n" + << "\t GPU kernels not supporting atomic_replication option will be skipped.\n" + << "\t Behavior depends on kernel implementations and \n" + << "\t values give via CMake variable RAJA_PERFSUITE_ATOMIC_REPLICATIONS.\n"; + str << "\t\t Example...\n" + << "\t\t --atomic_replication 128 256 512 (runs kernels with atomic_replication 128, 256, and 512)\n\n"; + + str << "\t --items_per_thread [no default]\n" + << "\t (items per thread to run for all GPU kernels)\n" + << "\t GPU kernels not supporting items_per_thread option will be skipped.\n" + << "\t Behavior depends on kernel implementations and \n" + << "\t values give via CMake variable RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD.\n"; + str << "\t\t Example...\n" + << "\t\t --items_per_thread 128 256 512 (runs kernels with items_per_thread 128, 256, and 512)\n\n"; + + str << "\t --mpi_3d_division [no default]\n" + << "\t (number of mpi ranks in each dimension in a 3d grid)\n" + << "\t (3D MPI kernels will be skipped if the product of mpi_3d_division is not equal to the number of ranks)\n"; + str << "\t\t Example...\n" + << "\t\t --mpi_3d_division 2 3 5 (runs 3d MPI kernels on a 2 by 3 by 5 grid)\n\n"; + str << "\t --tunings, -t [Default is run all]\n" << "\t (names of tunings to run)\n" << "\t Note: knowing which tunings are available requires knowledge about the variants,\n" @@ -937,7 +1456,7 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t -et default library (exclude default and library tunings)\n\n"; str << "\t Options for selecting kernel data used in kernels....\n" - << "\t ======================================================\n\n";; + << "\t ======================================================\n\n"; str << "\t --data_alignment, -align [default is RAJA::DATA_ALIGN]\n" << "\t (minimum memory alignment for host allocations)\n" @@ -945,6 +1464,55 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t -align 4096 (allocates memory aligned to 4KiB boundaries)\n\n"; + str << "\t --multi_reduce_num_bins [default is 10]\n" + << "\t (number of bins used in multi-reduce kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --multi_reduce_num_bins 100\n\n"; + + str << "\t --multi_reduce_bin_assignment_algorithm [default is RunsRandomSizes]\n" + << "\t (algorithm used to assign bins to iterates in multi-reduce kernels)\n" + << "\t Valid assignment algorithm names are 'Random', 'RunsRandomSizes', 'RunsEvenSizes', or 'Single'\n"; + str << "\t\t Example...\n" + << "\t\t --multi_reduce_bin_assignment_algorithm Random\n\n"; + + str << "\t --ltimes_num_d [default is 64]\n" + << "\t (num_d used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --ltimes_num_d 32\n\n"; + + str << "\t --ltimes_num_g [default is 32]\n" + << "\t (num_g used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + 
str << "\t\t Example...\n" + << "\t\t --ltimes_num_g 64\n\n"; + + str << "\t --ltimes_num_m [default is 25]\n" + << "\t (num_m used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --ltimes_num_m 100\n\n"; + + str << "\t --array_of_ptrs_array_size [default is " << ARRAY_OF_PTRS_MAX_ARRAY_SIZE << "]\n" + << "\t (array size used in ARRAY_OF_PTRS kernel)\n" + << "\t Must be greater than 0.\n" + << "\t Must be less than or equal to " << ARRAY_OF_PTRS_MAX_ARRAY_SIZE << ".\n"; + str << "\t\t Example...\n" + << "\t\t --array_of_ptrs_array_size 4\n\n"; + + str << "\t --halo_width [default is 1]\n" + << "\t (halo width used in halo kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --halo_width 2\n\n"; + + str << "\t --halo_num_vars [default is 3]\n" + << "\t (num vars used in halo kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --halo_num_vars 10\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; @@ -980,17 +1548,105 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t --hip-data-space HipManaged (run HIP variants with Hip Managed memory)\n" << "\t\t -hds HipPinned (run HIP variants with Hip Pinned memory)\n\n"; + str << "\t --sycl-data-space, -syds [Default is SyclDevice]\n" + << "\t (names of data space to use for SYCL variants)\n" + << "\t Valid data space names are 'SyclDevice', 'SyclPinned', or 'SyclManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --sycl-data-space SyclManaged (run SYCL variants with Sycl Managed memory)\n" + << "\t\t -syds SyclPinned (run SYCL variants with Sycl Pinned memory)\n\n"; + str << "\t --kokkos-data-space, -kds [Default is Host]\n" << "\t (names of data space to use)\n"; str << "\t\t Examples...\n" << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" - << "\t\t -kds HipPinned (run KOKKOS variants with Hip Pinned memory)\n\n"; + << "\t\t -kds HipPinned (run KOKKOS variants with CUDA Pinned memory)\n\n"; + + str << "\t --seq-reduction-data-space [Default is Host]\n" + << "\t (name of data space to use with reductions for sequential variants)\n" + << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; + str << "\t\t Examples...\n" + << "\t\t --seq-reduction-data-space Host (run sequential variants with Host memory)\n\n"; + + str << "\t --omp-reduction-data-space [Default is Omp]\n" + << "\t (names of data space to use with reductions for OpenMP variants)\n" + << "\t Valid data space names are 'Host' or 'Omp'\n"; + str << "\t\t Examples...\n" + << "\t\t --omp-reduction-data-space Omp (run Omp variants with Omp memory)\n\n"; + + str << "\t --omptarget-reduction-data-space [Default is OmpTarget]\n" + << "\t (names of data space to use with reductions for OpenMP Target variants)\n" + << "\t Valid data space names are 'OmpTarget' or 'CudaPinned'\n"; + str << "\t\t Examples...\n" + << "\t\t --omptarget-reduction-data-space OmpTarget (run Omp Target variants with Omp Target memory)\n\n"; + + str << "\t --cuda-reduction-data-space [Default is CudaManagedDevicePreferredHostAccessed]\n" + << "\t (names of data space to use with reductions for CUDA variants)\n" + << "\t Valid data space names are 'CudaDevice', 'CudaPinned', or 'CudaManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --cuda-reduction-data-space CudaManaged (run CUDA variants with Cuda Managed 
memory)\n\n"; + + str << "\t --hip-reduction-data-space [Default is HipDevice]\n" + << "\t (names of data space to use with reductions for HIP variants)\n" + << "\t Valid data space names are 'HipDevice', 'HipPinned', or 'HipManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --hip-reduction-data-space HipManaged (run HIP variants with Hip Managed memory)\n\n"; + + str << "\t --sycl-reduction-data-space [Default is SyclDevice]\n" + << "\t (names of data space to use with reductions for SYCL variants)\n" + << "\t Valid data space names are 'SyclDevice', 'SyclPinned', or 'SyclManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --sycl-reduction-data-space SyclManaged (run SYCL variants with Sycl Managed memory)\n\n"; + + str << "\t --kokkos-reduction-data-space [Default is Host]\n" + << "\t (names of data space to use with reductions)\n"; + str << "\t\t Examples...\n" + << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" + << "\t\t -kds HipPinned (run KOKKOS variants with HIP Pinned memory)\n\n"; + + str << "\t --seq-mpi-data-space [Default is Host]\n" + << "\t (name of data space to use with MPI and sequential execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --seq-mpi-data-space Host (run sequential variants with Host memory for MPI buffers)\n\n"; + + str << "\t --omp-mpi-data-space [Default is Omp]\n" + << "\t (name of data space to use with MPI and OpenMP execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --omp-mpi-data-space Omp (run Omp variants with Omp memory for MPI buffers)\n\n"; + + str << "\t --omptarget-mpi-data-space [Default is Copy]\n" + << "\t (name of data space to use with MPI and OpenMP target execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --omptarget-mpi-data-space Copy (run Omp Target variants and copy to Host memory for MPI buffers)\n\n"; + + str << "\t --cuda-mpi-data-space [Default is CudaPinned]\n" + << "\t (name of data space to use with MPI and CUDA execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --cuda-mpi-data-space CudaPinned (run CUDA variants with Cuda Pinned memory for MPI buffers)\n\n"; + + str << "\t --hip-mpi-data-space [Default is HipPinned]\n" + << "\t (name of data space to use with MPI and HIP execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --hip-mpi-data-space Copy (run HIP variants and copy to Host memory for MPI buffers)\n\n"; + + str << "\t --sycl-mpi-data-space [Default is SyclPinned]\n" + << "\t (name of data space to use with MPI and SYCL execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --sycl-mpi-data-space Copy (run SYCL variants and copy to Host memory for MPI buffers)\n\n"; + + str << "\t --kokkos-mpi-data-space [Default is Copy]\n" + << "\t (name of data space to use with MPI and kokkos execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --kokkos-mpi-data-space Copy (run KOKKOS variants and copy to Host memory for MPI buffers)\n\n"; #if defined(RAJA_PERFSUITE_USE_CALIPER) str << "\t --add-to-spot-config, -atsc [Default is none]\n" - << "\t\t appends additional parameters to the built-in Caliper spot config\n"; + << "\t\t appends additional parameters to the built-in Caliper spot config (CALI_CONFIG=spot(...))\n"; str << "\t\t Example to include some PAPI counters (Intel arch)\n" << "\t\t -atsc topdown.all\n\n"; + str << "\t --add-to-cali-config, -atcc [Default is none]\n" + << "\t\t include parameters in the Caliper config (same as CALI_CONFIG=...)\n"; + str << "\t\t Example to include time spent in MPI functions\n" + << "\t\t -atcc mpi-report\n\n"; #endif str << 
std::endl; @@ -1034,21 +1690,27 @@ void RunParams::printVariantNames(std::ostream& str) const void RunParams::printDataSpaceNames(std::ostream& str) const { str << "\nAvailable data spaces:"; - str << "\n-------------------\n"; + str << "\n----------------------\n"; for (int ids = 0; ids < static_cast<int>(DataSpace::NumSpaces); ++ids) { DataSpace ds = static_cast<DataSpace>(ids); if (isDataSpaceAvailable(ds)) { str << getDataSpaceName(ds) << std::endl; } } - str << "\nUnavailable data spaces:"; - str << "\n-------------------\n"; + str << "\nUnavailable data spaces in current build configuration:"; + str << "\n-------------------------------------------------------\n"; for (int ids = 0; ids < static_cast<int>(DataSpace::NumSpaces); ++ids) { DataSpace ds = static_cast<DataSpace>(ids); if (!isDataSpaceAvailable(ds)) { str << getDataSpaceName(ds) << std::endl; } } + str << "\nPseudo data spaces:"; + str << "\n-------------------\n"; + for (int ids = static_cast<int>(DataSpace::NumSpaces)+1; ids < static_cast<int>(DataSpace::EndPseudoSpaces); ++ids) { + DataSpace ds = static_cast<DataSpace>(ids); + str << getDataSpaceName(ds) << std::endl; + } str.flush(); } @@ -1316,6 +1978,77 @@ void RunParams::processKernelInput() // // ================================================================ + run_warmup_kernels.clear(); + + if ( !warmup_kernel_input.empty() ) { + + // + // Need to parse input to determine which warmup kernels to run + // + + // Make list copy of warmup kernel name input to manipulate for + // processing potential group names and/or kernel names, next + Slist warmup_kern_names(warmup_kernel_input.begin(), warmup_kernel_input.end()); + + // + // Search warmup_kern_names for matching group names. + // warmup_groups2run will contain names of groups to run. + // + Svector warmup_groups2run; + for (Slist::iterator it = warmup_kern_names.begin(); it != warmup_kern_names.end(); ++it) + { + for (size_t ig = 0; ig < NumGroups; ++ig) { + const std::string& group_name = getGroupName(static_cast<GroupID>(ig)); + if ( group_name == *it ) { + warmup_groups2run.push_back(group_name); + } + } + } + + // + // If group name(s) found in warmup_kern_names, assemble kernels in group(s) + // to run and remove those group name(s) from warmup_kern_names list. + // + for (size_t ig = 0; ig < warmup_groups2run.size(); ++ig) { + const std::string& gname(warmup_groups2run[ig]); + + for (size_t kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast<KernelID>(kid); + if ( getFullKernelName(tkid).find(gname) != std::string::npos && + exclude_kernels.find(tkid) == exclude_kernels.end()) { + run_warmup_kernels.insert(tkid); + } + } + + warmup_kern_names.remove(gname); + } + + // + // Look for matching names of individual kernels in remaining warmup_kern_names. + // + for (Slist::iterator it = warmup_kern_names.begin(); it != warmup_kern_names.end(); ++it) + { + bool found_it = false; + + for (size_t kid = 0; kid < NumKernels && !found_it; ++kid) { + KernelID tkid = static_cast<KernelID>(kid); + if ( getKernelName(tkid) == *it || getFullKernelName(tkid) == *it ) { + if (exclude_kernels.find(tkid) == exclude_kernels.end()) { + run_warmup_kernels.insert(tkid); + } + found_it = true; + } + } + + // Assemble invalid input for output message. 
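+      // (Unmatched names collect in invalid_warmup_kernel_input and, once
+      // parsing finishes, flip input_state to BadInput -- mirroring how
+      // invalid_kernel_input is handled for the --kernels option.)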
+ if ( !found_it ) { + invalid_warmup_kernel_input.push_back(*it); + } + + } // iterate over kernel name input + + } + run_kernels.clear(); if ( kernel_input.empty() && feature_input.empty() ) { @@ -1465,7 +2198,8 @@ void RunParams::processKernelInput() // Set BadInput state based on invalid kernel input // - if ( !(invalid_kernel_input.empty()) || + if ( !(invalid_warmup_kernel_input.empty()) || + !(invalid_kernel_input.empty()) || !(invalid_exclude_kernel_input.empty()) ) { input_state = BadInput; } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index bfa8f8896..46cd78f4f 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,9 +12,15 @@ #include <string> #include <set> #include <vector> +#include <array> #include "RAJAPerfSuite.hpp" +#include "RPTypes.hpp" + + +#define ARRAY_OF_PTRS_MAX_ARRAY_SIZE 26 + namespace rajaperf { @@ -95,6 +101,37 @@ class RunParams { } } + /*! + * \brief Enumeration for the bin assignment algorithm used in multi-reduce kernels + */ + enum struct BinAssignmentAlgorithm : int { + Random, /*!< random bin for each iterate */ + RunsRandomSizes, /*!< each bin in turn is repeated a random number of times, Ex. 6 bins and 10 iterates [ 0 0 1 2 2 2 2 3 3 5] */ + RunsEvenSizes, /*!< each bin in turn is repeated the same number of times, Ex. 6 bins and 10 iterates [ 0 0 1 1 2 2 3 3 4 5] */ + Single /*!< use bin 0 for each iterate */ + }; + + /*! + * \brief Translate BinAssignmentAlgorithm enum value to string + */ + static std::string BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm baa) + { + switch (baa) { + case BinAssignmentAlgorithm::Random: + return "Random"; + case BinAssignmentAlgorithm::RunsRandomSizes: + return "RunsRandomSizes"; + case BinAssignmentAlgorithm::RunsEvenSizes: + return "RunsEvenSizes"; + case BinAssignmentAlgorithm::Single: + return "Single"; + default: + return "Unknown"; + } + } + /*! * \brief Return state of input parsed to this point. 
*/ @@ -119,7 +156,19 @@ class RunParams { double getSizeFactor() const { return size_factor; } - size_t getDataAlignment() const { return data_alignment; } + Size_type getDataAlignment() const { return data_alignment; } + + Index_type getMultiReduceNumBins() const { return multi_reduce_num_bins; } + BinAssignmentAlgorithm getMultiReduceBinAssignmentAlgorithm() const { return multi_reduce_bin_assignment_algorithm; } + + Index_type getLtimesNumD() const { return ltimes_num_d; } + Index_type getLtimesNumG() const { return ltimes_num_g; } + Index_type getLtimesNumM() const { return ltimes_num_m; } + + Index_type getArrayOfPtrsArraySize() const { return array_of_ptrs_array_size; } + + Index_type getHaloWidth() const { return halo_width; } + Index_type getHaloNumVars() const { return halo_num_vars; } int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } @@ -132,6 +181,31 @@ class RunParams { } return false; } + size_t numValidAtomicReplication() const { return atomic_replications.size(); } + bool validAtomicReplication(size_t atomic_replication) const + { + for (size_t valid_atomic_replication : atomic_replications) { + if (valid_atomic_replication == atomic_replication) { + return true; + } + } + return false; + } + size_t numValidItemsPerThread() const { return items_per_threads.size(); } + bool validItemsPerThread(size_t items_per_thread) const + { + for (size_t valid_items_per_thread : items_per_threads) { + if (valid_items_per_thread == items_per_thread) { + return true; + } + } + return false; + } + + int getMPISize() const { return mpi_size; } + int getMPIRank() const { return mpi_rank; } + bool validMPI3DDivision() const { return (mpi_3d_division[0]*mpi_3d_division[1]*mpi_3d_division[2] == mpi_size); } + std::array<int, 3> const& getMPI3DDivision() const { return mpi_3d_division; } DataSpace getSeqDataSpace() const { return seqDataSpace; } DataSpace getOmpDataSpace() const { return ompDataSpace; } @@ -139,6 +213,23 @@ class RunParams { DataSpace getCudaDataSpace() const { return cudaDataSpace; } DataSpace getHipDataSpace() const { return hipDataSpace; } DataSpace getKokkosDataSpace() const { return kokkosDataSpace; } + DataSpace getSyclDataSpace() const { return syclDataSpace; } + + DataSpace getSeqReductionDataSpace() const { return seqReductionDataSpace; } + DataSpace getOmpReductionDataSpace() const { return ompReductionDataSpace; } + DataSpace getOmpTargetReductionDataSpace() const { return ompTargetReductionDataSpace; } + DataSpace getCudaReductionDataSpace() const { return cudaReductionDataSpace; } + DataSpace getHipReductionDataSpace() const { return hipReductionDataSpace; } + DataSpace getSyclReductionDataSpace() const { return syclReductionDataSpace; } + DataSpace getKokkosReductionDataSpace() const { return kokkosReductionDataSpace; } + + DataSpace getSeqMPIDataSpace() const { return seqMPIDataSpace; } + DataSpace getOmpMPIDataSpace() const { return ompMPIDataSpace; } + DataSpace getOmpTargetMPIDataSpace() const { return ompTargetMPIDataSpace; } + DataSpace getCudaMPIDataSpace() const { return cudaMPIDataSpace; } + DataSpace getHipMPIDataSpace() const { return hipMPIDataSpace; } + DataSpace getSyclMPIDataSpace() const { return syclMPIDataSpace; } + DataSpace getKokkosMPIDataSpace() const { return kokkosMPIDataSpace; } double getPFTolerance() const { return pf_tol; } @@ -156,10 +247,12 @@ class RunParams { #if defined(RAJA_PERFSUITE_USE_CALIPER) const std::string& getAddToSpotConfig() const { return add_to_spot_config; } + 
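+  // (The valid*() helpers above share one contract: an empty list from the
+  // command line means "no restriction", otherwise membership is a linear
+  // scan. Every tuning-registration site therefore tests, e.g.,
+  // numValidGPUBlockSize() == 0u || validGPUBlockSize(block_size).)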
const std::string& getAddToCaliperConfig() const { return add_to_cali_config; } #endif bool getDisableWarmup() const { return disable_warmup; } + const std::set<KernelID>& getWarmupKernelIDsToRun() const { return run_warmup_kernels; } const std::set<KernelID>& getKernelIDsToRun() const { return run_kernels; } const std::set<VariantID>& getVariantIDsToRun() const { return run_variants; } VariantID getReferenceVariantID() const { return reference_vid; } @@ -208,10 +301,28 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multiplier (input option) */ - size_t data_alignment; + Size_type data_alignment; + + Index_type multi_reduce_num_bins; /*!< number of bins used in multi reduction kernels (input option) */ + BinAssignmentAlgorithm multi_reduce_bin_assignment_algorithm; /*!< algorithm used to assign bins to iterates used in multi reduction kernels (input option) */ + + Index_type ltimes_num_d; /*!< num_d used in ltimes kernels (input option) */ + Index_type ltimes_num_g; /*!< num_g used in ltimes kernels (input option) */ + Index_type ltimes_num_m; /*!< num_m used in ltimes kernels (input option) */ + + Index_type array_of_ptrs_array_size; /*!< number of pointers used in ARRAY_OF_PTRS kernel (input option) */ + + Index_type halo_width; /*!< halo width used in halo kernels (input option) */ + Index_type halo_num_vars; /*!< num vars used in halo kernels (input option) */ int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector<size_t> gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ + std::vector<size_t> atomic_replications; /*!< Atomic replications for gpu tunings to run (input option) */ + std::vector<size_t> items_per_threads; /*!< Items per thread for gpu tunings to run (input option) */ + + int mpi_size; /*!< Number of MPI ranks */ + int mpi_rank; /*!< Rank of this MPI process */ + std::array<int, 3> mpi_3d_division; /*!< Number of MPI ranks in each dimension of a 3D grid */ double pf_tol; /*!< pct RAJA variant run time can exceed base for each PM case to pass/fail acceptance */ @@ -228,11 +339,30 @@ class RunParams { DataSpace cudaDataSpace = DataSpace::CudaDevice; DataSpace hipDataSpace = DataSpace::HipDevice; DataSpace kokkosDataSpace = DataSpace::Host; + DataSpace syclDataSpace = DataSpace::SyclDevice; + + DataSpace seqReductionDataSpace = DataSpace::Host; + DataSpace ompReductionDataSpace = DataSpace::Omp; + DataSpace ompTargetReductionDataSpace = DataSpace::OmpTarget; + DataSpace cudaReductionDataSpace = DataSpace::CudaManagedDevicePreferredHostAccessed; + DataSpace hipReductionDataSpace = DataSpace::HipDevice; + DataSpace syclReductionDataSpace = DataSpace::SyclDevice; + DataSpace kokkosReductionDataSpace = DataSpace::Host; + + DataSpace seqMPIDataSpace = DataSpace::Host; + DataSpace ompMPIDataSpace = DataSpace::Omp; + DataSpace ompTargetMPIDataSpace = DataSpace::Copy; + DataSpace cudaMPIDataSpace = DataSpace::CudaPinned; + DataSpace hipMPIDataSpace = DataSpace::HipPinned; + DataSpace syclMPIDataSpace = DataSpace::SyclPinned; + DataSpace kokkosMPIDataSpace = DataSpace::Copy; // // Arrays to hold input strings for valid/invalid input. Helpful for // debugging command line args. 
// + std::vector<std::string> warmup_kernel_input; + std::vector<std::string> invalid_warmup_kernel_input; std::vector<std::string> kernel_input; std::vector<std::string> invalid_kernel_input; std::vector<std::string> exclude_kernel_input; @@ -258,10 +388,12 @@ class RunParams { #if defined(RAJA_PERFSUITE_USE_CALIPER) std::string add_to_spot_config; + std::string add_to_cali_config; #endif bool disable_warmup; + std::set<KernelID> run_warmup_kernels; std::set<KernelID> run_kernels; std::set<VariantID> run_variants; diff --git a/src/common/SyclDataUtils.hpp b/src/common/SyclDataUtils.hpp new file mode 100644 index 000000000..e426476c4 --- /dev/null +++ b/src/common/SyclDataUtils.hpp @@ -0,0 +1,157 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Methods for SYCL kernel data allocation, initialization, and deallocation. +/// + + +#ifndef RAJAPerf_SyclDataUtils_HPP +#define RAJAPerf_SyclDataUtils_HPP + +#include "RPTypes.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/GPUUtils.hpp" + +#include <sycl/sycl.hpp> + + +namespace rajaperf +{ + +/*! + * \brief Copy given hptr (host) data to SYCL device (dptr). + * + * Method assumes both host and device data arrays are allocated + * and of proper size for copy operation to succeed. + */ +template <typename T> +void initSyclDeviceData(T& dptr, const T hptr, int len, sycl::queue* qu) +{ + auto e = qu->memcpy( dptr, hptr, + len * sizeof(typename std::remove_pointer<T>::type)); + e.wait(); + + detail::incDataInitCount(); +} + +/*! + * \brief Allocate SYCL device data array (dptr) and copy given hptr (host) + * data to device array. + */ +template <typename T> +void allocAndInitSyclDeviceData(T& dptr, const T hptr, int len, sycl::queue *qu) +{ + dptr = sycl::malloc_device<typename std::remove_pointer<T>::type>(len, *qu); + + initSyclDeviceData(dptr, hptr, len, qu); +} + +/*! + * \brief Copy given dptr (SYCL device) data to host (hptr). + * + * Method assumes both host and device data arrays are allocated + * and of proper size for copy operation to succeed. + */ +template <typename T> +void getSyclDeviceData(T& hptr, const T dptr, int len, sycl::queue *qu) +{ + auto e = qu->memcpy( hptr, dptr, + len * sizeof(typename std::remove_pointer<T>::type)); + e.wait(); +} + +/*! + * \brief Free device data array. + */ +template <typename T> +void deallocSyclDeviceData(T& dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +namespace detail +{ +/* + * Copy memory len bytes from src to dst. + */ +inline void copySyclData(void* dst_ptr, const void* src_ptr, Size_type len, sycl::queue *qu) +{ + auto e = qu->memcpy( dst_ptr, src_ptr, len); + e.wait(); +} + +/*! + * \brief Allocate SYCL device data array (dptr). + */ +inline void* allocSyclDeviceData(Size_type len, sycl::queue *qu) +{ + void* dptr = nullptr; + dptr = sycl::malloc_device(len, *qu); + return dptr; +} + +/*! + * \brief Allocate SYCL managed data array (dptr). + */ +inline void* allocSyclManagedData(Size_type len, sycl::queue *qu) +{ + void* mptr = nullptr; + mptr = sycl::malloc_shared(len, *qu); + return mptr; +} + +/*! + * \brief Allocate SYCL pinned data array (pptr). + */ +inline void* allocSyclPinnedData(Size_type len, sycl::queue *qu) +{ + void* pptr = nullptr; + pptr = sycl::malloc_host(len, *qu); + return pptr; +} + + +/*! + * \brief Free device data array. 
+ */ +inline void deallocSyclDeviceData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +/*! + * \brief Free managed data array. + */ +inline void deallocSyclManagedData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +/*! + * \brief Free pinned data array. + */ +inline void deallocSyclPinnedData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +} // closing brace for detail namespace + +} // closing brace for rajaperf namespace + +#endif // RAJA_ENABLE_SYCL + +#endif // closing endif for header file include guard + diff --git a/src/lcals-kokkos/CMakeLists.txt b/src/lcals-kokkos/CMakeLists.txt index 47e5b48c8..7cb6706e8 100644 --- a/src/lcals-kokkos/CMakeLists.txt +++ b/src/lcals-kokkos/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp index 4c7dd6b39..b8d8311ba 100644 --- a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/EOS-Kokkos.cpp b/src/lcals-kokkos/EOS-Kokkos.cpp index be30c0b60..2046b540d 100644 --- a/src/lcals-kokkos/EOS-Kokkos.cpp +++ b/src/lcals-kokkos/EOS-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp index 02ae5097e..071e2687c 100644 --- a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp index cd2957436..ebc31ddff 100644 --- a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
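The SyclDataUtils.hpp helpers above inline to a short host/device round trip. A minimal standalone sketch of that round trip, assuming a SYCL build (queue setup and kernel omitted for brevity):

#include <sycl/sycl.hpp>
#include <vector>

int main()
{
  sycl::queue q;                       // default device
  const int len = 1024;
  std::vector<double> h(len, 1.0);     // host data

  // what allocAndInitSyclDeviceData does: device alloc + host-to-device copy
  double* d = sycl::malloc_device<double>(len, q);
  q.memcpy(d, h.data(), len * sizeof(double)).wait();

  // ... a kernel would run on d here ...

  // what getSyclDeviceData / deallocSyclDeviceData do: copy back + free
  q.memcpy(h.data(), d, len * sizeof(double)).wait();
  sycl::free(d, q);
  return 0;
}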
// diff --git a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp index b7da76fd0..37b2d0c41 100644 --- a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp index 00960c3aa..8dce97c22 100644 --- a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp +++ b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp index 20e05fde4..a2fdcfd02 100644 --- a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp index 45761b11e..e9b388105 100644 --- a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp index 451e6fe77..7609b3f3c 100644 --- a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp index b2c582790..e5263cf07 100644 --- a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp +++ b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp index ac0943dd8..f0ec388e7 100644 --- a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp +++ b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt index f767bbd0b..6fc819b2b 100644 --- a/src/lcals/CMakeLists.txt +++ b/src/lcals/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -14,65 +14,76 @@ blt_add_library( DIFF_PREDICT-Cuda.cpp DIFF_PREDICT-OMP.cpp DIFF_PREDICT-OMPTarget.cpp + DIFF_PREDICT-Sycl.cpp EOS.cpp EOS-Seq.cpp EOS-Hip.cpp EOS-Cuda.cpp EOS-OMP.cpp EOS-OMPTarget.cpp + EOS-Sycl.cpp FIRST_DIFF.cpp FIRST_DIFF-Seq.cpp FIRST_DIFF-Hip.cpp FIRST_DIFF-Cuda.cpp FIRST_DIFF-OMP.cpp FIRST_DIFF-OMPTarget.cpp + FIRST_DIFF-Sycl.cpp FIRST_MIN.cpp FIRST_MIN-Seq.cpp FIRST_MIN-Hip.cpp FIRST_MIN-Cuda.cpp FIRST_MIN-OMP.cpp FIRST_MIN-OMPTarget.cpp + FIRST_MIN-Sycl.cpp FIRST_SUM.cpp FIRST_SUM-Seq.cpp FIRST_SUM-Hip.cpp FIRST_SUM-Cuda.cpp FIRST_SUM-OMP.cpp FIRST_SUM-OMPTarget.cpp + FIRST_SUM-Sycl.cpp GEN_LIN_RECUR.cpp GEN_LIN_RECUR-Seq.cpp GEN_LIN_RECUR-Hip.cpp GEN_LIN_RECUR-Cuda.cpp GEN_LIN_RECUR-OMP.cpp GEN_LIN_RECUR-OMPTarget.cpp + GEN_LIN_RECUR-Sycl.cpp HYDRO_1D.cpp HYDRO_1D-Seq.cpp HYDRO_1D-Hip.cpp HYDRO_1D-Cuda.cpp HYDRO_1D-OMP.cpp HYDRO_1D-OMPTarget.cpp + HYDRO_1D-Sycl.cpp HYDRO_2D.cpp HYDRO_2D-Seq.cpp HYDRO_2D-Hip.cpp HYDRO_2D-Cuda.cpp HYDRO_2D-OMP.cpp HYDRO_2D-OMPTarget.cpp + HYDRO_2D-Sycl.cpp INT_PREDICT.cpp INT_PREDICT-Seq.cpp INT_PREDICT-Hip.cpp INT_PREDICT-Cuda.cpp INT_PREDICT-OMP.cpp INT_PREDICT-OMPTarget.cpp + INT_PREDICT-Sycl.cpp PLANCKIAN.cpp PLANCKIAN-Seq.cpp PLANCKIAN-Hip.cpp PLANCKIAN-Cuda.cpp PLANCKIAN-OMP.cpp PLANCKIAN-OMPTarget.cpp + PLANCKIAN-Sycl.cpp TRIDIAG_ELIM.cpp TRIDIAG_ELIM-Seq.cpp TRIDIAG_ELIM-Hip.cpp TRIDIAG_ELIM-Cuda.cpp TRIDIAG_ELIM-OMP.cpp TRIDIAG_ELIM-OMPTarget.cpp + TRIDIAG_ELIM-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index c66ca2598..a33f1aecf 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -52,10 +52,11 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - diff_predict<<>>( px, cx, - offset, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (diff_predict), + grid_size, block_size, + shmem, res.get_stream(), + px, cx, offset, iend ); } stopTimer(); diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 7bd49a994..6d25a6f42 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,10 +53,11 @@ void DIFF_PREDICT::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((diff_predict), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), px, cx, - offset, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (diff_predict), + grid_size, block_size, + shmem, res.get_stream(), + px, cx, offset, iend ); } stopTimer(); diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp index 09da23262..6e2110edb 100644 --- a/src/lcals/DIFF_PREDICT-OMP.cpp +++ b/src/lcals/DIFF_PREDICT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index e04b1e07d..3509b6aaa 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp index eae7cda8f..9dcd9a035 100644 --- a/src/lcals/DIFF_PREDICT-Seq.cpp +++ b/src/lcals/DIFF_PREDICT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp new file mode 100644 index 000000000..5ac815671 --- /dev/null +++ b/src/lcals/DIFF_PREDICT-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFF_PREDICT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include <sycl/sycl.hpp> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + +template <size_t work_group_size> +void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DIFF_PREDICT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) + { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + DIFF_PREDICT_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + DIFF_PREDICT_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n DIFF_PREDICT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DIFF_PREDICT, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index b5ddc90e4..40ff30713 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (10*sizeof(Real_type) + 10*sizeof(Real_type)) * getActualProblemSize()); + setBytesReadPerRep( 10*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 10*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(9 * getActualProblemSize()); setUsesFeature(Forall); @@ -50,6 +52,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 3a583381b..7bd77eade 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
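The Base_SYCL variants above all size their nd_range the same way: the loop bound is rounded up to a whole number of work-groups, and the in-kernel `if (i < iend)` guard masks the padded iterations. A standalone restatement of that arithmetic (function name assumed):

#include <cstddef>

// Round iend up to the next multiple of work_group_size, as
// RAJA_DIVIDE_CEILING_INT does in the Base_SYCL variants above.
std::size_t sycl_global_size(std::size_t iend, std::size_t work_group_size)
{
  const std::size_t work_groups = (iend + work_group_size - 1) / work_group_size;
  return work_groups * work_group_size;
}
// e.g. iend = 1000, work_group_size = 256 -> 4 groups -> global size 1024;
// items 1000..1023 fail the bounds check and do nothing.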
// @@ -93,18 +93,24 @@ class DIFF_PREDICT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_px; Real_ptr m_cx; diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index a3583ca53..fafbdef56 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,10 +52,13 @@ void EOS::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - eos<<>>( x, y, z, u, - q, r, t, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (eos), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, u, + q, r, t, + iend ); } stopTimer(); diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 2cbd78891..35c80e320 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,10 +52,13 @@ void EOS::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((eos), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, z, u, - q, r, t, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (eos), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, u, + q, r, t, + iend ); } stopTimer(); diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp index 7ac9cdb8f..88e8e9da1 100644 --- a/src/lcals/EOS-OMP.cpp +++ b/src/lcals/EOS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
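The EOS-Cuda and EOS-Hip hunks above repeat the conversion seen for DIFF_PREDICT: raw chevron/hipLaunchKernelGGL launches plus per-call error checks become a single RPlaunchCudaKernel / RPlaunchHipKernel call. The real helpers live in RAJAPerf's common GPU utilities; the sketch below is only a guess at their shape, inferred from the call sites (kernel, grid, block, shared memory, stream, then kernel arguments):

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical thin wrapper in the spirit of RPlaunchCudaKernel (assumed
// signature, for illustration only).
template < typename Kernel, typename... Args >
void launch_cuda_kernel(Kernel kernel, size_t grid_size, size_t block_size,
                        size_t shmem, cudaStream_t stream, Args... args)
{
  kernel<<< dim3(grid_size), dim3(block_size), shmem, stream >>>(args...);
  if (cudaGetLastError() != cudaSuccess) {
    // centralizes the error check each call site used to do via cudaErrchk
  }
}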
// diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp index 16a6b841b..b9bf454eb 100644 --- a/src/lcals/EOS-OMPTarget.cpp +++ b/src/lcals/EOS-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index 083fc343e..384a9d260 100644 --- a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp new file mode 100644 index 000000000..898d25bc8 --- /dev/null +++ b/src/lcals/EOS-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EOS.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include <sycl/sycl.hpp> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + +template <size_t work_group_size> +void EOS::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + EOS_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + EOS_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + EOS_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n EOS : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(EOS, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 517d144f8..a9076c144 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -31,8 +31,10 @@ EOS::EOS(const RunParams& params) setItsPerRep( getActualProblemSize() ); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_array_length ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() + + 1*sizeof(Real_type) * m_array_length ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(16 * getActualProblemSize()); checksum_scale_factor = 0.0001 * @@ -58,6 +60,9 @@ EOS::EOS(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 9cc202a02..fed56916d 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -62,18 +62,24 @@ class EOS : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index 05d73d38f..2101da14f 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -51,9 +51,12 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - first_diff<<>>( x, y, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (first_diff), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); } stopTimer(); diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 651590776..666b9783d 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -51,9 +51,12 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((first_diff), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (first_diff), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); } stopTimer(); diff --git a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp index a3b814124..b664bfbf7 100644 --- a/src/lcals/FIRST_DIFF-OMP.cpp +++ b/src/lcals/FIRST_DIFF-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp index 341ef57f4..bf3a40ad9 100644 --- a/src/lcals/FIRST_DIFF-OMPTarget.cpp +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index 54d2a8ce1..2382015e0 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp new file mode 100644 index 000000000..41bacafe3 --- /dev/null +++ b/src/lcals/FIRST_DIFF-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_DIFF.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include <sycl/sycl.hpp> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + +template <size_t work_group_size> +void FIRST_DIFF::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + FIRST_DIFF_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + FIRST_DIFF_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + FIRST_DIFF_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n FIRST_DIFF : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_DIFF, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 3e8e42ec6..aa5aaa31a 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,11 +28,11 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params) m_N = getActualProblemSize()+1; - setItsPerRep( getActualProblemSize() ); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type) * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); @@ -54,6 +54,9 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index f3f6424f0..c01907f9b 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
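The setBytesReadPerRep / setBytesWrittenPerRep / setBytesAtomicModifyWrittenPerRep split above replaces the old single setBytesPerRep sum. Using FIRST_DIFF as the worked example (x[i] = y[i+1] - y[i], so N+1 reads of y, N writes of x, no atomics), the terms can be checked in isolation; the problem size below is illustrative:

#include <cstdio>
#include <cstddef>

int main()
{
  using Real_type = double;
  const std::size_t N = 1000000;   // illustrative problem size
  const std::size_t m_N = N + 1;   // FIRST_DIFF reads one extra element of y

  // the same expressions passed to the three setters above
  const std::size_t bytes_read    = 1 * sizeof(Real_type) * m_N;
  const std::size_t bytes_written = 1 * sizeof(Real_type) * N;
  const std::size_t bytes_atomic  = 0;

  std::printf("read=%zu written=%zu atomic=%zu\n",
              bytes_read, bytes_written, bytes_atomic);
  return 0;
}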
// @@ -52,18 +52,24 @@ class FIRST_DIFF : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index e7d860877..08f2ab240 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -15,6 +15,10 @@ #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -56,11 +60,10 @@ __global__ void first_min(Real_ptr x, } -template < size_t block_size > -void FIRST_MIN::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -69,50 +72,71 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value + constexpr size_t shmem = sizeof(MyMinLoc)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (first_min), block_size, shmem); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - MyMinLoc* dminloc; - cudaErrchk( cudaMalloc( (void**)&dminloc, - grid_size * sizeof(MyMinLoc) ) ); + RAJAPERF_CUDA_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; + RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size, 1); - constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - first_min<<>>(x, dminloc, mymin, iend); - cudaErrchk( cudaGetLastError() ); - - cudaErrchk( cudaMemcpyAsync( mymin_block, dminloc, - grid_size * sizeof(MyMinLoc), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + RPlaunchCudaKernel( (first_min), + grid_size, block_size, + shmem, res.get_stream(), + x, dminloc, mymin, + iend ); + RAJAPERF_CUDA_REDUCER_COPY_BACK(dminloc, mymin_block, grid_size, 1); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; } } - m_minloc = RAJA_MAX(m_minloc, mymin.loc); + 
m_minloc = mymin.loc; } stopTimer(); - cudaErrchk( cudaFree( dminloc ) ); - delete[] mymin_block; + RAJAPERF_CUDA_REDUCER_TEARDOWN(dminloc, mymin_block); - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = RAJA::cuda_reduce; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceMinLoc loc( + RAJA::ReduceMinLoc loc( m_xmin_init, m_initloc); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { FIRST_MIN_BODY_RAJA; }); @@ -127,7 +151,155 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Cuda) +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=] __device__ (Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_host_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + 
RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index bb106ce0e..3c6fd7b35 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -15,6 +15,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -56,11 +60,10 @@ __global__ void first_min(Real_ptr x, } -template < size_t block_size > -void FIRST_MIN::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -69,31 +72,28 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value + constexpr size_t shmem = sizeof(MyMinLoc)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (first_min), block_size, shmem); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - MyMinLoc* dminloc; - hipErrchk( hipMalloc( (void**)&dminloc, - grid_size * sizeof(MyMinLoc) ) ); + RAJAPERF_HIP_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; + RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size, 1); - constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - hipLaunchKernelGGL( (first_min), grid_size, block_size, - shmem, res.get_stream(), x, - dminloc, - mymin, - iend ); - hipErrchk( hipGetLastError() ); - - hipErrchk( hipMemcpyAsync( mymin_block, dminloc, - grid_size * sizeof(MyMinLoc), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + RPlaunchHipKernel( (first_min), + grid_size, block_size, + shmem, res.get_stream(), + x, dminloc, mymin, + iend ); + RAJAPERF_HIP_REDUCER_COPY_BACK(dminloc, mymin_block, grid_size, 1); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; @@ -104,18 +104,39 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) } stopTimer(); - hipErrchk( hipFree( dminloc ) ); - delete[] mymin_block; + RAJAPERF_HIP_REDUCER_TEARDOWN(dminloc, mymin_block); - } else if ( vid == RAJA_HIP ) { + } else { + getCout() << "\n FIRST_MIN : Unknown Hip 
variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = RAJA::hip_reduce; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceMinLoc loc( + RAJA::ReduceMinLoc loc( m_xmin_init, m_initloc); - RAJA::forall< RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { FIRST_MIN_BODY_RAJA; }); @@ -130,7 +151,153 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Hip) +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=] __device__ (Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + } +} + +void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void FIRST_MIN::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_host_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + addVariantTuningName(vid, 
decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 1a7722570..a9a7f1ba1 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -87,21 +87,49 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceMinLoc loc( + m_xmin_init, m_initloc); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + FIRST_MIN_BODY_RAJA; + }); + + m_minloc = loc.getLoc(); + + } + stopTimer(); + + } else if (tune_idx == 1) { + + using VL_TYPE = RAJA::expt::ValLoc; - RAJA::ReduceMinLoc loc( - m_xmin_init, m_initloc); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - FIRST_MIN_BODY_RAJA; - }); + VL_TYPE tloc(m_xmin_init, m_initloc); - m_minloc = loc.getLoc(); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=](Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -114,8 +142,17 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void FIRST_MIN::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index 5a4dccc69..578bdfe63 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
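The FIRST_MIN OpenMP hunk above shows the pattern repeated for every back-end in this patch: each RAJA variant now carries two tunings, the classic ReduceMinLoc reduction object (tune_idx == 0, registered as "default") and the newer RAJA::expt::Reduce argument interface (tune_idx == 1, registered as "new"). A condensed, self-contained comparison of the two call shapes on a sequential policy (illustrative types and initial values; only the interfaces are taken from the patch):

  #include "RAJA/RAJA.hpp"

  void minloc_two_ways(const double* x, int n, double big)
  {
    // Classic interface: the reduction object is captured by the lambda
    // and queried after the loop completes.
    RAJA::ReduceMinLoc<RAJA::seq_reduce, double, int> loc(big, -1);
    RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n), [=](int i) {
      loc.minloc(x[i], i);
    });
    int minloc_classic = loc.getLoc();

    // New interface: the value/location pair is passed to forall as an
    // extra argument and surfaced as a reference parameter of the lambda.
    RAJA::expt::ValLoc<double, int> tloc(big, -1);
    RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
      RAJA::expt::Reduce<RAJA::operators::minimum>(&tloc),
      [=](int i, RAJA::expt::ValLoc<double, int>& l) {
        l.min(x[i], i);
      });
    int minloc_new = tloc.getLoc();

    (void)minloc_classic; (void)minloc_new;
  }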
//
@@ -27,7 +27,7 @@ namespace lcals

 const size_t threads_per_team = 256;

-void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -60,21 +60,49 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG

   } else if ( vid == RAJA_OpenMPTarget ) {

-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+    if (tune_idx == 0) {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::ReduceMinLoc<RAJA::omp_target_reduce, Real_type, Index_type> loc(
+                                                     m_xmin_init, m_initloc);
+
+        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+          FIRST_MIN_BODY_RAJA;
+        });
+
+        m_minloc = loc.getLoc();
+
+      }
+      stopTimer();
+
+    } else if (tune_idx == 1) {

-      RAJA::ReduceMinLoc<RAJA::omp_target_reduce, Real_type, Index_type> loc(
-                                                   m_xmin_init, m_initloc);
+      using VL_TYPE = RAJA::expt::ValLoc<Real_type, Index_type>;

-      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-        FIRST_MIN_BODY_RAJA;
-      });
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      m_minloc = loc.getLoc();
+        VL_TYPE tloc(m_xmin_init, m_initloc);

+        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+          RAJA::RangeSegment(ibegin, iend),
+          RAJA::expt::Reduce<RAJA::operators::minimum>(&tloc),
+          [=](Index_type i, VL_TYPE& loc) {
+            loc.min(x[i], i);
+          }
+        );
+
+        m_minloc = static_cast<Index_type>(tloc.getLoc());
+
+      }
+      stopTimer();
+
+    } else {
+      getCout() << "\n  FIRST_MIN : Unknown OMP Target tuning index = " << tune_idx << std::endl;
     }
-    stopTimer();

   } else {
     getCout() << "\n  FIRST_MIN : Unknown OMP Target variant id = " << vid << std::endl;
@@ -82,6 +110,14 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG

 }

+void FIRST_MIN::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_OpenMPTarget) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace lcals
 } // end namespace rajaperf
diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp
index 6e5de0437..a32ed4962 100644
--- a/src/lcals/FIRST_MIN-Seq.cpp
+++ b/src/lcals/FIRST_MIN-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
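The setOpenMPTargetTuningDefinitions method added above encodes a contract that recurs across these files: tuning names must be registered in exactly the order the run method consumes tune_idx values, since names and indices are matched positionally. Schematically, for a hypothetical kernel FOO:

  void FOO::setOpenMPTargetTuningDefinitions(VariantID vid)
  {
    addVariantTuningName(vid, "default");   // consumed as tune_idx == 0
    if (vid == RAJA_OpenMPTarget) {
      addVariantTuningName(vid, "new");     // consumed as tune_idx == 1
    }
  }

Base variants register only "default" because they have a single implementation, which is why the "new" name is guarded by the RAJA variant check.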
//
@@ -18,8 +18,11 @@ namespace lcals
 {

-void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void FIRST_MIN::runSeqVariant(VariantID vid, size_t tune_idx)
 {
+#if !defined(RUN_RAJA_SEQ)
+  RAJA_UNUSED_VAR(tune_idx);
+#endif
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
@@ -76,21 +79,49 @@ void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx

     case RAJA_Seq : {

-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          RAJA::ReduceMinLoc<RAJA::seq_reduce, Real_type, Index_type> loc(
+                                                   m_xmin_init, m_initloc);
+
+          RAJA::forall<RAJA::seq_exec>(
+            RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+            FIRST_MIN_BODY_RAJA;
+          });
+
+          m_minloc = loc.getLoc();
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
+
+        using VL_TYPE = RAJA::expt::ValLoc<Real_type, Index_type>;

-        RAJA::ReduceMinLoc<RAJA::seq_reduce, Real_type, Index_type> loc(
-                                                 m_xmin_init, m_initloc);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        RAJA::forall<RAJA::seq_exec>(
-          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-          FIRST_MIN_BODY_RAJA;
-        });
+          VL_TYPE tloc(m_xmin_init, m_initloc);

-        m_minloc = loc.getLoc();
+          RAJA::forall<RAJA::seq_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::minimum>(&tloc),
+            [=](Index_type i, VL_TYPE& loc) {
+              loc.min(x[i], i);
+            }
+          );
+          m_minloc = static_cast<Index_type>(tloc.getLoc());
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n  FIRST_MIN : Unknown Seq tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();

       break;
     }
@@ -104,5 +135,13 @@ void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx

 }

+void FIRST_MIN::setSeqTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_Seq) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace lcals
 } // end namespace rajaperf
diff --git a/src/lcals/FIRST_MIN-Sycl.cpp b/src/lcals/FIRST_MIN-Sycl.cpp
new file mode 100644
index 000000000..616c84dcb
--- /dev/null
+++ b/src/lcals/FIRST_MIN-Sycl.cpp
@@ -0,0 +1,118 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_MIN.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + + +namespace rajaperf +{ +namespace lcals +{ + +template +struct reduce_pair { + bool operator<(const reduce_pair& o) const { + return (val < o.val); + } + VAL_TYPE val; + IDX_TYPE idx; +}; + +template +void FIRST_MIN::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + FIRST_MIN_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + using result_type = reduce_pair; + + auto result = sycl::malloc_shared< result_type >(1, *qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + result_type result_init = { m_xmin_init, m_initloc }; + *result = result_init; + auto reduction_obj = sycl::reduction( result, result_init, sycl::minimum() ); + + qu->submit([&] (sycl::handler& h) { + + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + reduction_obj, + [=] (sycl::nd_item<1> item, auto& loc) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + loc.combine( {x[i], i} ); + } + + }); + + }); + + qu->wait(); + + m_minloc = static_cast(result->idx); + + } + stopTimer(); + + sycl::free(result, *qu); + + } else if ( vid == RAJA_SYCL ) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=] (Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + std::cout << "\n FIRST_MIN : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 875932958..63a3be8df 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
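FIRST_MIN-Sycl.cpp above drives the base variant through sycl::reduction over a custom value/index pair; because SYCL has no known identity for a user-defined type, the identity is passed explicitly and the combiner falls back on the pair's operator<. A self-contained sketch of that pattern (SYCL 2020; the names here are illustrative, not the Suite's):

  #include <sycl/sycl.hpp>
  #include <limits>

  struct MinPair {
    double val;
    long   idx;
    bool operator<(const MinPair& o) const { return val < o.val; }
  };

  long argmin(sycl::queue& q, const double* x, long n)
  {
    MinPair* r = sycl::malloc_shared<MinPair>(1, q);
    *r = { std::numeric_limits<double>::max(), -1 };

    q.submit([&](sycl::handler& h) {
      // explicit identity: sycl::known_identity is undefined for MinPair
      auto red = sycl::reduction(r, *r, sycl::minimum<MinPair>());
      h.parallel_for(sycl::range<1>(n), red,
                     [=](sycl::id<1> id, auto& min_obj) {
        long i = id[0];
        min_obj.combine({ x[i], i });  // pairwise min via operator<
      });
    }).wait();

    long where = r->idx;
    sycl::free(r, q);
    return where;
  }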
// @@ -33,9 +33,12 @@ FIRST_MIN::FIRST_MIN(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) + - (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N ); + setBytesReadPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type ) + + 1*sizeof(Real_type ) * m_N ); + setBytesWrittenPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type ) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); @@ -58,6 +61,9 @@ FIRST_MIN::FIRST_MIN(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index dd00d4392..1660739fb 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -79,18 +79,37 @@ class FIRST_MIN : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_type m_xmin_init; diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 2ac57e5a1..b4a025a20 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
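The constructor hunk above replaces the aggregate setBytesPerRep() call with separate read/write/atomic terms, the accounting scheme every kernel in this patch now uses. For FIRST_MIN the per-rep totals work out to:

  reads   = sizeof(Index_type) + sizeof(Real_type) + m_N * sizeof(Real_type)
            (initial location, initial min value, one sweep over x)
  writes  = sizeof(Index_type) + sizeof(Real_type)
            (final location and final min value)
  atomics = 0

With 8-byte Real_type and a nominal 1,000,000-element problem, that is about 8 MB read per rep, dominated entirely by the x sweep, so the kernel stays bandwidth-bound in this model.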
// @@ -49,11 +49,14 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - first_sum<<>>( x, y, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (first_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); } stopTimer(); diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 5f48abe69..01c2082d5 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -51,9 +51,12 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((first_sum),grid_size, block_size, shmem, res.get_stream(), x, y, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (first_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); } stopTimer(); diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp index e545538fc..223379dbe 100644 --- a/src/lcals/FIRST_SUM-OMP.cpp +++ b/src/lcals/FIRST_SUM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp index 324b26d54..932f32fc4 100644 --- a/src/lcals/FIRST_SUM-OMPTarget.cpp +++ b/src/lcals/FIRST_SUM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index 4d3ef658f..4eba8626e 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-Sycl.cpp b/src/lcals/FIRST_SUM-Sycl.cpp new file mode 100644 index 000000000..3d63fdcbc --- /dev/null +++ b/src/lcals/FIRST_SUM-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
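Here, as in the other launch hunks, raw chevron launches and hipLaunchKernelGGL calls give way to the RPlaunchCudaKernel / RPlaunchHipKernel helpers, which centralize the post-launch error check the old call sites repeated by hand. A minimal sketch of what such a wrapper does (illustrative only; the Suite's actual helpers live in common/CudaDataUtils.hpp and common/HipDataUtils.hpp and may differ in detail):

  template < typename Kernel, typename... Args >
  void launch_cuda_kernel(Kernel kernel,
                          size_t grid_size, size_t block_size,
                          size_t shmem, cudaStream_t stream,
                          Args... args)
  {
    kernel<<<grid_size, block_size, shmem, stream>>>(args...);
    cudaErrchk( cudaGetLastError() );  // one check instead of one per call site
  }

The parentheses around each kernel name at the call sites, e.g. (first_sum), are harmless for a function template and protect macro-based launchers from commas inside template argument lists.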
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "FIRST_SUM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+
+template < size_t work_group_size >
+void FIRST_SUM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 1;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  FIRST_SUM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item ) {
+
+          Index_type i = item.get_global_id(0);
+          if (i > 0 && i < iend) {
+            FIRST_SUM_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        FIRST_SUM_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  FIRST_SUM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_SUM, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp
index 046528e2b..a40bf533a 100644
--- a/src/lcals/FIRST_SUM.cpp
+++ b/src/lcals/FIRST_SUM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -30,8 +30,9 @@ FIRST_SUM::FIRST_SUM(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_N-1) +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N-1) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * (getActualProblemSize()-1));

   setUsesFeature(Forall);
@@ -53,6 +54,9 @@ FIRST_SUM::FIRST_SUM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );

+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp
index 59c1c0bfd..1fc9c48cd 100644
--- a/src/lcals/FIRST_SUM.hpp
+++ b/src/lcals/FIRST_SUM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
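Every Base_SYCL launch in these new files uses the same sizing idiom: sycl::nd_range requires the global size to be a multiple of the work-group size, so the global size is rounded up and a bounds guard masks the padding. Concretely, for iend = 1000 and work_group_size = 256:

  global_size = 256 * RAJA_DIVIDE_CEILING_INT(1000, 256)
              = 256 * 4 = 1024   // 24 padded work-items fail the i < iend guard

FIRST_SUM additionally guards i > 0 because its loop starts at ibegin = 1; an nd_range carries no offset, so the lower bound is recovered in the kernel body rather than in the launch geometry.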
// @@ -55,18 +55,24 @@ class FIRST_SUM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 3790be5f5..17e56a2a0 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -65,16 +65,24 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size); - genlinrecur1<<>>( b5, stb5, sa, sb, - kb5i, - N ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (genlinrecur1), + grid_size1, block_size, + shmem, res.get_stream(), + b5, stb5, + sa, sb, + kb5i, + N ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - genlinrecur2<<>>( b5, stb5, sa, sb, - kb5i, - N ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (genlinrecur2), + grid_size2, block_size, + shmem, res.get_stream(), + b5, stb5, + sa, sb, + kb5i, + N ); } stopTimer(); diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index b4dc1be54..5d428fa87 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
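GEN_LIN_RECUR launches two kernels per rep and sizes their grids independently, since the second loop runs one iteration longer than the first. The ceiling division makes each grid cover its domain exactly; e.g. with N = 1000 and block_size = 256:

  grid_size1 = RAJA_DIVIDE_CEILING_INT(1000, 256) = 4   // covers N work-items
  grid_size2 = RAJA_DIVIDE_CEILING_INT(1001, 256) = 4   // covers N+1 work-items

The two grids happen to coincide here; they diverge whenever N is an exact multiple of the block size (N = 1024 gives 4 and 5).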
// @@ -65,18 +65,24 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((genlinrecur1), grid_size1, block_size, shmem, res.get_stream(), - b5, stb5, sa, sb, - kb5i, - N ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (genlinrecur1), + grid_size1, block_size, + shmem, res.get_stream(), + b5, stb5, + sa, sb, + kb5i, + N ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - hipLaunchKernelGGL((genlinrecur2), grid_size2, block_size, shmem, res.get_stream(), - b5, stb5, sa, sb, - kb5i, - N ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (genlinrecur2), + grid_size2, block_size, + shmem, res.get_stream(), + b5, stb5, + sa, sb, + kb5i, + N ); } stopTimer(); diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp index 660d47273..d4ac65995 100644 --- a/src/lcals/GEN_LIN_RECUR-OMP.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp index 3838a2af0..9932469cb 100644 --- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index 9d728a9f7..b1d6c3be5 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp new file mode 100644 index 000000000..06ca45e7b --- /dev/null +++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "GEN_LIN_RECUR.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + +template +void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + GEN_LIN_RECUR_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size1, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type k = item.get_global_id(0); + if (k < N) { + GEN_LIN_RECUR_BODY1; + } + + }); + }); + + const size_t global_size2 = work_group_size * RAJA_DIVIDE_CEILING_INT(N+1, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size2, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i > 0 && i < N+1) { + GEN_LIN_RECUR_BODY2; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(0, N), [=] (Index_type k) { + GEN_LIN_RECUR_BODY1; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(1, N+1), [=] (Index_type i) { + GEN_LIN_RECUR_BODY2; + }); + + } + stopTimer(); + + } else { + std::cout << "\n GEN_LIN_RECUR : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(GEN_LIN_RECUR, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 9c132a3db..80b7f9b10 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -30,8 +30,11 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(2); - setBytesPerRep( (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_N + - (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_N ); + setBytesReadPerRep( 3*sizeof(Real_type ) * m_N + + 3*sizeof(Real_type ) * m_N ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N + + 2*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((3 + 3 ) * getActualProblemSize()); @@ -58,6 +61,9 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 9586a69b4..33c0895af 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -76,18 +76,24 @@ class GEN_LIN_RECUR : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_b5; Real_ptr m_sa; diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index 960f80c49..4d89cc5b6 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,10 +52,13 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hydro_1d<<>>( x, y, z, - q, r, t, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (hydro_1d), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, + q, r, t, + iend ); } stopTimer(); diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index c04da1eb2..aa2a12c99 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
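The GEN_LIN_RECUR traffic model above simply sums its two kernels: each reads three and writes two Real_type values per element, giving the two identical 3*N read terms and 2*N write terms, and the (3 + 3) FLOP count likewise adds the two three-flop loop bodies. Keeping the terms per kernel rather than pre-folding them into 6*N and 4*N makes the model easy to audit against the code.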
// See the RAJAPerf/LICENSE file for details. // @@ -52,10 +52,13 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((hydro_1d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, z, - q, r, t, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (hydro_1d), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, + q, r, t, + iend ); } stopTimer(); diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp index f2088205a..d3ac150a4 100644 --- a/src/lcals/HYDRO_1D-OMP.cpp +++ b/src/lcals/HYDRO_1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp index b5fbe0657..b5cf82420 100644 --- a/src/lcals/HYDRO_1D-OMPTarget.cpp +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index 47ca2aedd..22f257e8d 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp new file mode 100644 index 000000000..3ccbad9a7 --- /dev/null +++ b/src/lcals/HYDRO_1D-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HYDRO_1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + +template +void HYDRO_1D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + HYDRO_1D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + HYDRO_1D_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + HYDRO_1D_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n HYDRO_1D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HYDRO_1D, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index d92267fc9..c4821788f 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -30,8 +30,10 @@ HYDRO_1D::HYDRO_1D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * getActualProblemSize() + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * (getActualProblemSize()+1) ); + setBytesReadPerRep( 1*sizeof(Real_type ) * getActualProblemSize() + + 1*sizeof(Real_type ) * (getActualProblemSize()+1) ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(5 * getActualProblemSize()); checksum_scale_factor = 0.001 * @@ -57,6 +59,9 @@ HYDRO_1D::HYDRO_1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index dd61f112c..4827fcecd 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
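In the HYDRO_1D model above, the asymmetric (N+1) read term has a concrete source: the LCALS hydro fragment reads z at two adjacent offsets (z[i+10] and z[i+11]), and consecutive iterations overlap on all but one of those loads, so a sweep of N iterations touches N+1 distinct z values. The y array contributes the plain N-element read term and x the N-element write term.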
// @@ -57,18 +57,24 @@ class HYDRO_1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 2f46572b4..ad09dd2a3 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -111,25 +111,31 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_CUDA; HYDRO_2D_NBLOCKS_CUDA; - hydro_2d1 - <<>>(zadat, zbdat, - zpdat, zqdat, zrdat, zmdat, - jn, kn); - cudaErrchk( cudaGetLastError() ); - - hydro_2d2 - <<>>(zudat, zvdat, - zadat, zbdat, zzdat, zrdat, - s, - jn, kn); - cudaErrchk( cudaGetLastError() ); - - hydro_2d3 - <<>>(zroutdat, zzoutdat, - zrdat, zudat, zzdat, zvdat, - t, - jn, kn); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (hydro_2d1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + zadat, zbdat, + zpdat, zqdat, + zrdat, zmdat, + jn, kn); + + RPlaunchCudaKernel( (hydro_2d2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + zudat, zvdat, + zadat, zbdat, + zzdat, zrdat, + s, + jn, kn); + + RPlaunchCudaKernel( (hydro_2d3), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + zroutdat, zzoutdat, + zrdat, zudat, + zzdat, zvdat, + t, + jn, kn); } stopTimer(); diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index 0d65cb260..58b530ba8 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -111,28 +111,31 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_HIP; HYDRO_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((hydro_2d1), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), + RPlaunchHipKernel( (hydro_2d1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), zadat, zbdat, - zpdat, zqdat, zrdat, zmdat, + zpdat, zqdat, + zrdat, zmdat, jn, kn); - hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d2), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), + RPlaunchHipKernel( (hydro_2d2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), zudat, zvdat, - zadat, zbdat, zzdat, zrdat, + zadat, zbdat, + zzdat, zrdat, s, jn, kn); - hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d3), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), + RPlaunchHipKernel( (hydro_2d3), + nblocks, nthreads_per_block, + shmem, res.get_stream(), zroutdat, zzoutdat, - zrdat, zudat, zzdat, zvdat, + zrdat, zudat, + zzdat, zvdat, t, jn, kn); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp index e153dbdca..92f1bb080 100644 --- a/src/lcals/HYDRO_2D-OMP.cpp +++ b/src/lcals/HYDRO_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp index 43304884b..f830feefb 100644 --- a/src/lcals/HYDRO_2D-OMPTarget.cpp +++ b/src/lcals/HYDRO_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp index cf43e885f..522dba679 100644 --- a/src/lcals/HYDRO_2D-Seq.cpp +++ b/src/lcals/HYDRO_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp new file mode 100644 index 000000000..975467bc5 --- /dev/null +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -0,0 +1,155 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HYDRO_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define k_wg_sz (work_group_size / j_wg_sz) + +template +void HYDRO_2D::runSyclVariantImpl(VariantID vid) { + + const Index_type run_reps = getRunReps(); + const Index_type kbeg = 1; + const Index_type kend = m_kn - 1; + const Index_type jbeg = 1; + const Index_type jend = m_jn - 1; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + HYDRO_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(1, + k_wg_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_wg_sz)); + sycl::range<3> wkgroup_dim(1, k_wg_sz, j_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; + + if (j < jn-1 && k < kn-1) { + HYDRO_2D_BODY1 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; + + if (j < jn-1 && k < kn-1) { + HYDRO_2D_BODY2 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; + + if (j < jn-1 && k < kn-1) { + HYDRO_2D_BODY3 + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + HYDRO_2D_VIEWS_RAJA; + + using EXECPOL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + res, + [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY1_RAJA; + }); + + RAJA::kernel_resource( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + res, + [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY2_RAJA; + }); + + RAJA::kernel_resource( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + res, + [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY3_RAJA; + }); + + } + stopTimer(); + + } else { + std::cout << "\n HYDRO_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HYDRO_2D, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index fd1dd9406..d1ae233d0 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
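HYDRO_2D-Sycl.cpp above maps the 2D interior sweep onto a 3D nd_range with a fixed 32-wide extent for j (j_wg_sz) and the remainder of the tuned work-group size for k. Putting j in the last dimension matters: SYCL's final dimension is the fastest-varying one, so the unit-stride j index yields coalesced accesses. The shape arithmetic for the default work_group_size of 256:

  k_wg_sz         = 256 / 32 = 8                             // 32x8 work-groups
  global j extent = 32 * RAJA_DIVIDE_CEILING_INT(jn-2, 32)
  global k extent =  8 * RAJA_DIVIDE_CEILING_INT(kn-2, 8)

The +1 added to each global id re-centers the launch on the interior points, mirroring the RangeSegment(1, n-1) bounds used by the RAJA_SYCL path.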
// See the RAJAPerf/LICENSE file for details. // @@ -33,18 +33,20 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) setDefaultProblemSize(m_kn * m_jn); setDefaultReps(100); - m_jn = m_kn = std::sqrt(getTargetProblemSize()); + m_jn = m_kn = std::sqrt(getTargetProblemSize()) + std::sqrt(2)-1; m_array_length = m_kn * m_jn; setActualProblemSize( getTargetProblemSize() ); setItsPerRep( 3 * getActualProblemSize() ); setKernelsPerRep(3); - setBytesPerRep( (2*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_array_length + - (2*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_array_length + - (2*sizeof(Real_type ) + 4*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) ); + setBytesReadPerRep( 4*sizeof(Real_type ) * m_array_length + + 4*sizeof(Real_type ) * m_array_length + + 4*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + + 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + + 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((14 + 26 + 4 ) * (m_jn-2)*(m_kn-2)); @@ -72,6 +74,9 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index b6ad936ca..1c9cc8d1c 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -151,19 +151,25 @@ class HYDRO_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_za; Real_ptr m_zb; diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 02b22cbb8..3ec139130 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
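The sizing tweak at the top of this HYDRO_2D hunk deserves a note: adding std::sqrt(2) - 1 (about 0.414) before the truncating assignment biases the edge length upward whenever the fractional part of sqrt(target) exceeds 1 - (sqrt(2) - 1), roughly 0.586, apparently so that m_jn * m_kn tracks the requested problem size instead of systematically undershooting it. For example:

  target = 1,000,000 : sqrt = 1000.00 -> m_jn = m_kn = 1000 with or without the bias
  target =   999,500 : sqrt =  999.75 -> 1000 with the bias, 999 without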
diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp
index 02b22cbb8..3ec139130 100644
--- a/src/lcals/INT_PREDICT-Cuda.cpp
+++ b/src/lcals/INT_PREDICT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -55,12 +55,16 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      int_predict<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( px,
-                                 dm22, dm23, dm24, dm25,
-                                 dm26, dm27, dm28, c0,
-                                 offset,
-                                 iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (int_predict<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          px,
+                          dm22, dm23, dm24,
+                          dm25, dm26, dm27,
+                          dm28, c0,
+                          offset,
+                          iend );
 
     }
     stopTimer();
diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp
index cc0c06477..1e2741cd7 100644
--- a/src/lcals/INT_PREDICT-Hip.cpp
+++ b/src/lcals/INT_PREDICT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -55,12 +55,16 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((int_predict<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), px,
-                                 dm22, dm23, dm24, dm25,
-                                 dm26, dm27, dm28, c0,
-                                 offset,
-                                 iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (int_predict<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         px,
+                         dm22, dm23, dm24,
+                         dm25, dm26, dm27,
+                         dm28, c0,
+                         offset,
+                         iend );
 
     }
     stopTimer();
diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp
index 29b167881..b33e5cd2b 100644
--- a/src/lcals/INT_PREDICT-OMP.cpp
+++ b/src/lcals/INT_PREDICT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp
index 4172c1822..a7e257532 100644
--- a/src/lcals/INT_PREDICT-OMPTarget.cpp
+++ b/src/lcals/INT_PREDICT-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp
index de167bc11..1d8e52fda 100644
--- a/src/lcals/INT_PREDICT-Seq.cpp
+++ b/src/lcals/INT_PREDICT-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
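Throughout this patch, raw chevron and hipLaunchKernelGGL launches are replaced
by the suite's RPlaunchCudaKernel / RPlaunchHipKernel helpers, which subsume
the explicit cudaGetLastError()/hipGetLastError() checks the old code carried
at every call site. Based only on the call sites visible here, a wrapper of the
following shape would reproduce the removed behavior (a sketch under
assumptions, not the suite's actual definition; launch_and_check is a made-up
name):

    #include <cuda_runtime.h>
    #include <utility>

    // Sketch: launch a kernel (function pointer) and then check the error
    // state, as the removed cudaErrchk( cudaGetLastError() ) lines did.
    template < typename Kernel, typename... Args >
    void launch_and_check(Kernel kernel, dim3 grid, dim3 block,
                          size_t shmem, cudaStream_t stream, Args&&... args)
    {
      kernel<<< grid, block, shmem, stream >>>( std::forward<Args>(args)... );
      cudaErrchk( cudaGetLastError() );  // suite macro, as in the removed lines
    }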
diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp
new file mode 100644
index 000000000..992dbcba1
--- /dev/null
+++ b/src/lcals/INT_PREDICT-Sycl.cpp
@@ -0,0 +1,81 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "INT_PREDICT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+template < size_t work_group_size >
+void INT_PREDICT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  INT_PREDICT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            INT_PREDICT_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        INT_PREDICT_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+     std::cout << "\n  INT_PREDICT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INT_PREDICT, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp
index eb56b5725..afb4a2ea9 100644
--- a/src/lcals/INT_PREDICT.cpp
+++ b/src/lcals/INT_PREDICT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,7 +28,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type ) + 10*sizeof(Real_type )) * getActualProblemSize() );
+  setBytesReadPerRep( 10*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(17 * getActualProblemSize());
 
   setUsesFeature(Forall);
@@ -50,6 +52,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
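In the Base_SYCL variant above, the 1-D global size is rounded up to a multiple
of work_group_size and the trailing work-items are masked by the i < iend test.
A minimal sketch with made-up numbers:

    // Sketch only: iend = 1000 and work_group_size = 256 are illustrative.
    constexpr size_t wg = 256, iend = 1000;
    constexpr size_t global_size = wg * ((iend + wg - 1) / wg);  // 4 groups: 1024
    static_assert(global_size % wg == 0, "whole number of work-groups");
    // Work-items 1000..1023 are launched but do nothing, per the kernel guard.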
diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp
index a81ae6fb2..5435af4f4 100644
--- a/src/lcals/INT_PREDICT.hpp
+++ b/src/lcals/INT_PREDICT.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -72,18 +72,24 @@ class INT_PREDICT : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+
   void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Index_type m_array_length;
   Index_type m_offset;
diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp
index 76c5082cd..40a8bf7f0 100644
--- a/src/lcals/PLANCKIAN-Cuda.cpp
+++ b/src/lcals/PLANCKIAN-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -53,10 +53,13 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      planckian<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( x, y,
-                                 u, v, w,
-                                 iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (planckian<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          x, y,
+                          u, v, w,
+                          iend );
 
     }
     stopTimer();
diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp
index 7d93b2dca..00323115d 100644
--- a/src/lcals/PLANCKIAN-Hip.cpp
+++ b/src/lcals/PLANCKIAN-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -53,10 +53,13 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid)
      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
      constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((planckian<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y,
-                                 u, v, w,
-                                 iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (planckian<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         x, y,
+                         u, v, w,
+                         iend );
 
     }
     stopTimer();
diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp
index cc90067eb..e82f9eccd 100644
--- a/src/lcals/PLANCKIAN-OMP.cpp
+++ b/src/lcals/PLANCKIAN-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp
index fb0f41cef..c69732531 100644
--- a/src/lcals/PLANCKIAN-OMPTarget.cpp
+++ b/src/lcals/PLANCKIAN-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp
index 25ff3ff2b..56c57971b 100644
--- a/src/lcals/PLANCKIAN-Seq.cpp
+++ b/src/lcals/PLANCKIAN-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp
new file mode 100644
index 000000000..31b43c2f7
--- /dev/null
+++ b/src/lcals/PLANCKIAN-Sycl.cpp
@@ -0,0 +1,84 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "PLANCKIAN.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <cmath>
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+template < size_t work_group_size >
+void PLANCKIAN::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  PLANCKIAN_DATA_SETUP;
+
+  using sycl::exp;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            PLANCKIAN_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        PLANCKIAN_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+     std::cout << "\n  PLANCKIAN : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PLANCKIAN, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
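PLANCKIAN is the one kernel in this batch with a transcendental in its body,
hence the function-scope "using sycl::exp;" above: it makes the unqualified
exp() inside PLANCKIAN_BODY bind to the SYCL device-capable overload rather
than the host ::exp from <cmath>. Sketch (the body's exact shape is assumed,
not taken from this patch):

    using sycl::exp;   // as in the file above
    // Inside the kernel lambda, PLANCKIAN_BODY expands to something of the
    // form below, and exp(...) then resolves to sycl::exp in device code:
    //   y[i] = u[i] / v[i];
    //   w[i] = x[i] / ( exp( y[i] ) - 1.0 );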
diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp
index 2bb8d3f7b..cf15e6a29 100644
--- a/src/lcals/PLANCKIAN.cpp
+++ b/src/lcals/PLANCKIAN.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,7 +28,9 @@ PLANCKIAN::PLANCKIAN(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * getActualProblemSize() );
+  setBytesReadPerRep( 3*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(4 * getActualProblemSize());  // 1 exp
 
   setUsesFeature(Forall);
@@ -50,6 +52,9 @@ PLANCKIAN::PLANCKIAN(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp
index 92b55fc95..a999d2178 100644
--- a/src/lcals/PLANCKIAN.hpp
+++ b/src/lcals/PLANCKIAN.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -57,18 +57,24 @@ class PLANCKIAN : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+
   void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Real_ptr m_y;
diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp
index 8b6643d2b..18cc284ea 100644
--- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -23,8 +23,9 @@ namespace lcals
 
 template < size_t block_size >
 __launch_bounds__(block_size)
-__global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z,
-                    Index_type N)
+__global__ void tridiag_elim(Real_ptr xout, Real_ptr xin,
+                             Real_ptr y, Real_ptr z,
+                             Index_type N)
 {
    Index_type i = blockIdx.x * block_size + threadIdx.x;
    if (i > 0 && i < N) {
@@ -51,10 +52,13 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      eos<block_size>
-        <<<grid_size, block_size, shmem, res.get_stream()>>>( xout, xin, y, z,
-                                                              iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (tridiag_elim<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          xout, xin,
+                          y, z,
+                          iend );
 
     }
     stopTimer();
diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp
index f6c4c9ebe..1b0db7e7f 100644
--- a/src/lcals/TRIDIAG_ELIM-Hip.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -23,8 +23,9 @@ namespace lcals
 
 template < size_t block_size >
 __launch_bounds__(block_size)
-__global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z,
-                    Index_type N)
+__global__ void tridiag_elim(Real_ptr xout, Real_ptr xin,
+                             Real_ptr y, Real_ptr z,
+                             Index_type N)
 {
    Index_type i = blockIdx.x * block_size + threadIdx.x;
    if (i > 0 && i < N) {
@@ -51,9 +52,13 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((eos<block_size>), grid_size, block_size, shmem, res.get_stream(), xout, xin, y, z,
-                         iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (tridiag_elim<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         xout, xin,
+                         y, z,
+                         iend );
 
     }
     stopTimer();
diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp
index 8f31c9493..22673b5f7 100644
--- a/src/lcals/TRIDIAG_ELIM-OMP.cpp
+++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
index 59a8a323c..5433879a5 100644
--- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
+++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp
index 5c0003d93..0b23c9143 100644
--- a/src/lcals/TRIDIAG_ELIM-Seq.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp
new file mode 100644
index 000000000..74e23665f
--- /dev/null
+++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp
@@ -0,0 +1,81 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "TRIDIAG_ELIM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+template < size_t work_group_size >
+void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 1;
+  const Index_type iend = m_N;
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  TRIDIAG_ELIM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i > 0 && i < iend) {
+            TRIDIAG_ELIM_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        TRIDIAG_ELIM_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+     std::cout << "\n  TRIDIAG_ELIM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRIDIAG_ELIM, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
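TRIDIAG_ELIM starts at ibegin = 1 because its body reads the i-1 neighbor, so
the Base_SYCL guard (i > 0 && i < iend) and the RAJA range are equivalent:

    // Sketch: both variants execute exactly this index set.
    // Base_SYCL: ids 0..global_size-1 exist; id 0 and ids >= iend are masked.
    // RAJA_SYCL: RangeSegment(1, iend) generates only i = 1 .. iend-1.
    for (Index_type i = 1; i < iend; ++i) {
      // TRIDIAG_ELIM_BODY reads xin[i-1], so i == 0 must be excluded.
    }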
diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp
index 710927c3e..9955bee66 100644
--- a/src/lcals/TRIDIAG_ELIM.cpp
+++ b/src/lcals/TRIDIAG_ELIM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -26,11 +26,13 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params)
   setActualProblemSize( getTargetProblemSize() );
 
-  m_N = getActualProblemSize();
+  m_N = getActualProblemSize() + 1;
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type ) + 3*sizeof(Real_type )) * (m_N-1) );
+  setBytesReadPerRep( 3*sizeof(Real_type ) * (m_N-1) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * (getActualProblemSize()-1));
 
   setUsesFeature(Forall);
@@ -52,6 +54,9 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp
index c95685de9..69c1a2d9c 100644
--- a/src/lcals/TRIDIAG_ELIM.hpp
+++ b/src/lcals/TRIDIAG_ELIM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -57,18 +57,24 @@ class TRIDIAG_ELIM : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+
   void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_xout;
   Real_ptr m_xin;
diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt
index f9cd2c1c2..2722a1fac 100644
--- a/src/polybench/CMakeLists.txt
+++ b/src/polybench/CMakeLists.txt
@@ -1,5 +1,5 @@
###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
# and RAJA Performance Suite project contributors.
# See the RAJAPerf/LICENSE file for details.
# @@ -14,77 +14,90 @@ blt_add_library( POLYBENCH_2MM-Cuda.cpp POLYBENCH_2MM-OMP.cpp POLYBENCH_2MM-OMPTarget.cpp + POLYBENCH_2MM-Sycl.cpp POLYBENCH_3MM.cpp POLYBENCH_3MM-Seq.cpp POLYBENCH_3MM-Hip.cpp POLYBENCH_3MM-Cuda.cpp POLYBENCH_3MM-OMP.cpp POLYBENCH_3MM-OMPTarget.cpp + POLYBENCH_3MM-Sycl.cpp POLYBENCH_ADI.cpp POLYBENCH_ADI-Seq.cpp POLYBENCH_ADI-Hip.cpp POLYBENCH_ADI-Cuda.cpp POLYBENCH_ADI-OMP.cpp POLYBENCH_ADI-OMPTarget.cpp + POLYBENCH_ADI-Sycl.cpp POLYBENCH_ATAX.cpp POLYBENCH_ATAX-Seq.cpp POLYBENCH_ATAX-Hip.cpp POLYBENCH_ATAX-Cuda.cpp POLYBENCH_ATAX-OMP.cpp POLYBENCH_ATAX-OMPTarget.cpp + POLYBENCH_ATAX-Sycl.cpp POLYBENCH_FDTD_2D.cpp POLYBENCH_FDTD_2D-Seq.cpp POLYBENCH_FDTD_2D-Hip.cpp POLYBENCH_FDTD_2D-Cuda.cpp POLYBENCH_FDTD_2D-OMP.cpp POLYBENCH_FDTD_2D-OMPTarget.cpp + POLYBENCH_FDTD_2D-Sycl.cpp POLYBENCH_FLOYD_WARSHALL.cpp POLYBENCH_FLOYD_WARSHALL-Seq.cpp POLYBENCH_FLOYD_WARSHALL-Hip.cpp POLYBENCH_FLOYD_WARSHALL-Cuda.cpp POLYBENCH_FLOYD_WARSHALL-OMP.cpp POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp + POLYBENCH_FLOYD_WARSHALL-Sycl.cpp POLYBENCH_GEMM.cpp POLYBENCH_GEMM-Seq.cpp POLYBENCH_GEMM-Hip.cpp POLYBENCH_GEMM-Cuda.cpp POLYBENCH_GEMM-OMP.cpp POLYBENCH_GEMM-OMPTarget.cpp + POLYBENCH_GEMM-Sycl.cpp POLYBENCH_GEMVER.cpp POLYBENCH_GEMVER-Seq.cpp POLYBENCH_GEMVER-Hip.cpp POLYBENCH_GEMVER-Cuda.cpp POLYBENCH_GEMVER-OMP.cpp POLYBENCH_GEMVER-OMPTarget.cpp + POLYBENCH_GEMVER-Sycl.cpp POLYBENCH_GESUMMV.cpp POLYBENCH_GESUMMV-Seq.cpp POLYBENCH_GESUMMV-Hip.cpp POLYBENCH_GESUMMV-Cuda.cpp POLYBENCH_GESUMMV-OMP.cpp POLYBENCH_GESUMMV-OMPTarget.cpp + POLYBENCH_GESUMMV-Sycl.cpp POLYBENCH_HEAT_3D.cpp POLYBENCH_HEAT_3D-Seq.cpp POLYBENCH_HEAT_3D-Hip.cpp POLYBENCH_HEAT_3D-Cuda.cpp POLYBENCH_HEAT_3D-OMP.cpp POLYBENCH_HEAT_3D-OMPTarget.cpp + POLYBENCH_HEAT_3D-Sycl.cpp POLYBENCH_JACOBI_1D.cpp POLYBENCH_JACOBI_1D-Seq.cpp POLYBENCH_JACOBI_1D-Hip.cpp POLYBENCH_JACOBI_1D-Cuda.cpp POLYBENCH_JACOBI_1D-OMP.cpp POLYBENCH_JACOBI_1D-OMPTarget.cpp + POLYBENCH_JACOBI_1D-Sycl.cpp POLYBENCH_JACOBI_2D.cpp POLYBENCH_JACOBI_2D-Seq.cpp POLYBENCH_JACOBI_2D-Hip.cpp POLYBENCH_JACOBI_2D-Cuda.cpp POLYBENCH_JACOBI_2D-OMP.cpp POLYBENCH_JACOBI_2D-OMPTarget.cpp + POLYBENCH_JACOBI_2D-Sycl.cpp POLYBENCH_MVT.cpp POLYBENCH_MVT-Seq.cpp POLYBENCH_MVT-Hip.cpp POLYBENCH_MVT-Cuda.cpp POLYBENCH_MVT-OMP.cpp POLYBENCH_MVT-OMPTarget.cpp + POLYBENCH_MVT-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index 7a8f43e58..28b49c779 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
//
@@ -125,16 +125,24 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_2MM_1_NBLOCKS_CUDA;
-      poly_2mm_1<block_size>
-                <<<nblocks1, nthreads_per_block, shmem, res.get_stream()>>>(tmp, A, B, alpha,
-                                                                            ni, nj, nk);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_2mm_1<block_size>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        tmp, A, B,
+        alpha,
+        ni, nj, nk );
 
       POLY_2MM_2_NBLOCKS_CUDA;
-      poly_2mm_2<block_size>
-                <<<nblocks2, nthreads_per_block, shmem, res.get_stream()>>>(tmp, C, D, beta,
-                                                                            ni, nl, nj);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_2mm_2<block_size>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        tmp, C, D,
+        beta,
+        ni, nl, nj );
 
     }
     stopTimer();
@@ -148,30 +156,38 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_2MM_1_NBLOCKS_CUDA;
-      poly_2mm_1_lam<block_size>
-                    <<<nblocks1, nthreads_per_block, shmem, res.get_stream()>>>(ni, nj,
-        [=] __device__ (Index_type i, Index_type j) {
-          POLYBENCH_2MM_BODY1;
-          for (Index_type k=0; k < nk; ++k) {
-            POLYBENCH_2MM_BODY2;
-          }
-          POLYBENCH_2MM_BODY3;
+
+      auto poly_2mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
+        POLYBENCH_2MM_BODY1;
+        for (Index_type k=0; k < nk; ++k) {
+          POLYBENCH_2MM_BODY2;
         }
-      );
-      cudaErrchk( cudaGetLastError() );
+        POLYBENCH_2MM_BODY3;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_2mm_1_lam<block_size, decltype(poly_2mm_1_lambda)>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_2mm_1_lambda );
 
       POLY_2MM_2_NBLOCKS_CUDA;
-      poly_2mm_2_lam<block_size>
-                    <<<nblocks2, nthreads_per_block, shmem, res.get_stream()>>>(ni, nl,
-        [=] __device__ (Index_type i, Index_type l) {
-          POLYBENCH_2MM_BODY4;
-          for (Index_type j=0; j < nj; ++j) {
-            POLYBENCH_2MM_BODY5;
-          }
-          POLYBENCH_2MM_BODY6;
+
+      auto poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) {
+        POLYBENCH_2MM_BODY4;
+        for (Index_type j=0; j < nj; ++j) {
+          POLYBENCH_2MM_BODY5;
         }
-      );
-      cudaErrchk( cudaGetLastError() );
+        POLYBENCH_2MM_BODY6;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_2mm_2_lam<block_size, decltype(poly_2mm_2_lambda)>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_2mm_2_lambda );
 
     }
     stopTimer();
diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp
index 1a0f26ecd..28308ef32 100644
--- a/src/polybench/POLYBENCH_2MM-Hip.cpp
+++ b/src/polybench/POLYBENCH_2MM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -125,18 +125,24 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_2MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_2mm_1<block_size>),
-                         dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         tmp, A, B, alpha,
-                         ni, nj, nk);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_2mm_1<block_size>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        tmp, A, B,
+        alpha,
+        ni, nj, nk );
 
       POLY_2MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_2mm_2<block_size>),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         tmp, C, D, beta,
-                         ni, nl, nj);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_2mm_2<block_size>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        tmp, C, D,
+        beta,
+        ni, nl, nj );
 
     }
     stopTimer();
@@ -149,6 +155,8 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid)
       POLY_2MM_THREADS_PER_BLOCK_HIP;
       constexpr size_t shmem = 0;
 
+      POLY_2MM_1_NBLOCKS_HIP;
+
       auto poly_2mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
         POLYBENCH_2MM_BODY1;
         for (Index_type k=0; k < nk; ++k) {
@@ -157,11 +165,14 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_2MM_BODY3;
       };
 
-      POLY_2MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_2mm_1_lam<block_size, decltype(poly_2mm_1_lambda)>),
-                         dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nj, poly_2mm_1_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_2mm_1_lam<block_size, decltype(poly_2mm_1_lambda)>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_2mm_1_lambda );
+
+      POLY_2MM_2_NBLOCKS_HIP;
 
       auto poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) {
         POLYBENCH_2MM_BODY4;
@@ -171,11 +182,12 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_2MM_BODY6;
       };
 
-      POLY_2MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_2mm_2_lam<block_size, decltype(poly_2mm_2_lambda)>),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nl, poly_2mm_2_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_2mm_2_lam<block_size, decltype(poly_2mm_2_lambda)>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_2mm_2_lambda );
 
     }
     stopTimer();
diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp
index 8b6cdb290..b73813df8 100644
--- a/src/polybench/POLYBENCH_2MM-OMP.cpp
+++ b/src/polybench/POLYBENCH_2MM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
index 79d6f96c0..781139422 100644
--- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp
index 1a3120246..5cffa3207 100644
--- a/src/polybench/POLYBENCH_2MM-Seq.cpp
+++ b/src/polybench/POLYBENCH_2MM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp
new file mode 100644
index 000000000..867ad780e
--- /dev/null
+++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp
@@ -0,0 +1,172 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_2MM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+#include <cmath>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define in_wg_sz (32)
+#define out_wg_sz (work_group_size / in_wg_sz)
+
+
+template < size_t work_group_size >
+void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_2MM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    sycl::range<3> global_dim1(1,
+                               out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                               in_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, in_wg_sz));
+
+    sycl::range<3> global_dim2(1,
+                               out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                               in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
+
+    sycl::range<3> wkgroup_dim(1, out_wg_sz, in_wg_sz);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type i = item.get_global_id(1);
+          Index_type j = item.get_global_id(2);
+
+          if (i < ni && j < nj) {
+            POLYBENCH_2MM_BODY1;
+            for (Index_type k=0; k < nk; ++k) {
+              POLYBENCH_2MM_BODY2;
+            }
+            POLYBENCH_2MM_BODY3;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim2, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type i = item.get_global_id(1);
+          Index_type l = item.get_global_id(2);
+
+          if (i < ni && l < nl) {
+            POLYBENCH_2MM_BODY4;
+            for (Index_type j=0; j < nj; ++j) {
+              POLYBENCH_2MM_BODY5;
+            }
+            POLYBENCH_2MM_BODY6;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_2MM_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_1<out_wg_sz>,
+            RAJA::statement::For<1, RAJA::sycl_global_2<in_wg_sz>,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,
+              RAJA::statement::For<2, RAJA::seq_exec,
+                RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
+              >,
+              RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nj},
+                         RAJA::RangeSegment{0, nk}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_2MM_BODY1_RAJA;
+        },
+        [=] (Index_type i, Index_type j, Index_type k,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY2_RAJA;
+        },
+        [=] (Index_type i, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY3_RAJA;
+        }
+      );
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nl},
+                         RAJA::RangeSegment{0, nj}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_2MM_BODY4_RAJA;
+        },
+        [=] (Index_type i, Index_type l, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY5_RAJA;
+        },
+        [=] (Index_type i, Index_type l,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY6_RAJA;
+        }
+      );
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  POLYBENCH_2MM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_2MM, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp
index 5c0ebe484..55cc577cc 100644
--- a/src/polybench/POLYBENCH_2MM.cpp
+++ b/src/polybench/POLYBENCH_2MM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -31,9 +31,9 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
                                   ni_default*nl_default ) );
   setDefaultReps(2);
 
-  m_ni = std::sqrt( getTargetProblemSize() ) + 1;
+  m_ni = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1;
   m_nj = m_ni;
-  m_nk = nk_default;
+  m_nk = Index_type(double(nk_default)/ni_default*m_ni);
   m_nl = m_ni;
 
   m_alpha = 1.5;
@@ -44,13 +44,15 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
   setItsPerRep( m_ni*m_nj + m_ni*m_nl );
   setKernelsPerRep(2);
-  setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nk +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
+                      1*sizeof(Real_type ) * m_nj * m_nk +
 
-                  (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nl +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nl );
+                      1*sizeof(Real_type ) * m_ni * m_nj +
+                      1*sizeof(Real_type ) * m_nj * m_nl );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
+
+                         1*sizeof(Real_type ) * m_ni * m_nl );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(3 * m_ni*m_nj*m_nk +
                  2 * m_ni*m_nj*m_nl );
@@ -78,6 +80,9 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 POLYBENCH_2MM::~POLYBENCH_2MM()
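The RAJA_SYCL variant above expresses each matrix product with
RAJA::kernel_param_resource: Lambda<0> initializes the Params<0> accumulator,
Lambda<1> runs in the innermost seq_exec loop, and Lambda<2> stores the result.
Its sequential equivalent is the sketch below (the BODY macros use RAJA views;
this flat row-major indexing is an assumption for illustration):

    // Sequential shape of the first 2MM kernel; dot plays the Params<0> role.
    void twomm_first(double* tmp, const double* A, const double* B,
                     double alpha, int ni, int nj, int nk)
    {
      for (int i = 0; i < ni; ++i) {
        for (int j = 0; j < nj; ++j) {
          double dot = 0.0;                              // Lambda<0> (BODY1)
          for (int k = 0; k < nk; ++k) {
            dot += alpha * A[i*nk + k] * B[k*nj + j];    // Lambda<1> (BODY2)
          }
          tmp[i*nj + j] = dot;                           // Lambda<2> (BODY3)
        }
      }
    }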
diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp
index e11d4889b..541682454 100644
--- a/src/polybench/POLYBENCH_2MM.hpp
+++ b/src/polybench/POLYBENCH_2MM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -127,18 +127,23 @@ class POLYBENCH_2MM : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Index_type m_ni;
   Index_type m_nj;
diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp
index 9131a629a..401660aca 100644
--- a/src/polybench/POLYBENCH_3MM-Cuda.cpp
+++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -159,22 +159,31 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_3MM_1_NBLOCKS_CUDA;
-      poly_3mm_1<block_size>
-                <<<nblocks1, nthreads_per_block, shmem, res.get_stream()>>>(E, A, B,
-                                                                            ni, nj, nk);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_3mm_1<block_size>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        E, A, B,
+        ni, nj, nk );
 
       POLY_3MM_2_NBLOCKS_CUDA;
-      poly_3mm_2<block_size>
-                <<<nblocks2, nthreads_per_block, shmem, res.get_stream()>>>(F, C, D,
-                                                                            nj, nl, nm);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_3mm_2<block_size>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        F, C, D,
+        nj, nl, nm );
 
       POLY_3MM_3_NBLOCKS_CUDA;
-      poly_3mm_3<block_size>
-                <<<nblocks3, nthreads_per_block, shmem, res.get_stream()>>>(G, E, F,
-                                                                            ni, nl, nj);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_3mm_3<block_size>),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        G, E, F,
+        ni, nl, nj );
 
     }
     stopTimer();
@@ -188,43 +197,55 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_3MM_1_NBLOCKS_CUDA;
-      poly_3mm_1_lam<block_size>
-                    <<<nblocks1, nthreads_per_block, shmem, res.get_stream()>>>(ni, nj,
-        [=] __device__ (Index_type i, Index_type j) {
-          POLYBENCH_3MM_BODY1;
-          for (Index_type k=0; k < nk; ++k) {
-            POLYBENCH_3MM_BODY2;
-          }
-          POLYBENCH_3MM_BODY3;
+
+      auto poly_3mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
+        POLYBENCH_3MM_BODY1;
+        for (Index_type k=0; k < nk; ++k) {
+          POLYBENCH_3MM_BODY2;
         }
-      );
-      cudaErrchk( cudaGetLastError() );
+        POLYBENCH_3MM_BODY3;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_3mm_1_lam<block_size, decltype(poly_3mm_1_lambda)>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_3mm_1_lambda );
 
       POLY_3MM_2_NBLOCKS_CUDA;
-      poly_3mm_2_lam<block_size>
-                    <<<nblocks2, nthreads_per_block, shmem, res.get_stream()>>>(nj, nl,
-        [=] __device__ (Index_type j, Index_type l) {
-          POLYBENCH_3MM_BODY4;
-          for (Index_type m=0; m < nm; ++m) {
-            POLYBENCH_3MM_BODY5;
-          }
-          POLYBENCH_3MM_BODY6;
-        }
-      );
-      cudaErrchk( cudaGetLastError() );
+
+      auto poly_3mm_2_lambda = [=] __device__ (Index_type j, Index_type l) {
+        POLYBENCH_3MM_BODY4;
+        for (Index_type m=0; m < nm; ++m) {
+          POLYBENCH_3MM_BODY5;
+        }
+        POLYBENCH_3MM_BODY6;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_3mm_2_lam<block_size, decltype(poly_3mm_2_lambda)>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        nj, nl, poly_3mm_2_lambda );
 
       POLY_3MM_3_NBLOCKS_CUDA;
-      poly_3mm_3_lam<block_size>
-                    <<<nblocks3, nthreads_per_block, shmem, res.get_stream()>>>(ni, nl,
-        [=] __device__ (Index_type i, Index_type l) {
-          POLYBENCH_3MM_BODY7;
-          for (Index_type j=0; j < nj; ++j) {
-            POLYBENCH_3MM_BODY8;
-          }
-          POLYBENCH_3MM_BODY9;
+
+      auto poly_3mm_3_lambda = [=] __device__ (Index_type i, Index_type l) {
+        POLYBENCH_3MM_BODY7;
+        for (Index_type j=0; j < nj; ++j) {
+          POLYBENCH_3MM_BODY8;
         }
-      );
-      cudaErrchk( cudaGetLastError() );
+        POLYBENCH_3MM_BODY9;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_3mm_3_lam<block_size, decltype(poly_3mm_3_lambda)>),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_3mm_3_lambda );
 
     }
     stopTimer();
diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp
index 2cdf3df13..53e106ad1 100644
--- a/src/polybench/POLYBENCH_3MM-Hip.cpp
+++ b/src/polybench/POLYBENCH_3MM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -158,25 +158,31 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_3MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_1<block_size>),
-                         dim3(nblocks1) , dim3(nthreads_per_block), shmem, res.get_stream(),
-                         E, A, B,
-                         ni, nj, nk);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_3mm_1<block_size>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        E, A, B,
+        ni, nj, nk );
 
       POLY_3MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_2<block_size>),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         F, C, D,
-                         nj, nl, nm);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_3mm_2<block_size>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        F, C, D,
+        nj, nl, nm );
 
       POLY_3MM_3_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_3<block_size>),
-                         dim3(nblocks3), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         G, E, F,
-                         ni, nl, nj);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_3mm_3<block_size>),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        G, E, F,
+        ni, nl, nj );
 
     }
     stopTimer();
@@ -189,6 +195,8 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
       POLY_3MM_THREADS_PER_BLOCK_HIP;
       constexpr size_t shmem = 0;
 
+      POLY_3MM_1_NBLOCKS_HIP;
+
       auto poly_3mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
         POLYBENCH_3MM_BODY1;
         for (Index_type k=0; k < nk; ++k) {
@@ -197,11 +205,14 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY3;
       };
 
-      POLY_3MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_1_lam<block_size, decltype(poly_3mm_1_lambda)>),
-                         dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nj, poly_3mm_1_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_3mm_1_lam<block_size, decltype(poly_3mm_1_lambda)>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_3mm_1_lambda );
+
+      POLY_3MM_2_NBLOCKS_HIP;
 
       auto poly_3mm_2_lambda = [=] __device__ (Index_type j, Index_type l) {
         POLYBENCH_3MM_BODY4;
@@ -211,11 +222,14 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY6;
       };
 
-      POLY_3MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_2_lam<block_size, decltype(poly_3mm_2_lambda)>),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         nj, nl, poly_3mm_2_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_3mm_2_lam<block_size, decltype(poly_3mm_2_lambda)>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        nj, nl, poly_3mm_2_lambda );
+
+      POLY_3MM_3_NBLOCKS_HIP;
 
       auto poly_3mm_3_lambda = [=] __device__ (Index_type i, Index_type l) {
         POLYBENCH_3MM_BODY7;
@@ -225,11 +239,12 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY9;
       };
 
-      POLY_3MM_3_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_3_lam<block_size, decltype(poly_3mm_3_lambda)>),
-                         dim3(nblocks3), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nl, poly_3mm_3_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_3mm_3_lam<block_size, decltype(poly_3mm_3_lambda)>),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_3mm_3_lambda );
 
     }
     stopTimer();
diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp
index 966853d7d..19b15098a 100644
--- a/src/polybench/POLYBENCH_3MM-OMP.cpp
+++ b/src/polybench/POLYBENCH_3MM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
index c25a49dee..d165426d1 100644
--- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp
index 619b2ff10..24098e109 100644
--- a/src/polybench/POLYBENCH_3MM-Seq.cpp
+++ b/src/polybench/POLYBENCH_3MM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-Sycl.cpp b/src/polybench/POLYBENCH_3MM-Sycl.cpp
new file mode 100644
index 000000000..b6abea7b9
--- /dev/null
+++ b/src/polybench/POLYBENCH_3MM-Sycl.cpp
@@ -0,0 +1,216 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_3MM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define in_wg_sz (32)
+#define out_wg_sz (work_group_size / in_wg_sz)
+
+
+template < size_t work_group_size >
+void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_3MM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      sycl::range<3> global_dim1(1,
+                                 out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, in_wg_sz));
+
+      sycl::range<3> global_dim2(1,
+                                 out_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
+
+      sycl::range<3> global_dim3(1,
+                                 out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
+
+      sycl::range<3> wkgroup_dim(1, out_wg_sz, in_wg_sz);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type i = item.get_global_id(1);
+          Index_type j = item.get_global_id(2);
+
+          if (i < ni && j < nj) {
+            POLYBENCH_3MM_BODY1;
+            for (Index_type k=0; k < nk; ++k) {
+              POLYBENCH_3MM_BODY2;
+            }
+            POLYBENCH_3MM_BODY3;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim2, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type j = item.get_global_id(1);
+          Index_type l = item.get_global_id(2);
+
+          if (j < nj && l < nl) {
+            POLYBENCH_3MM_BODY4;
+            for (Index_type m=0; m < nm; ++m) {
+              POLYBENCH_3MM_BODY5;
+            }
+            POLYBENCH_3MM_BODY6;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim3, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type i = item.get_global_id(1);
+          Index_type l = item.get_global_id(2);
+
+          if (i < ni && l < nl) {
+            POLYBENCH_3MM_BODY7;
+            for (Index_type j=0; j < nj; ++j) {
+              POLYBENCH_3MM_BODY8;
+            }
+            POLYBENCH_3MM_BODY9;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_3MM_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_1<out_wg_sz>,
+            RAJA::statement::For<1, RAJA::sycl_global_2<in_wg_sz>,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,
+              RAJA::statement::For<2, RAJA::seq_exec,
+                RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
+              >,
+              RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nj},
+                         RAJA::RangeSegment{0, nk}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_3MM_BODY1_RAJA;
+        },
+        [=] (Index_type i, Index_type j, Index_type k,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY2_RAJA;
+        },
+        [=] (Index_type i, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY3_RAJA;
+        }
+
+      );
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, nj},
+                         RAJA::RangeSegment{0, nl},
+                         RAJA::RangeSegment{0, nm}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_3MM_BODY4_RAJA;
+        },
+        [=] (Index_type j, Index_type l, Index_type m,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY5_RAJA;
+        },
+        [=] (Index_type j, Index_type l,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY6_RAJA;
+        }
+
+      );
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nl},
+                         RAJA::RangeSegment{0, nj}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_3MM_BODY7_RAJA;
+        },
+        [=] (Index_type i, Index_type l, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY8_RAJA;
+        },
+        [=] (Index_type i, Index_type l,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY9_RAJA;
+        }
+
+      );
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  POLYBENCH_3MM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_3MM, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp
index a649e2e89..eb8e63d66 100644
--- a/src/polybench/POLYBENCH_3MM.cpp
+++ b/src/polybench/POLYBENCH_3MM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -35,11 +35,11 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params)
   setDefaultProblemSize( ni_default * nj_default );
   setDefaultReps(2);
 
-  m_ni = std::sqrt( getTargetProblemSize() ) + 1;
+  m_ni = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1;
   m_nj = m_ni;
-  m_nk = nk_default;
+  m_nk = Index_type(double(nk_default)/ni_default*m_ni);
   m_nl = m_ni;
-  m_nm = nm_default;
+  m_nm = Index_type(double(nm_default)/ni_default*m_ni);
 
   setActualProblemSize( std::max( std::max( m_ni*m_nj, m_nj*m_nl ),
@@ -47,17 +47,20 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params)
   setItsPerRep( m_ni*m_nj + m_nj*m_nl + m_ni*m_nl );
   setKernelsPerRep(3);
-  setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nk +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
+                      1*sizeof(Real_type ) * m_nj * m_nk +
 
-                  (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_nj * m_nl +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nm +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nl * m_nm +
+                      1*sizeof(Real_type ) * m_nj * m_nm +
+                      1*sizeof(Real_type ) * m_nl * m_nm +
 
-                  (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nl +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nl );
+                      1*sizeof(Real_type ) * m_ni * m_nj +
+                      1*sizeof(Real_type ) * m_nj * m_nl );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
+
+                         1*sizeof(Real_type ) * m_nj * m_nl +
+
+                         1*sizeof(Real_type ) * m_ni * m_nl );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_ni*m_nj*m_nk +
                  2 * m_nj*m_nl*m_nm +
                  2 * m_ni*m_nj*m_nl );
@@ -86,6 +89,9 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 POLYBENCH_3MM::~POLYBENCH_3MM()
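As in 2MM above, POLYBENCH_3MM now scales m_nk and m_nm with m_ni instead of
pinning them at their compile-time defaults, so the matrices keep the default
aspect ratio as the target problem size changes. Illustrative arithmetic (the
default values here are made up, not taken from this patch):

    // Sketch: a hypothetical nk_default/ni_default ratio preserved at a new size.
    constexpr double nk_default = 1100.0, ni_default = 1000.0;  // made up
    constexpr long   m_ni = 2000;                               // made up
    constexpr long   m_nk = long(nk_default / ni_default * m_ni);  // == 2200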
a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 4331e3930..0d0cf79af 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -153,18 +153,23 @@ class POLYBENCH_3MM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index fc3348fff..ff5216dc9 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
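The sizing change in POLYBENCH_3MM.cpp above replaces the old "+ 1" rounding fudge with "+ sqrt(2) - 1" and, more significantly, scales nk and nm in proportion to ni instead of pinning them at their defaults, so all five extents now grow together with the target problem size. Restated as plain arithmetic (a sketch; the default extent values are parameters here because they are not shown in the hunks above):

    #include <cmath>
    #include <cstdint>

    using Index_type = std::int64_t;   // stand-in for the suite's Index_type
    struct Extents3MM { Index_type ni, nj, nk, nl, nm; };

    Extents3MM size_3mm(Index_type target, Index_type ni_default,
                        Index_type nk_default, Index_type nm_default)
    {
      Index_type ni = Index_type(std::sqrt(double(target)) + std::sqrt(2.0) - 1);
      return { ni,
               ni,                                              // nj tracks ni
               Index_type(double(nk_default)/ni_default * ni),  // was fixed at nk_default
               ni,                                              // nl tracks ni
               Index_type(double(nm_default)/ni_default * ni) };// was fixed at nm_default
    }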
// @@ -23,7 +23,7 @@ namespace polybench template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi1(const Index_type n, +__global__ void poly_adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -43,7 +43,7 @@ __global__ void adi1(const Index_type n, template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi2(const Index_type n, +__global__ void poly_adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -63,7 +63,7 @@ __global__ void adi2(const Index_type n, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void adi_lam(const Index_type n, +__global__ void poly_adi_lam(const Index_type n, Lambda body) { Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; @@ -92,15 +92,21 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - adi1<<>>(n, - a, b, c, d, f, - P, Q, U, V); - cudaErrchk( cudaGetLastError() ); - - adi2<<>>(n, - a, c, d, e, f, - P, Q, U, V); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_adi1), + grid_size, block_size, + shmem, res.get_stream(), + n, + a, b, c, + d, f, + P, Q, U, V ); + + RPlaunchCudaKernel( (poly_adi2), + grid_size, block_size, + shmem, res.get_stream(), + n, + a, c, d, + e, f, + P, Q, U, V ); } // tstep loop @@ -117,33 +123,39 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - adi_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_ADI_BODY2; - for (Index_type j = 1; j < n-1; ++j) { - POLYBENCH_ADI_BODY3; - } - POLYBENCH_ADI_BODY4; - for (Index_type k = n-2; k >= 1; --k) { - POLYBENCH_ADI_BODY5; - } + auto poly_adi1_lambda = [=] __device__ (Index_type i) { + POLYBENCH_ADI_BODY2; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY3; } - ); - cudaErrchk( cudaGetLastError() ); - - adi_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_ADI_BODY6; - for (Index_type j = 1; j < n-1; ++j) { - POLYBENCH_ADI_BODY7; - } - POLYBENCH_ADI_BODY8; - for (Index_type k = n-2; k >= 1; --k) { - POLYBENCH_ADI_BODY9; - } + POLYBENCH_ADI_BODY4; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY5; } - ); - cudaErrchk( cudaGetLastError() ); + }; + + RPlaunchCudaKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi1_lambda ); + + auto poly_adi2_lambda = [=] __device__ (Index_type i) { + POLYBENCH_ADI_BODY6; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY7; + } + POLYBENCH_ADI_BODY8; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY9; + } + }; + + RPlaunchCudaKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi2_lambda ); } // tstep loop diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index f5791ce88..51a69a986 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
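The ADI conversions above replace raw `<<<...>>>` launches, and the `cudaErrchk( cudaGetLastError() )` that every old call site repeated by hand, with the suite's RPlaunchCudaKernel helper (RPlaunchHipKernel on the HIP side). The helper is defined in the suite's common GPU utilities and is not part of this diff; the sketch below shows only the general shape such a wrapper plausibly has, under that assumption, not its actual definition:

    #include <cuda_runtime.h>
    #include <utility>

    // Hypothetical sketch: forward the launch configuration and arguments,
    // then do the error check the old call sites performed inline.
    template < typename Kernel, typename... Args >
    void launch_cuda_kernel_sketch(Kernel kernel,
                                   dim3 grid, dim3 block,
                                   size_t shmem, cudaStream_t stream,
                                   Args&&... args)
    {
      kernel<<<grid, block, shmem, stream>>>(std::forward<Args>(args)...);
      cudaErrchk( cudaGetLastError() );   // suite macro, as used in the old code
    }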
// See the RAJAPerf/LICENSE file for details. // @@ -23,7 +23,7 @@ namespace polybench template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi1(const Index_type n, +__global__ void poly_adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -43,7 +43,7 @@ __global__ void adi1(const Index_type n, template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi2(const Index_type n, +__global__ void poly_adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -63,7 +63,7 @@ __global__ void adi2(const Index_type n, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void adi_lam(const Index_type n, +__global__ void poly_adi_lam(const Index_type n, Lambda body) { Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; @@ -92,19 +92,21 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((adi1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + RPlaunchHipKernel( (poly_adi1), + grid_size, block_size, + shmem, res.get_stream(), n, - a, b, c, d, f, - P, Q, U, V); - hipErrchk( hipGetLastError() ); + a, b, c, + d, f, + P, Q, U, V ); - hipLaunchKernelGGL((adi2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + RPlaunchHipKernel( (poly_adi2), + grid_size, block_size, + shmem, res.get_stream(), n, - a, c, d, e, f, - P, Q, U, V); - hipErrchk( hipGetLastError() ); + a, c, d, + e, f, + P, Q, U, V ); } // tstep loop @@ -121,7 +123,7 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - auto adi1_lamda = [=] __device__ (Index_type i) { + auto poly_adi1_lambda = [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; @@ -132,12 +134,13 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, adi1_lamda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi1_lambda ); - auto adi2_lamda = [=] __device__ (Index_type i) { + auto poly_adi2_lambda = [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY7; @@ -148,10 +151,11 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, adi2_lamda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi2_lambda ); } // tstep loop diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp index 022888a54..08df1e9f7 100644 --- a/src/polybench/POLYBENCH_ADI-OMP.cpp +++ b/src/polybench/POLYBENCH_ADI-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
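In both the CUDA and HIP lambda variants above, the device lambda is now bound to a named variable before the launch, so the launch call itself stays a single readable statement. Written out with its template arguments, the last launch above plausibly reads as in this sketch, where `decltype` supplies the lambda's type to the kernel template (a sketch, not a verbatim quote of the file):

    auto poly_adi2_lambda = [=] __device__ (Index_type i) {
      /* POLYBENCH_ADI_BODY6..BODY9 expand here */
    };

    RPlaunchHipKernel( (poly_adi_lam<block_size, decltype(poly_adi2_lambda)>),
                       grid_size, block_size,
                       shmem, res.get_stream(),
                       n, poly_adi2_lambda );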
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index c67e5a20a..8cee39e2c 100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp index 899a9a57b..3ec703a50 100644 --- a/src/polybench/POLYBENCH_ADI-Seq.cpp +++ b/src/polybench/POLYBENCH_ADI-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-Sycl.cpp b/src/polybench/POLYBENCH_ADI-Sycl.cpp new file mode 100644 index 000000000..0fb3dfb4c --- /dev/null +++ b/src/polybench/POLYBENCH_ADI-Sycl.cpp @@ -0,0 +1,169 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_ADI.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +template < size_t work_group_size > +void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_ADI_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 1; t <= tsteps; ++t) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(n-2, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0) + 1; + + if (i < n-1) { + POLYBENCH_ADI_BODY2; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY3; + } + POLYBENCH_ADI_BODY4; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY5; + } + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0) + 1; + + if (i < n-1) { + POLYBENCH_ADI_BODY6; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY7; + } + POLYBENCH_ADI_BODY8; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY9; + } + } + + }); + }); + + } // tstep loop + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_ADI_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_2, + RAJA::statement::Lambda<0, RAJA::Segs<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + 
RAJA::statement::Lambda<1, RAJA::Segs<0,1>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>>, + RAJA::statement::For<2, RAJA::seq_exec, + RAJA::statement::Lambda<3, RAJA::Segs<0,2>> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 1; t <= tsteps; ++t) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, + RAJA::RangeSegment{1, n-1}, + RAJA::RangeStrideSegment{n-2, 0, -1}), + res, + + [=] (Index_type i) { + POLYBENCH_ADI_BODY2_RAJA; + }, + [=] (Index_type i, Index_type j) { + POLYBENCH_ADI_BODY3_RAJA; + }, + [=] (Index_type i) { + POLYBENCH_ADI_BODY4_RAJA; + }, + [=] (Index_type i, Index_type k) { + POLYBENCH_ADI_BODY5_RAJA; + } + ); + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, + RAJA::RangeSegment{1, n-1}, + RAJA::RangeStrideSegment{n-2, 0, -1}), + res, + + [=] (Index_type i) { + POLYBENCH_ADI_BODY6_RAJA; + }, + [=] (Index_type i, Index_type j) { + POLYBENCH_ADI_BODY7_RAJA; + }, + [=] (Index_type i) { + POLYBENCH_ADI_BODY8_RAJA; + }, + [=] (Index_type i, Index_type k) { + POLYBENCH_ADI_BODY9_RAJA; + } + ); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_ADI : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_ADI, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 7a31468a6..1347975f2 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
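The RAJA_SYCL variant of ADI above drives the backward substitution with `RAJA::RangeStrideSegment{n-2, 0, -1}`, which visits k = n-2, n-3, ..., 1 (the end bound is exclusive), exactly matching the `for (Index_type k = n-2; k >= 1; --k)` loops in the other variants. A minimal self-contained illustration:

    #include "RAJA/RAJA.hpp"
    #include <cstdio>

    int main()
    {
      RAJA::Index_type n = 6;
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeStrideSegment(n-2, 0, -1),
        [=] (RAJA::Index_type k) {
          std::printf("%ld ", static_cast<long>(k));   // prints: 4 3 2 1
        });
      return 0;
    }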
// @@ -20,12 +20,12 @@ namespace polybench POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) : KernelBase(rajaperf::Polybench_ADI, params) { - Index_type n_default = 1000; + Index_type n_default = 1002; setDefaultProblemSize( (n_default-2) * (n_default-2) ); setDefaultReps(4); - m_n = std::sqrt( getTargetProblemSize() ) + 1; + m_n = std::sqrt( getTargetProblemSize() ) + 2 + std::sqrt(2)-1; m_tsteps = 4; setItsPerRep( m_tsteps * ( (m_n-2) + (m_n-2) ) ); @@ -34,8 +34,11 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setActualProblemSize( (m_n-2) * (m_n-2) ); setKernelsPerRep( m_tsteps * 2 ); - setBytesPerRep( m_tsteps * ( (3*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_n * (m_n-2) + - (3*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_n * (m_n-2) ) ); + setBytesReadPerRep((3*sizeof(Real_type ) * m_n * (m_n-2) + + 3*sizeof(Real_type ) * m_n * (m_n-2)) * m_tsteps ); + setBytesWrittenPerRep((3*sizeof(Real_type ) * m_n * (m_n-2) + + 3*sizeof(Real_type ) * m_n * (m_n-2)) * m_tsteps ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( (15 + 2) * (m_n-2)*(m_n-2) + (15 + 2) * (m_n-2)*(m_n-2) ) ); @@ -63,6 +66,9 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_ADI::~POLYBENCH_ADI() diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index 848fb9dc4..613202509 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -195,17 +195,22 @@ class POLYBENCH_ADI : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_n; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index a787276ec..e83dc9590 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
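POLYBENCH_ADI.cpp above also switches from a single combined setBytesPerRep() figure to the split accounting used throughout this change: bytes read, bytes written, and atomically modified bytes are reported separately. Restated as plain arithmetic, with Real_type assumed to be double:

    #include <cstddef>

    struct TrafficPerRep { std::size_t read, written, atomic; };

    TrafficPerRep adi_traffic(std::size_t n, std::size_t tsteps)
    {
      const std::size_t R = sizeof(double);
      // Each of the two kernels sweeps three arrays over an n x (n-2) extent.
      const std::size_t per_kernel = 3 * R * n * (n - 2);
      return { (per_kernel + per_kernel) * tsteps,   // setBytesReadPerRep
               (per_kernel + per_kernel) * tsteps,   // setBytesWrittenPerRep
               0 };                                  // setBytesAtomicModifyWrittenPerRep
    }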
// @@ -83,11 +83,17 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_atax_1<<>>(A, x, y, tmp, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_atax_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, tmp, + N ); - poly_atax_2<<>>(A, tmp, y, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_atax_2), + grid_size, block_size, + shmem, res.get_stream(), + A, tmp, y, + N ); } stopTimer(); @@ -100,27 +106,33 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_atax_lam<<>>(N, - [=] __device__ (Index_type i) { - POLYBENCH_ATAX_BODY1; - for (Index_type j = 0; j < N; ++j ) { - POLYBENCH_ATAX_BODY2; - } - POLYBENCH_ATAX_BODY3; + auto poly_atax1_lambda = [=] __device__ (Index_type i) { + POLYBENCH_ATAX_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_ATAX_BODY2; } - ); - cudaErrchk( cudaGetLastError() ); - - poly_atax_lam<<>>(N, - [=] __device__ (Index_type j) { - POLYBENCH_ATAX_BODY4; - for (Index_type i = 0; i < N; ++i ) { - POLYBENCH_ATAX_BODY5; - } - POLYBENCH_ATAX_BODY6; + POLYBENCH_ATAX_BODY3; + }; + + RPlaunchCudaKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax1_lambda ); + + auto poly_atax2_lambda = [=] __device__ (Index_type j) { + POLYBENCH_ATAX_BODY4; + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_ATAX_BODY5; } - ); - cudaErrchk( cudaGetLastError() ); + POLYBENCH_ATAX_BODY6; + }; + + RPlaunchCudaKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax2_lambda ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 3ac954ab8..bbed90a83 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -83,15 +83,17 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_atax_1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, y, tmp, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_atax_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, tmp, + N ); - hipLaunchKernelGGL((poly_atax_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, tmp, y, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_atax_2), + grid_size, block_size, + shmem, res.get_stream(), + A, tmp, y, + N ); } stopTimer(); @@ -104,7 +106,7 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - auto poly_atax_1_lambda = [=] __device__ (Index_type i) { + auto poly_atax1_lambda = [=] __device__ (Index_type i) { POLYBENCH_ATAX_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_ATAX_BODY2; @@ -112,12 +114,13 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) POLYBENCH_ATAX_BODY3; }; - hipLaunchKernelGGL((poly_atax_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - N, poly_atax_1_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax1_lambda ); - auto poly_atax_2_lambda = [=] __device__ (Index_type j) { + auto poly_atax2_lambda = [=] __device__ (Index_type j) { POLYBENCH_ATAX_BODY4; for (Index_type i = 0; i < N; ++i ) { POLYBENCH_ATAX_BODY5; @@ -125,10 +128,11 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) POLYBENCH_ATAX_BODY6; }; - hipLaunchKernelGGL((poly_atax_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - N, poly_atax_2_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax2_lambda ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp index fda8ab7fd..a3880cc8f 100644 --- a/src/polybench/POLYBENCH_ATAX-OMP.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index e9e13e9cd..ce7ba4843 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp index 05a19093d..4791b7018 100644 --- a/src/polybench/POLYBENCH_ATAX-Seq.cpp +++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-Sycl.cpp b/src/polybench/POLYBENCH_ATAX-Sycl.cpp new file mode 100644 index 000000000..110e58cd0 --- /dev/null +++ b/src/polybench/POLYBENCH_ATAX-Sycl.cpp @@ -0,0 +1,161 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_ATAX.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +template < size_t work_group_size > +void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_ATAX_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + + if (i < N) { + POLYBENCH_ATAX_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_ATAX_BODY2; + } + POLYBENCH_ATAX_BODY3; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type j = item.get_global_id(0); + + if (j < N) { + POLYBENCH_ATAX_BODY4; + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_ATAX_BODY5; + } + POLYBENCH_ATAX_BODY6; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_ATAX_VIEWS_RAJA; + + using EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> + > + > + >; + + using EXEC_POL2 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<1, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<1>, RAJA::Params<0>>, + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<1>, RAJA::Params<0>> + > + > + >; + + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_param_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple{0.0}, + res, + + [=] (Index_type i, Real_type &dot) { + POLYBENCH_ATAX_BODY1_RAJA; + }, 
+ [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY2_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_ATAX_BODY3_RAJA; + } + + ); + + RAJA::kernel_param_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple{0.0}, + res, + + [=] (Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY4_RAJA; + }, + [=] (Index_type i, Index_type j , Real_type &dot) { + POLYBENCH_ATAX_BODY5_RAJA; + }, + [=] (Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY6_RAJA; + } + + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_ATAX : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_ATAX, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index 5a9d15e89..e7cb48875 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,18 +26,21 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(100); - m_N = std::sqrt( getTargetProblemSize() )+1; + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); setItsPerRep( m_N + m_N ); setKernelsPerRep(2); - setBytesPerRep( (2*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N + - - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N + + + 1*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); @@ -65,6 +68,9 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_ATAX::~POLYBENCH_ATAX() diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index f94ade140..e6d43bfbc 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
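The Base_SYCL ATAX kernels above use the same guarded 1D launch recipe as the other new SYCL files: round the global size up to a work-group multiple, then mask the overhang inside the kernel. A self-contained sketch of the recipe (the body is a stand-in, not an ATAX macro):

    #include <sycl/sycl.hpp>
    #include <cstddef>

    void guarded_launch(sycl::queue& q, double* y, std::size_t N,
                        std::size_t wg /* work_group_size, e.g. 256 */)
    {
      const std::size_t global = wg * ((N + wg - 1) / wg);  // RAJA_DIVIDE_CEILING_INT
      q.submit([&](sycl::handler& h) {
        h.parallel_for(sycl::nd_range<1>(global, wg),
          [=](sycl::nd_item<1> item) {
            std::size_t i = item.get_global_id(0);
            if (i < N) {     // overhang work-items fall through
              y[i] = 0.0;    // stand-in body
            }
          });
      });
    }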
// @@ -115,17 +115,22 @@ class POLYBENCH_ATAX : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Real_ptr m_tmp; diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 415e2fc94..b71772a99 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -160,23 +160,32 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1<<>>(ey, fict, ny, t); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (poly_fdtd2d_1), + grid_size1, block_size, + shmem, res.get_stream(), + ey, fict, ny, t ); FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_2 - <<>>(ey, hz, nx, ny); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_fdtd2d_2), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ey, hz, nx, ny ); - poly_fdtd2d_3 - <<>>(ex, hz, nx, ny); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_fdtd2d_3), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ex, hz, nx, ny ); - poly_fdtd2d_4 - <<>>(hz, ex, ey, nx, ny); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_fdtd2d_4), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + hz, ex, ey, nx, ny ); } // tstep loop @@ -193,38 +202,55 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1_lam<<>>(ny, - [=] __device__ (Index_type j) { - POLYBENCH_FDTD_2D_BODY1; - } - ); - FDTD_2D_THREADS_PER_BLOCK_CUDA; - FDTD_2D_NBLOCKS_CUDA; + auto poly_fdtd2d_1_lambda = [=] __device__ (Index_type j) { + POLYBENCH_FDTD_2D_BODY1; + }; - poly_fdtd2d_2_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY2; - } - ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_fdtd2d_1_lam), + grid_size1, block_size, + shmem, res.get_stream(), + ny, poly_fdtd2d_1_lambda ); - poly_fdtd2d_3_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY3; - } - ); - cudaErrchk( cudaGetLastError() ); + FDTD_2D_THREADS_PER_BLOCK_CUDA; + FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_4_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY4; - } - ); - cudaErrchk( cudaGetLastError() ); + 
auto poly_fdtd2d_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_2_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_2_lambda ); + + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY3; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_3_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_3_lambda ); + + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY4; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_4_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_4_lambda ); } // tstep loop diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index ad8bd66d1..ddb62d9a5 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -159,28 +159,32 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1), - dim3(grid_size1), dim3(block_size), shmem, res.get_stream(), - ey, fict, ny, t); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (poly_fdtd2d_1), + grid_size1, block_size, + shmem, res.get_stream(), + ey, fict, ny, t ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_fdtd2d_2), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - ey, hz, nx, ny); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_2), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ey, hz, nx, ny ); - hipLaunchKernelGGL((poly_fdtd2d_3), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - ex, hz, nx, ny); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_3), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ex, hz, nx, ny ); - hipLaunchKernelGGL((poly_fdtd2d_4), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - hz, ex, ey, nx, ny); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_4), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + hz, ex, ey, nx, ny ); } // tstep loop @@ -196,48 +200,56 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; + const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); + auto poly_fdtd2d_1_lambda = [=] __device__ (Index_type j) { POLYBENCH_FDTD_2D_BODY1; }; - const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1_lam), - dim3(grid_size1), dim3(block_size), shmem, res.get_stream(), - ny, poly_fdtd2d_1_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_fdtd2d_1_lam), + grid_size1, block_size, + shmem, res.get_stream(), + ny, poly_fdtd2d_1_lambda ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - auto poly_fdtd2d_2_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY2; - }; + auto poly_fdtd2d_2_lambda = [=] 
__device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY2; + }; - hipLaunchKernelGGL((poly_fdtd2d_2_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_2_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_2_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_2_lambda ); - auto poly_fdtd2d_3_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY3; - }; + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY3; + }; - hipLaunchKernelGGL((poly_fdtd2d_3_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_3_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_3_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_3_lambda ); - auto poly_fdtd2d_4_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY4; - }; + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY4; + }; - hipLaunchKernelGGL((poly_fdtd2d_4_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_4_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_4_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_4_lambda ); } // tstep loop diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp index 28d06bdc7..6eaf696f3 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index c34d939ad..be8c0491e 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp index cc23d5a18..ed8e43833 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp new file mode 100644 index 000000000..b409b7569 --- /dev/null +++ b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp @@ -0,0 +1,185 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_FDTD_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_FDTD_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (t = 0; t < tsteps; ++t) { + + const size_t global_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(ny, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size1, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type j = item.get_global_id(0); + if (j < ny) { + POLYBENCH_FDTD_2D_BODY1; + } + + }); + }); + + sycl::range<3> global_dim234(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(nx, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(ny, j_wg_sz)); + + sycl::range<3> wkgroup_dim234(1, i_wg_sz, j_wg_sz); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i > 0 && i < nx && j < ny) { + POLYBENCH_FDTD_2D_BODY2; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < nx && j > 0 && j < ny) { + POLYBENCH_FDTD_2D_BODY3; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < nx-1 && j < ny-1) { + POLYBENCH_FDTD_2D_BODY4; + } + + }); + }); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_FDTD_2D_VIEWS_RAJA; + + using EXEC_POL1 = RAJA::sycl_exec; + + using EXEC_POL234 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (t = 0; t < tsteps; ++t) { + + RAJA::forall( res, RAJA::RangeSegment(0, ny), + [=] (Index_type j) { + POLYBENCH_FDTD_2D_BODY1_RAJA; + }); + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, nx}, + RAJA::RangeSegment{0, ny}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY2_RAJA; + } + ); + + 
RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, nx}, + RAJA::RangeSegment{1, ny}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY3_RAJA; + } + ); + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, nx-1}, + RAJA::RangeSegment{0, ny-1}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY4_RAJA; + } + ); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_FDTD_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_FDTD_2D, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index ed2432d87..7f87fd3ef 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -31,7 +31,7 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) nx_default * (ny_default-1) ) ); setDefaultReps(8); - m_nx = std::sqrt( getTargetProblemSize() ) + 1; + m_nx = std::sqrt( getTargetProblemSize() ) + 1 + std::sqrt(2)-1; m_ny = m_nx; m_tsteps = 40; @@ -43,18 +43,25 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) m_nx*(m_ny-1) + (m_nx-1)*(m_ny-1) ) ); setKernelsPerRep(m_tsteps * 4); - setBytesPerRep( m_tsteps * ( (0*sizeof(Real_type ) + 1*sizeof(Real_type )) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ny + + setBytesReadPerRep((1*sizeof(Real_type ) + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * m_ny + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * m_ny + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + 1*sizeof(Real_type ) * m_nx * m_ny + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * (m_ny-1) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * m_ny + + 1*sizeof(Real_type ) * m_nx * (m_ny-1) + + 1*sizeof(Real_type ) * m_nx * m_ny + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * (m_ny-1) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * m_ny + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * (m_ny-1) ) ); + 1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + 1*sizeof(Real_type ) * m_nx * (m_ny-1)) * m_tsteps ); + setBytesWrittenPerRep((1*sizeof(Real_type ) * m_ny + + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + + 1*sizeof(Real_type ) * m_nx * (m_ny-1) + + + 1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1)) * m_tsteps ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 0 * m_ny + 3 * (m_nx-1)*m_ny + 3 * m_nx*(m_ny-1) + @@ -84,6 +91,9 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_FDTD_2D::~POLYBENCH_FDTD_2D() diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index e1d1b67c3..685c4cf40 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -1,5 +1,5 @@ 
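In the RAJA_SYCL variant of FDTD-2D above, the 1D edge update runs through RAJA::forall while the three interior sweeps go through RAJA::kernel_resource. The forall policy is RAJA::sycl_exec, whose template parameters (a work-group size and an async flag) carry the tuning point; spelled out with literal values, the edge update plausibly looks like this sketch, with the macro body replaced by an explicit stand-in:

    #include "RAJA/RAJA.hpp"

    using EXEC_POL1 = RAJA::sycl_exec<256 /*work_group_size*/, true /*async*/>;

    void edge_update(RAJA::resources::Sycl res, double* ey, double fict_t,
                     RAJA::Index_type ny)
    {
      RAJA::forall<EXEC_POL1>(res, RAJA::RangeSegment(0, ny),
        [=] (RAJA::Index_type j) {
          ey[j] = fict_t;   // stand-in for POLYBENCH_FDTD_2D_BODY1_RAJA
        });
    }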
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -113,18 +113,23 @@ class POLYBENCH_FDTD_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_nx; Index_type m_ny; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index 7aff525c2..5edb43b97 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -85,10 +85,11 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_floyd_warshall - <<>>(pout, pin, - k, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_floyd_warshall), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + pout, pin, k, N ); } @@ -106,12 +107,17 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_floyd_warshall_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FLOYD_WARSHALL_BODY; - } - ); + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY; + }; + + RPlaunchCudaKernel( + (poly_floyd_warshall_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_floyd_warshall_lambda ); } @@ -138,10 +144,11 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, [=] __device__ (Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index c3581748c..ec408396a 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -85,11 +85,11 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_floyd_warshall), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - pout, pin, - k, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_floyd_warshall), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + pout, pin, k, N ); } @@ -103,20 +103,21 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) for (Index_type k = 0; k < N; ++k) { - auto poly_floyd_warshall_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FLOYD_WARSHALL_BODY; - }; - POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP; POLY_FLOYD_WARSHALL_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL( - (poly_floyd_warshall_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_floyd_warshall_lambda); - hipErrchk( hipGetLastError() ); + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY; + }; + + RPlaunchHipKernel( + (poly_floyd_warshall_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_floyd_warshall_lambda ); } @@ -143,10 +144,11 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, [=] __device__ (Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp index 1b2f57e4d..8aab52a55 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index e0e5d6d54..f3af4a088 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp index 36ac66a84..8b088db8e 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp new file mode 100644 index 000000000..415470801 --- /dev/null +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp @@ -0,0 +1,115 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_FLOYD_WARSHALL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_FLOYD_WARSHALL::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_FLOYD_WARSHALL_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(N, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(N, j_wg_sz)); + + sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = 0; k < N; ++k) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if ( i < N && j < N ) { + POLYBENCH_FLOYD_WARSHALL_BODY; + } + + }); + }); + + } + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<2, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, + [=] (Index_type k, Index_type i, Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_FLOYD_WARSHALL, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 03c1e65ba..149ae87aa 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
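Both new Floyd-Warshall variants above keep k strictly sequential: Base_SYCL loops over k on the host and submits one 2D kernel per step, and RAJA_SYCL places `For<0, RAJA::seq_exec>` outside SyclKernelAsync. That ordering is required because step k reads the distances produced by step k-1; only the i,j sweep within a step is independent. A plain-C++ statement of the classic in-place algorithm makes the dependency visible (a reference sketch, not the suite's pin/pout macro body):

    #include <algorithm>
    #include <cstddef>

    void floyd_warshall_ref(double* p, std::size_t N)
    {
      for (std::size_t k = 0; k < N; ++k) {     // must stay serial: step k
        for (std::size_t i = 0; i < N; ++i)     //   reads step k-1 results
          for (std::size_t j = 0; j < N; ++j)   // i,j sweep is parallelizable
            p[i*N + j] = std::min(p[i*N + j], p[i*N + k] + p[k*N + j]);
      }
    }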
// @@ -26,14 +26,16 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(8); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); setItsPerRep( m_N*m_N ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * m_N*m_N*m_N ); checksum_scale_factor = 1.0 * @@ -60,6 +62,9 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_FLOYD_WARSHALL::~POLYBENCH_FLOYD_WARSHALL() diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index e8a067377..618f6e0f6 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -76,18 +76,23 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index afaa17185..2307900dd 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
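Two constructor changes in the POLYBENCH_FLOYD_WARSHALL.cpp hunk above recur for every kernel in this diff: the single setBytesPerRep estimate is split into separate read, write, and atomic-modify-write counters, and the edge length becomes sqrt(target) + sqrt(2) - 1 rather than sqrt(target) + 1, which keeps the rounded m_N * m_N closer to the requested problem size. The Floyd-Warshall traffic itself is simple, one N x N matrix read and one written per rep; a standalone illustration of the arithmetic (not Suite code, target value made up):

    #include <cmath>
    #include <cstddef>
    #include <iostream>

    int main()
    {
      using Real_type = double;
      const double target = 1000.0 * 1000.0;  // assumed target problem size
      const std::size_t N =
          static_cast<std::size_t>(std::sqrt(target) + std::sqrt(2.0) - 1.0);

      const std::size_t bytes_read    = sizeof(Real_type) * N * N;  // pin
      const std::size_t bytes_written = sizeof(Real_type) * N * N;  // pout
      std::cout << "N = " << N << ", read/rep = " << bytes_read
                << " B, written/rep = " << bytes_written << " B\n";
      return 0;
    }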
// @@ -90,11 +90,13 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemm - <<>>(C, A, B, - alpha, beta, - ni, nj, nk); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_gemm), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + C, A, B, + alpha, beta, + ni, nj, nk ); } stopTimer(); @@ -108,18 +110,21 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemm_lam - <<>>(ni, nj, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMM_BODY1; - POLYBENCH_GEMM_BODY2; - for (Index_type k = 0; k < nk; ++k ) { - POLYBENCH_GEMM_BODY3; - } - POLYBENCH_GEMM_BODY4; + auto poly_gemm_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMM_BODY1; + POLYBENCH_GEMM_BODY2; + for (Index_type k = 0; k < nk; ++k ) { + POLYBENCH_GEMM_BODY3; } - ); - cudaErrchk( cudaGetLastError() ); + POLYBENCH_GEMM_BODY4; + }; + + RPlaunchCudaKernel( + (poly_gemm_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_gemm_lambda ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 4ee83f375..f92beaed0 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -90,11 +90,13 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gemm), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - C, A, B, alpha, beta, - ni, nj, nk); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_gemm), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + C, A, B, + alpha, beta, + ni, nj, nk ); } stopTimer(); @@ -117,10 +119,12 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) POLYBENCH_GEMM_BODY4; }; - hipLaunchKernelGGL((poly_gemm_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - ni, nj, poly_gemm_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_gemm_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_gemm_lambda ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp index 21f63e7f2..444af4df7 100644 --- a/src/polybench/POLYBENCH_GEMM-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
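The Lambda_CUDA hunk above hoists the formerly inline device lambda into a named poly_gemm_lambda before passing it to RPlaunchCudaKernel together with the templated poly_gemm_lam kernel. The wrapper-kernel shape this relies on, sketched with assumed types (the Suite's actual poly_gemm_lam is defined earlier in the same file):

    // Generic lambda-invoking kernel over a guarded 2D index space.
    template < size_t j_block_size, size_t i_block_size, typename Lambda >
    __launch_bounds__(j_block_size*i_block_size)
    __global__ void lam_kernel_2d(long ni, long nj, Lambda body)
    {
      long i = blockIdx.y * i_block_size + threadIdx.y;
      long j = blockIdx.x * j_block_size + threadIdx.x;
      if (i < ni && j < nj) {
        body(i, j);  // runs POLYBENCH_GEMM_BODY1..4 for this (i, j)
      }
    }

Naming the lambda keeps the launch call flat and makes its type nameable (via decltype) where the kernel template must be instantiated explicitly.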
// diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index a1d618b5b..a660ec35e 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp index 84fa70002..1aaaab119 100644 --- a/src/polybench/POLYBENCH_GEMM-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM-Sycl.cpp b/src/polybench/POLYBENCH_GEMM-Sycl.cpp new file mode 100644 index 000000000..2f4fc09d3 --- /dev/null +++ b/src/polybench/POLYBENCH_GEMM-Sycl.cpp @@ -0,0 +1,135 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GEMM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_GEMM::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GEMM_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, j_wg_sz)); + + sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < ni && j < nj) { + POLYBENCH_GEMM_BODY1; + POLYBENCH_GEMM_BODY2; + for (Index_type k = 0; k < nk; ++k) { + POLYBENCH_GEMM_BODY3; + } + POLYBENCH_GEMM_BODY4; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GEMM_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0, RAJA::Params<0>>, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>>, + RAJA::statement::For<2, RAJA::seq_exec, + RAJA::statement::Lambda<2, RAJA::Segs<0,1,2>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<3, RAJA::Segs<0,1>, RAJA::Params<0>> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 
0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+
+        RAJA::make_tuple( RAJA::RangeSegment{0, ni},
+                          RAJA::RangeSegment{0, nj},
+                          RAJA::RangeSegment{0, nk} ),
+        RAJA::tuple<Real_type>{0.0},  // variable for dot
+        res,
+
+        [=] (Real_type& dot) {
+          POLYBENCH_GEMM_BODY1_RAJA;
+        },
+        [=] (Index_type i, Index_type j) {
+          POLYBENCH_GEMM_BODY2_RAJA;
+        },
+        [=] (Index_type i, Index_type j, Index_type k,
+             Real_type& dot) {
+          POLYBENCH_GEMM_BODY3_RAJA;
+        },
+        [=] (Index_type i, Index_type j,
+             Real_type& dot) {
+          POLYBENCH_GEMM_BODY4_RAJA;
+        }
+      );
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n POLYBENCH_GEMM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GEMM, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp
index 6094ce908..48769b42f 100644
--- a/src/polybench/POLYBENCH_GEMM.cpp
+++ b/src/polybench/POLYBENCH_GEMM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,9 +28,9 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params)
   setDefaultProblemSize( ni_default * nj_default );
   setDefaultReps(4);

-  m_ni = std::sqrt( getTargetProblemSize() ) + 1;
+  m_ni = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1;
   m_nj = m_ni;
-  m_nk = nk_default;
+  m_nk = Index_type(double(nk_default)/ni_default*m_ni);

   m_alpha = 0.62;
   m_beta = 1.002;
@@ -40,9 +40,10 @@
   setItsPerRep( m_ni * m_nj );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nk +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
+                      1*sizeof(Real_type ) * m_nj * m_nk );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj);
+  setBytesAtomicModifyWrittenPerRep( 0 );

   setFLOPsPerRep((1 + 3 * m_nk) * m_ni*m_nj);
@@ -70,6 +71,9 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }

 POLYBENCH_GEMM::~POLYBENCH_GEMM()
diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp
index 33ea77997..14c590596 100644
--- a/src/polybench/POLYBENCH_GEMM.hpp
+++ b/src/polybench/POLYBENCH_GEMM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
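The RAJA_SYCL variant above threads one Real_type parameter, the running dot product, through four lambdas; the Lambda<n, Segs<...>, Params<0>> statements in EXEC_POL decide which lambda fires at which loop level and what arguments it sees. A minimal host-side analogue using sequential policies (illustrative only; the Suite runs this dispatch on the SYCL backend):

    #include "RAJA/RAJA.hpp"

    void init_rows(double* C, RAJA::Index_type ni, RAJA::Index_type nj)
    {
      using POL = RAJA::KernelPolicy<
        RAJA::statement::For<0, RAJA::seq_exec,
          RAJA::statement::For<1, RAJA::seq_exec,
            RAJA::statement::Lambda<0, RAJA::Params<0>>,                  // reset dot
            RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>>  // consume dot
          >
        >
      >;

      RAJA::kernel_param<POL>(
        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
                         RAJA::RangeSegment{0, nj}),
        RAJA::tuple<double>{0.0},                  // the Params<0> slot
        [=](double& dot) { dot = 0.0; },
        [=](RAJA::Index_type i, RAJA::Index_type j, double& dot) {
          C[i*nj + j] = dot;
        });
    }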
// @@ -99,18 +99,23 @@ class POLYBENCH_GEMM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 1360e93b2..d5119ede5 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -41,10 +41,10 @@ namespace polybench template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1(Real_ptr A, - Real_ptr u1, Real_ptr v1, - Real_ptr u2, Real_ptr v2, - Index_type n) +__global__ void poly_gemver_1(Real_ptr A, + Real_ptr u1, Real_ptr v1, + Real_ptr u2, Real_ptr v2, + Index_type n) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -56,7 +56,7 @@ __global__ void poly_gemmver_1(Real_ptr A, template < size_t j_block_size, size_t i_block_size, typename Lambda > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1_lam(Index_type n, Lambda body) +__global__ void poly_gemver_1_lam(Index_type n, Lambda body) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -68,10 +68,10 @@ __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_2(Real_ptr A, - Real_ptr x, Real_ptr y, - Real_type beta, - Index_type n) +__global__ void poly_gemver_2(Real_ptr A, + Real_ptr x, Real_ptr y, + Real_type beta, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -85,8 +85,8 @@ __global__ void poly_gemmver_2(Real_ptr A, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, - Index_type n) +__global__ void poly_gemver_3(Real_ptr x, Real_ptr z, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -96,10 +96,10 @@ __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_4(Real_ptr A, - Real_ptr x, Real_ptr w, - Real_type alpha, - Index_type n) +__global__ void poly_gemver_4(Real_ptr A, + Real_ptr x, Real_ptr w, + Real_type alpha, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -113,7 +113,7 @@ __global__ void 
poly_gemmver_4(Real_ptr A, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void poly_gemmver_234_lam(Index_type n, Lambda body) +__global__ void poly_gemver_234_lam(Index_type n, Lambda body) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -140,26 +140,28 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) GEMVER_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemmver_1 - <<>>(A, u1, v1, u2, v2, - n); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_gemver_1), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + A, u1, v1, u2, v2, n ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_2<<>>(A, x, y, - beta, - n); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_gemver_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, beta, n ); - poly_gemmver_3<<>>(x, z, - n); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_gemver_3), + grid_size, block_size, + shmem, res.get_stream(), + x, z, n ); - poly_gemmver_4<<>>(A, x, w, - alpha, - n); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_gemver_4), + grid_size, block_size, + shmem, res.get_stream(), + A, x, w, alpha, n ); } stopTimer(); @@ -173,44 +175,56 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) GEMVER_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemmver_1_lam - <<>>(n, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMVER_BODY1; - } - ); - cudaErrchk( cudaGetLastError() ); + auto poly_gemver1_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1; + }; + + RPlaunchCudaKernel( + (poly_gemver_1_lam), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + n, poly_gemver1_lambda ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY2; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY3; - } - POLYBENCH_GEMVER_BODY4; + auto poly_gemver2_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; } - ); - cudaErrchk( cudaGetLastError() ); - - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY5; + POLYBENCH_GEMVER_BODY4; + }; + + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver2_lambda ); + + auto poly_gemver3_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY5; + }; + + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver3_lambda ); + + auto poly_gemver4_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY7; } - ); - cudaErrchk( cudaGetLastError() ); + POLYBENCH_GEMVER_BODY8; + }; - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY6; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY7; - } - POLYBENCH_GEMVER_BODY8; - } - ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver4_lambda ); } stopTimer(); @@ -248,9 +262,10 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, n}, - RAJA::RangeSegment{0, n}), - 
res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1_RAJA; } diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index ab1416bf0..f51e15d42 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -41,10 +41,10 @@ namespace polybench template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1(Real_ptr A, - Real_ptr u1, Real_ptr v1, - Real_ptr u2, Real_ptr v2, - Index_type n) +__global__ void poly_gemver_1(Real_ptr A, + Real_ptr u1, Real_ptr v1, + Real_ptr u2, Real_ptr v2, + Index_type n) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -56,7 +56,7 @@ __global__ void poly_gemmver_1(Real_ptr A, template < size_t j_block_size, size_t i_block_size, typename Lambda > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1_lam(Index_type n, Lambda body) +__global__ void poly_gemver_1_lam(Index_type n, Lambda body) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -68,10 +68,10 @@ __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_2(Real_ptr A, - Real_ptr x, Real_ptr y, - Real_type beta, - Index_type n) +__global__ void poly_gemver_2(Real_ptr A, + Real_ptr x, Real_ptr y, + Real_type beta, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -85,8 +85,8 @@ __global__ void poly_gemmver_2(Real_ptr A, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, - Index_type n) +__global__ void poly_gemver_3(Real_ptr x, Real_ptr z, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -96,10 +96,10 @@ __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_4(Real_ptr A, - Real_ptr x, Real_ptr w, - Real_type alpha, - Index_type n) +__global__ void poly_gemver_4(Real_ptr A, + Real_ptr x, Real_ptr w, + Real_type alpha, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -113,7 +113,7 @@ __global__ void poly_gemmver_4(Real_ptr A, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void poly_gemmver_234_lam(Index_type n, Lambda body) +__global__ void poly_gemver_234_lam(Index_type n, Lambda body) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -140,27 +140,28 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) GEMVER_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gemmver_1), - dim3(nblocks1), dim3(nthreads_per_block1), shmem, res.get_stream(), - A, u1, v1, u2, v2, n); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_gemver_1), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + A, u1, v1, 
u2, v2, n ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n, block_size); - hipLaunchKernelGGL((poly_gemmver_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, y, beta, n); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, beta, n ); - hipLaunchKernelGGL((poly_gemmver_3), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, z, n); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_3), + grid_size, block_size, + shmem, res.get_stream(), + x, z, n ); - hipLaunchKernelGGL((poly_gemmver_4), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, w, alpha, n); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_4), + grid_size, block_size, + shmem, res.get_stream(), + A, x, w, alpha, n ); } stopTimer(); @@ -174,51 +175,56 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) GEMVER_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto poly_gemmver_1_lambda = [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMVER_BODY1; + auto poly_gemver1_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1; }; - hipLaunchKernelGGL((poly_gemmver_1_lam), - dim3(nblocks1), dim3(nthreads_per_block1), shmem, res.get_stream(), - n, poly_gemmver_1_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_gemver_1_lam), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + n, poly_gemver1_lambda ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - auto poly_gemmver_2_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY2; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY3; - } - POLYBENCH_GEMVER_BODY4; + auto poly_gemver2_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; + } + POLYBENCH_GEMVER_BODY4; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_2_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver2_lambda ); - auto poly_gemmver_3_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY5; + auto poly_gemver3_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY5; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_3_lambda); - hipErrchk( hipGetLastError() ); - - auto poly_gemmver_4_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY6; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY7; - } - POLYBENCH_GEMVER_BODY8; + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver3_lambda ); + + auto poly_gemver4_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY7; + } + POLYBENCH_GEMVER_BODY8; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_4_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver4_lambda ); } stopTimer(); @@ -256,9 +262,10 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, n}, - RAJA::RangeSegment{0, n}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1_RAJA; } diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index a20872867..5af84061d 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index b12be578a..29f487d73 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index 5a4f9199a..c1524a2ef 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-Sycl.cpp b/src/polybench/POLYBENCH_GEMVER-Sycl.cpp new file mode 100644 index 000000000..1242de063 --- /dev/null +++ b/src/polybench/POLYBENCH_GEMVER-Sycl.cpp @@ -0,0 +1,210 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GEMVER.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_GEMVER::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GEMVER_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim1(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(n, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(n, j_wg_sz)); + sycl::range<3> wkgroup_dim1(1, i_wg_sz, j_wg_sz); + + const size_t global_size234 = work_group_size * RAJA_DIVIDE_CEILING_INT(n, work_group_size); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim1), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < n && j < n) { + POLYBENCH_GEMVER_BODY1; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; + } + POLYBENCH_GEMVER_BODY4; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY5; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY7; + } + POLYBENCH_GEMVER_BODY8; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GEMVER_VIEWS_RAJA; + + using EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + >; + + using EXEC_POL24 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> + > + > + >; + + using EXEC_POL3 = RAJA::sycl_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1_RAJA; + } + ); + + RAJA::kernel_param_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + RAJA::tuple{0.0}, + res, + + [=] (Index_type /* i */, Real_type &dot) { + POLYBENCH_GEMVER_BODY2_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + 
POLYBENCH_GEMVER_BODY3_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY4_RAJA; + } + ); + + RAJA::forall ( res, RAJA::RangeSegment{0, n}, + [=] (Index_type i) { + POLYBENCH_GEMVER_BODY5_RAJA; + } + ); + + RAJA::kernel_param_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + RAJA::tuple{0.0}, + res, + + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY6_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_GEMVER_BODY7_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY8_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_GEMVER : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GEMVER, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 7223f85fd..e0db7f361 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,7 +26,7 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) setDefaultProblemSize( n_default * n_default ); setDefaultReps(20); - m_n = std::sqrt( getTargetProblemSize() ) + 1; + m_n = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_alpha = 1.5; m_beta = 1.2; @@ -39,16 +39,24 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) m_n + m_n*m_n ); setKernelsPerRep(4); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_n + + setBytesReadPerRep( 1*sizeof(Real_type ) * m_n * m_n + + 4*sizeof(Real_type ) * m_n + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n + + 1*sizeof(Real_type ) * m_n * m_n + + 2*sizeof(Real_type ) * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n + + 2*sizeof(Real_type ) * m_n + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n ); + 1*sizeof(Real_type ) * m_n * m_n + + 2*sizeof(Real_type ) * m_n ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_n * m_n + + + 1*sizeof(Real_type ) * m_n + + + 1*sizeof(Real_type ) * m_n + + + 1*sizeof(Real_type ) * m_n ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(4 * m_n*m_n + 3 * m_n*m_n + 1 * m_n + @@ -79,6 +87,9 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_GEMVER::~POLYBENCH_GEMVER() diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 07ecae962..ef5ad5a8a 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -152,18 +152,23 @@ class POLYBENCH_GEMVER : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_n; Real_type m_alpha; diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 3e921c2d2..24ed43947 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -56,11 +56,14 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_gesummv<<>>(x, y, - A, B, - alpha, beta, - N); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (poly_gesummv), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + A, B, + alpha, beta, + N ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 7f4468849..5a156b799 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -56,13 +56,14 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gesummv), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + + RPlaunchHipKernel( (poly_gesummv), + grid_size, block_size, + shmem, res.get_stream(), x, y, - A, B, + A, B, alpha, beta, - N); - hipErrchk( hipGetLastError() ); + N ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp index f9efd4d31..bc59ada36 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
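The GESUMMV launches above size a 1D grid with RAJA_DIVIDE_CEILING_INT(N, block_size), so the last block may be only partly full and the kernel guards on i < N. The helper is plain ceiling division; an equivalent, checkable at compile time:

    #include <cstddef>

    constexpr std::size_t divide_ceiling(std::size_t n, std::size_t b)
    {
      return (n + b - 1) / b;  // assumes n + b - 1 does not overflow
    }

    static_assert(divide_ceiling(1000, 256) == 4, "partial last block");
    static_assert(divide_ceiling(1024, 256) == 4, "exact fit");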
// diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index 86e73b293..8f572c16a 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp index d7ba3fc70..34b70708f 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp b/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp new file mode 100644 index 000000000..83197e995 --- /dev/null +++ b/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GESUMMV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + +template < size_t work_group_size > +void POLYBENCH_GESUMMV::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GESUMMV_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + + if (i < N) { + POLYBENCH_GESUMMV_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_GESUMMV_BODY2; + } + POLYBENCH_GESUMMV_BODY3; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GESUMMV_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Params<0,1>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0,1>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0,1>> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_param_resource( + RAJA::make_tuple( RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N} ), + RAJA::make_tuple(static_cast(0.0), + static_cast(0.0)), + res, + + [=] (Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY1_RAJA; + }, + [=] (Index_type i, Index_type j, 
Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY2_RAJA; + }, + [=] (Index_type i, Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY3_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_GESUMMV : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GESUMMV, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index ea8e2224f..7764c4036 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,7 +26,7 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(120); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_alpha = 0.62; m_beta = 1.002; @@ -36,8 +36,10 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setItsPerRep( m_N ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + + 2*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((4 * m_N + 3 ) * m_N ); @@ -59,6 +61,9 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_GESUMMV::~POLYBENCH_GESUMMV() diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 32a1b0eae..3d80155ed 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
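The counter split in POLYBENCH_GESUMMV.cpp above records 2*N*N + N Real_type reads (both matrices plus x) against N writes (y), with (4*N + 3)*N flops. In double precision that puts the kernel near 0.25 flops per byte, firmly bandwidth-bound; a quick check of the numbers (illustrative, N arbitrary):

    #include <iostream>

    int main()
    {
      const double N = 1000.0;
      const double flops = (4.0 * N + 3.0) * N;
      const double bytes = sizeof(double) * (2.0 * N * N + 2.0 * N);
      std::cout << "flops/byte = " << flops / bytes << "\n";  // ~0.25
      return 0;
    }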
// @@ -98,17 +98,22 @@ class POLYBENCH_GESUMMV : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 1b63ee758..70fa00a76 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -100,13 +100,17 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_heat_3D_1 - <<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_heat_3D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); - poly_heat_3D_2 - <<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_heat_3D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); } @@ -124,21 +128,31 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_heat_3D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - POLYBENCH_HEAT_3D_BODY1; - } - ); - cudaErrchk( cudaGetLastError() ); - - poly_heat_3D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - POLYBENCH_HEAT_3D_BODY2; - } - ); - cudaErrchk( cudaGetLastError() ); + auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + POLYBENCH_HEAT_3D_BODY1; + }; + + RPlaunchCudaKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_1_lambda ); + + auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + POLYBENCH_HEAT_3D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_2_lambda ); } @@ -168,19 +182,21 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1_RAJA; } ); - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + 
RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 3a7d7f28e..6a2fb3329 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -100,15 +100,17 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_heat_3D_1), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_heat_3D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); - hipLaunchKernelGGL((poly_heat_3D_2), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_heat_3D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); } @@ -126,26 +128,31 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, Index_type j, + auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, + Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1; }; - auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, Index_type j, Index_type k) { + RPlaunchHipKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_1_lambda ); + + auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { POLYBENCH_HEAT_3D_BODY2; }; - hipLaunchKernelGGL((poly_heat_3D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_heat_3D_1_lambda); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((poly_heat_3D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_heat_3D_2_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_2_lambda ); } @@ -174,19 +181,21 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1_RAJA; } ); - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp 
index 1b9380a15..ba80f5022 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index 7a70c3f87..1c3999279 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index 25af09240..115661cc7 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp new file mode 100644 index 000000000..27341d447 --- /dev/null +++ b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp @@ -0,0 +1,148 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_HEAT_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define k_wg_sz (32)
+#define j_wg_sz (work_group_size / k_wg_sz)
+#define i_wg_sz (1)
+
+
+template < size_t work_group_size >
+void POLYBENCH_HEAT_3D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_HEAT_3D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        sycl::range<3> global_dim(i_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, i_wg_sz),
+                                  j_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, j_wg_sz),
+                                  k_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, k_wg_sz));
+
+        sycl::range<3> wkgroup_dim(i_wg_sz, j_wg_sz, k_wg_sz);
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = 1 + item.get_global_id(0);
+            Index_type j = 1 + item.get_global_id(1);
+            Index_type k = 1 + item.get_global_id(2);
+
+            if (i < N-1 && j < N-1 && k < N-1) {
+              POLYBENCH_HEAT_3D_BODY1;
+            }
+
+          });
+        });
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = 1 + item.get_global_id(0);
+            Index_type j = 1 + item.get_global_id(1);
+            Index_type k = 1 + item.get_global_id(2);
+
+            if (i < N-1 && j < N-1 && k < N-1) {
+              POLYBENCH_HEAT_3D_BODY2;
+            }
+
+          });
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_HEAT_3D_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_0<i_wg_sz>,
+            RAJA::statement::For<1, RAJA::sycl_global_1<j_wg_sz>,
+              RAJA::statement::For<2, RAJA::sycl_global_2<k_wg_sz>,
+                RAJA::statement::Lambda<0>
+              >
+            >
+          >
+        >
+      >;
+
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        RAJA::kernel_resource<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1}),
+          res,
+          [=] (Index_type i, Index_type j, Index_type k) {
+            POLYBENCH_HEAT_3D_BODY1_RAJA;
+          }
+        );
+
+        RAJA::kernel_resource<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1}),
+          res,
+          [=] (Index_type i, Index_type j, Index_type k) {
+            POLYBENCH_HEAT_3D_BODY2_RAJA;
+          }
+        );
+
+      }
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n POLYBENCH_HEAT_3D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_HEAT_3D, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp
index 4f14b54f9..1e4272534 100644
--- a/src/polybench/POLYBENCH_HEAT_3D.cpp
+++ b/src/polybench/POLYBENCH_HEAT_3D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite
project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,12 +22,12 @@ namespace polybench POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) : KernelBase(rajaperf::Polybench_HEAT_3D, params) { - Index_type N_default = 100; + Index_type N_default = 102; setDefaultProblemSize( (N_default-2)*(N_default-2)*(N_default-2) ); setDefaultReps(20); - m_N = std::cbrt( getTargetProblemSize() ) + 1; + m_N = std::cbrt( getTargetProblemSize() ) + 2 + std::cbrt(3)-1; m_tsteps = 20; @@ -35,14 +35,13 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep( m_tsteps * 2 ); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N * m_N - 12*(m_N-2) - 8) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N * m_N - 12*(m_N-2) - 8) ) ); + setBytesReadPerRep((1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8) + + + 1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8)) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 15 * (m_N-2) * (m_N-2) * (m_N-2) + 15 * (m_N-2) * (m_N-2) * (m_N-2) ) ); @@ -70,6 +69,9 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_HEAT_3D::~POLYBENCH_HEAT_3D() diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 03150a267..590d7b326 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
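// Note on the HEAT_3D traffic model above: per time step, each of the two
// kernels writes the (N-2)^3 interior cells and reads every cell of its
// source array except the 12 cube edges and 8 corners, which a 7-point
// stencil over the interior never touches. A self-contained sanity check of
// that count; the helper name is invented for this note and is not Suite code:
constexpr long long heat3dCellsReadPerKernel(long long N)
{
  return N*N*N - 12*(N-2) - 8;   // all cells minus untouched edges/corners
}
static_assert(heat3dCellsReadPerKernel(4) == 64 - 24 - 8, "4^3 cube example");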
// @@ -122,18 +122,23 @@ class POLYBENCH_HEAT_3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index e2c728090..570792bbd 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -63,11 +63,15 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_jacobi_1D_1<<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_jacobi_1D_1), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); - poly_jacobi_1D_2<<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_jacobi_1D_2), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index b0f60255d..d77497459 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
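// The RPlaunchCudaKernel calls above fold the launch-config syntax and the
// error check that previously followed every kernel launch into one helper.
// A minimal sketch of what such a variadic wrapper can look like, assuming
// this shape only for illustration (the Suite's actual definition is not
// shown in this diff):
template < typename Kernel, typename... Args >
void sketch_launch_cuda_kernel(Kernel kernel, dim3 grid, dim3 block,
                               size_t shmem, cudaStream_t stream, Args... args)
{
  kernel<<<grid, block, shmem, stream>>>(args...);
  cudaErrchk( cudaGetLastError() );  // check formerly repeated at each call site
}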
// @@ -63,13 +63,15 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_jacobi_1D_1), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_jacobi_1D_1), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); - hipLaunchKernelGGL((poly_jacobi_1D_2), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_jacobi_1D_2), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index 42ae4a0d5..0c7cbae57 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index 39a2423df..35089be71 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index 20c8c9b73..5f3549b41 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,12 +25,14 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR POLYBENCH_JACOBI_1D_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto poly_jacobi1d_lam1 = [=] (Index_type i) { POLYBENCH_JACOBI_1D_BODY1; }; auto poly_jacobi1d_lam2 = [=] (Index_type i) { POLYBENCH_JACOBI_1D_BODY2; }; +#endif switch ( vid ) { diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp new file mode 100644 index 000000000..8a13f6567 --- /dev/null +++ b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_JACOBI_1D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+template < size_t work_group_size >
+void POLYBENCH_JACOBI_1D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_JACOBI_1D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size);
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                         [=] (sycl::nd_item<1> item) {
+
+            Index_type i = item.get_global_id(0);
+            if (i > 0 && i < N-1) {
+              POLYBENCH_JACOBI_1D_BODY1;
+            }
+
+          });
+        });
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                         [=] (sycl::nd_item<1> item) {
+
+            Index_type i = item.get_global_id(0);
+            if (i > 0 && i < N-1) {
+              POLYBENCH_JACOBI_1D_BODY2;
+            }
+
+          });
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    using EXEC_POL = RAJA::sycl_exec<work_group_size, true /*async*/>;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        RAJA::forall<EXEC_POL>( res, RAJA::RangeSegment{1, N-1},
+          [=] (Index_type i) {
+            POLYBENCH_JACOBI_1D_BODY1;
+        });
+
+        RAJA::forall<EXEC_POL>( res, RAJA::RangeSegment{1, N-1},
+          [=] (Index_type i) {
+            POLYBENCH_JACOBI_1D_BODY2;
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else {
+     getCout() << "\n  POLYBENCH_JACOBI_1D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_JACOBI_1D, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp
index b2beb0dfd..59a2520b5 100644
--- a/src/polybench/POLYBENCH_JACOBI_1D.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
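// Both SYCL variants above round the launch up to a whole number of
// work-groups and rely on the in-kernel (i > 0 && i < N-1) guard to mask
// the padding plus the two boundary cells. The rounding itself, as a
// standalone sketch with names local to this note:
constexpr unsigned long long round_up_to_work_groups(unsigned long long n,
                                                     unsigned long long wg_size)
{
  return wg_size * ((n + wg_size - 1) / wg_size);  // same result as RAJA_DIVIDE_CEILING_INT
}
static_assert(round_up_to_work_groups(1000002, 256) == 1000192,
              "padded global size for the default JACOBI_1D extent");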
// @@ -21,12 +21,12 @@ namespace polybench POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_1D, params) { - Index_type N_default = 1000000; + Index_type N_default = 1000002; setDefaultProblemSize( N_default-2 ); setDefaultReps(100); - m_N = getTargetProblemSize(); + m_N = getTargetProblemSize() + 2; m_tsteps = 16; @@ -34,14 +34,13 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep(m_tsteps * 2); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - m_N + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - m_N ) ); + setBytesReadPerRep((1*sizeof(Real_type ) * m_N + + + 1*sizeof(Real_type ) * m_N) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 3 * (m_N-2) + 3 * (m_N-2) ) ); @@ -67,6 +66,9 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_JACOBI_1D::~POLYBENCH_JACOBI_1D() diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 5c84e0682..f128e5947 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -68,17 +68,23 @@ class POLYBENCH_JACOBI_1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 1e8a824bb..2620d0654 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
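// Worked instance of the JACOBI_1D byte counts above, illustration only and
// assuming Real_type is the default 8-byte double: per time step each of the
// two kernels reads all m_N entries of its source array and writes the
// m_N-2 interior entries.
constexpr long long jacobi1dBytesPerRep(long long n, long long tsteps)
{
  return tsteps * (2*n + 2*(n-2)) * 8;  // two kernels: reads + interior writes
}
static_assert(jacobi1dBytesPerRep(10, 1) == 288, "(20 reads + 16 writes) * 8B");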
// @@ -96,13 +96,17 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_jacobi_2D_1 - <<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_jacobi_2D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); - poly_jacobi_2D_2 - <<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_jacobi_2D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); } @@ -120,21 +124,29 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_jacobi_2D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY1; - } - ); - cudaErrchk( cudaGetLastError() ); - - poly_jacobi_2D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY2; - } - ); - cudaErrchk( cudaGetLastError() ); + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY1; + }; + + RPlaunchCudaKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_1_lambda ); + + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_2_lambda ); } @@ -161,17 +173,19 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1_RAJA; } ); - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index 6590a8173..8aac79440 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -96,15 +96,17 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_jacobi_2D_1), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_jacobi_2D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); - hipLaunchKernelGGL((poly_jacobi_2D_2), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_jacobi_2D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); } @@ -122,25 +124,29 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto poly_jacobi_2D_1_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY1; - }; - - hipLaunchKernelGGL((poly_jacobi_2D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_jacobi_2D_1_lambda); - hipErrchk( hipGetLastError() ); - - auto poly_jacobi_2D_2_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY2; - }; - - hipLaunchKernelGGL((poly_jacobi_2D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_jacobi_2D_2_lambda); - hipErrchk( hipGetLastError() ); + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY1; + }; + + RPlaunchHipKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_1_lambda ); + + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY2; + }; + + RPlaunchHipKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_2_lambda ); } @@ -167,17 +173,19 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1_RAJA; } ); - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index 51f3cb146..d3d7b0471 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp
index 97806cfac..e711660cc 100644
--- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp
index 107dd4ec4..18cc343cd 100644
--- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp
new file mode 100644
index 000000000..ff6dab08b
--- /dev/null
+++ b/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp
@@ -0,0 +1,141 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_JACOBI_2D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define j_wg_sz (32)
+#define i_wg_sz (work_group_size / j_wg_sz)
+
+
+template < size_t work_group_size >
+void POLYBENCH_JACOBI_2D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_JACOBI_2D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        sycl::range<3> global_dim(1,
+                                  i_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, i_wg_sz),
+                                  j_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, j_wg_sz));
+
+        sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz);
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = item.get_global_id(1) + 1;
+            Index_type j = item.get_global_id(2) + 1;
+
+            if ( i < N-1 && j < N-1 ) {
+              POLYBENCH_JACOBI_2D_BODY1;
+            }
+
+          });
+        });
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = item.get_global_id(1) + 1;
+            Index_type j = item.get_global_id(2) + 1;
+
+            if ( i < N-1 && j < N-1 ) {
+              POLYBENCH_JACOBI_2D_BODY2;
+            }
+
+          });
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_JACOBI_2D_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_1<i_wg_sz>,
+            RAJA::statement::For<1, RAJA::sycl_global_2<j_wg_sz>,
+ RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY1_RAJA; + } + ); + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY2_RAJA; + } + ); + + } + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_JACOBI_2D, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 9fe51e5c1..a3b077a1f 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,12 @@ namespace polybench POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_2D, params) { - Index_type N_default = 1000; + Index_type N_default = 1002; - setDefaultProblemSize( N_default * N_default ); + setDefaultProblemSize( (N_default-2)*(N_default-2) ); setDefaultReps(50); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + 2 + std::sqrt(2)-1; m_tsteps = 40; @@ -34,14 +34,13 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setItsPerRep( m_tsteps * (2 * (m_N-2) * (m_N-2)) ); setKernelsPerRep(2); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N - 4) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N - 4) ) ); + setBytesReadPerRep((1*sizeof(Real_type ) * (m_N * m_N - 4) + + + 1*sizeof(Real_type ) * (m_N * m_N - 4)) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 5 * (m_N-2)*(m_N-2) + 5 * (m_N -2)*(m_N-2) ) ); @@ -69,6 +68,9 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_JACOBI_2D::~POLYBENCH_JACOBI_2D() diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index fe77836cb..df170306e 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
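// The JACOBI_2D policy above mirrors its Base_SYCL launch: SYCL range<3>
// dimensions run slowest to fastest, so dimension 0 is pinned to extent 1,
// the outer i loop (For<0>) maps to global dimension 1, and the inner j
// loop (For<1>) to the fastest dimension 2. The same mapping written
// directly against an nd_item, illustration only (assumes a SYCL toolchain):
inline void sketch_jacobi2d_index_map(sycl::nd_item<3> item,
                                      long& i, long& j)
{
  i = item.get_global_id(1) + 1;  // For<0> <-> RAJA::sycl_global_1<i_wg_sz>
  j = item.get_global_id(2) + 1;  // For<1> <-> RAJA::sycl_global_2<j_wg_sz>
}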
// See the RAJAPerf/LICENSE file for details. // @@ -87,18 +87,23 @@ class POLYBENCH_JACOBI_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 871fef013..83ea50512 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -69,13 +69,17 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - constexpr size_t shmem = 0; + constexpr size_t shmem = 0; - poly_mvt_1<<>>(A, x1, y1, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_mvt_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x1, y1, N ); - poly_mvt_2<<>>(A, x2, y2, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_mvt_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x2, y2, N ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 32b1b5161..636ad234c 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -69,17 +69,17 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - constexpr size_t shmem = 0; + constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_mvt_1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x1, y1, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_mvt_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x1, y1, N ); - hipLaunchKernelGGL((poly_mvt_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x2, y2, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_mvt_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x2, y2, N ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 159a86274..bb9c8f221 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index c9ff17751..5b278628d 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index 9d63fd997..efa2ec452 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Sycl.cpp b/src/polybench/POLYBENCH_MVT-Sycl.cpp new file mode 100644 index 000000000..c0a3879ad --- /dev/null +++ b/src/polybench/POLYBENCH_MVT-Sycl.cpp @@ -0,0 +1,153 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_MVT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+
+template < size_t work_group_size >
+void POLYBENCH_MVT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_MVT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+
+          if (i < N) {
+            POLYBENCH_MVT_BODY1;
+            for (Index_type j = 0; j < N; ++j ) {
+              POLYBENCH_MVT_BODY2;
+            }
+            POLYBENCH_MVT_BODY3;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+
+          if (i < N) {
+            POLYBENCH_MVT_BODY4;
+            for (Index_type j = 0; j < N; ++j ) {
+              POLYBENCH_MVT_BODY5;
+            }
+            POLYBENCH_MVT_BODY6;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_MVT_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_0<work_group_size>,
+            RAJA::statement::Lambda<0, RAJA::Params<0>>,
+            RAJA::statement::For<1, RAJA::seq_exec,
+              RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >,
+            RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>>
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::region<RAJA::seq_region>( [=]() {
+
+        RAJA::kernel_param_resource<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, N},
+                           RAJA::RangeSegment{0, N}),
+          RAJA::tuple<Real_type>{0.0},
+          res,
+
+          [=] (Real_type &dot) {
+            POLYBENCH_MVT_BODY1_RAJA;
+          },
+          [=] (Index_type i, Index_type j, Real_type &dot) {
+            POLYBENCH_MVT_BODY2_RAJA;
+          },
+          [=] (Index_type i, Real_type &dot) {
+            POLYBENCH_MVT_BODY3_RAJA;
+          }
+
+        );
+
+        RAJA::kernel_param_resource<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, N},
+                           RAJA::RangeSegment{0, N}),
+          RAJA::tuple<Real_type>{0.0},
+          res,
+
+          [=] (Real_type &dot) {
+            POLYBENCH_MVT_BODY4_RAJA;
+          },
+          [=] (Index_type i, Index_type j, Real_type &dot) {
+            POLYBENCH_MVT_BODY5_RAJA;
+          },
+          [=] (Index_type i, Real_type &dot) {
+            POLYBENCH_MVT_BODY6_RAJA;
+          }
+
+        );
+
+      }); // end sequential region (for single-source code)
+
+    }
+    stopTimer();
+
+  } else {
+     getCout() << "\n  POLYBENCH_MVT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_MVT, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp
index c0a5b8bb9..e8da53a0c 100644
--- a/src/polybench/POLYBENCH_MVT.cpp
+++ b/src/polybench/POLYBENCH_MVT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
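// The RAJA_SYCL variant above splits each matrix-vector product into three
// lambdas that share the kernel parameter dot: Lambda<0> initializes it,
// Lambda<1> accumulates inside the sequential j loop, and Lambda<2> writes
// the result back. The same structure in plain loop form, a sketch assuming
// the usual PolyBench MVT definitions rather than quoting the BODY macros:
inline void sketch_mvt_row(const double* A, const double* y1, double* x1,
                           long N, long i)
{
  double dot = 0.0;                 // role of Lambda<0>
  for (long j = 0; j < N; ++j) {
    dot += A[i*N + j] * y1[j];      // role of Lambda<1>
  }
  x1[i] += dot;                     // role of Lambda<2>
}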
// @@ -26,17 +26,22 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(100); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); setItsPerRep( 2 * m_N ); setKernelsPerRep(2); - setBytesPerRep( (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N + + + 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N + + + 1*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); @@ -62,6 +67,9 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_MVT::~POLYBENCH_MVT() diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index 518d75dd8..a54181833 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -112,17 +112,22 @@ class POLYBENCH_MVT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Real_ptr m_x1; diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index d545c0b93..3d2588378 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -9,7 +9,7 @@ */ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
@@ -22,13 +22,20 @@
 
 #include "RAJA/config.hpp"
 
 #include "camp/number.hpp"
+#include "camp/list.hpp"
+#include <type_traits>
 
 #include <string>
 
 #cmakedefine RAJA_PERFSUITE_ENABLE_MPI
 #cmakedefine RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN
 
+#if defined(RAJA_ENABLE_CUDA)
+#define RAJA_PERFSUITE_TUNING_CUDA_ARCH @RAJA_PERFSUITE_TUNING_CUDA_ARCH@
+#endif
+
 #if defined(RAJA_ENABLE_HIP)
+#define RAJA_PERFSUITE_TUNING_HIP_ARCH @RAJA_PERFSUITE_TUNING_HIP_ARCH@
 #include <hip/hip_version.h>
 #if (HIP_VERSION_MAJOR > 5) || \
     (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2)
@@ -42,8 +49,23 @@
 #include <adiak.hpp>
 #endif
 
+// Squash compiler warnings about unused variables
+template < typename ... Ts >
+inline void RAJAPERF_UNUSED_VAR(Ts&&...) { }
+
+// Squash compiler warnings about unused arguments
+#define RAJAPERF_UNUSED_ARG(...)
+
 namespace rajaperf {
 
+namespace integer {
+
+// helper alias to convert comma separated integer literals into list
+template < size_t... Is >
+using list_type = camp::list< camp::integral_constant<size_t, Is>... >;
+
+} // closing brace for integer namespace
+
 struct configuration {
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
@@ -85,18 +107,25 @@
 const adiak::version adiak_compiler_version = std::string("@CMAKE_CXX_COMPILER_VERSION@");
 const adiak::version adiak_cuda_compiler_version = std::string("@CMAKE_CUDA_COMPILER_VERSION@");
 constexpr static const char* adiak_gpu_targets = "@GPU_TARGETS@";
 constexpr static const char* adiak_cmake_hip_architectures = "@CMAKE_HIP_ARCHIECTURES@";
-const std::vector<int> adiak_gpu_targets_block_sizes = {@RAJA_PERFSUITE_GPU_BLOCKSIZES@};
+constexpr static const char* adiak_tuning_cuda_arch = "@RAJA_PERFSUITE_TUNING_CUDA_ARCH@";
+constexpr static const char* adiak_tuning_hip_arch = "@RAJA_PERFSUITE_TUNING_HIP_ARCH@";
+const std::vector<int> adiak_gpu_block_sizes = {@RAJA_PERFSUITE_GPU_BLOCKSIZES@};
+const std::vector<int> adiak_atomic_replications = {@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@};
+const std::vector<int> adiak_gpu_items_per_thread = {@RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD@};
 const std::vector<std::string> adiak_raja_hipcc_flags = str_to_list(std::string("@RAJA_HIPCC_FLAGS@"));
 const adiak::catstring adiak_mpi_cxx_compiler = std::string("@MPI_CXX_COMPILER@");
 const adiak::catstring adiak_systype_build = std::string("@RAJAPERF_BUILD_SYSTYPE@");
 const adiak::catstring adiak_machine_build = std::string("@RAJAPERF_BUILD_HOST@");
 #endif
 
-// helper alias to void trailing comma in no-arg case
-template < size_t... Is >
-using i_seq = camp::int_seq<size_t, Is...>;
 // List of GPU block sizes
-using gpu_block_sizes = i_seq<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>;
+using gpu_block_sizes = integer::list_type<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>;
+
+// List of GPU atomic replications
+using atomic_replications = integer::list_type<@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@>;
+
+// List of GPU items per thread
+using gpu_items_per_thread = integer::list_type<@RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD@>;
 
 // Name of user who ran code
 std::string user_run;
@@ -110,13 +139,27 @@
 std::string machine_run;
 
 };
 
-} // closing brace for rajaperf namespace
+#if __cplusplus < 201703L
+// Implement std::conjunction from https://en.cppreference.com/w/cpp/types/conjunction
+template <class...> struct conjunction : std::true_type {};
+template <class B1> struct conjunction<B1> : B1 {};
+template <class B1, class... Bn>
+struct conjunction<B1, Bn...>
+    : std::conditional_t<bool(B1::value), conjunction<Bn...>, B1> {};
+#else
+using std::conjunction;
+#endif
 
-// Squash compiler warnings about unused variables
-template < typename ... Ts >
-inline void RAJAPERF_UNUSED_VAR(Ts&&...) { }
+//compile time loop over an integer sequence
+//this allows for creating a loop over a compile time constant variable
+template < typename Func, typename... Ts >
+inline void seq_for(camp::list<Ts...> const&, Func&& func)
+{
+  // braced init lists are evaluated in order
+  int seq_unused_array[] = {0, (func(Ts{}), 0)...};
+  RAJAPERF_UNUSED_VAR(seq_unused_array);
+}
 
-// Squash compiler warnings about unused arguments
-#define RAJAPERF_UNUSED_ARG(...)
+} // closing brace for rajaperf namespace
 
 #endif // closing endif for header file include guard
diff --git a/src/stream-kokkos/ADD-Kokkos.cpp b/src/stream-kokkos/ADD-Kokkos.cpp
index 51e5bdf81..1a4280cc9 100644
--- a/src/stream-kokkos/ADD-Kokkos.cpp
+++ b/src/stream-kokkos/ADD-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream-kokkos/CMakeLists.txt b/src/stream-kokkos/CMakeLists.txt
index 4cd38bdf5..6ba8dbbb6 100644
--- a/src/stream-kokkos/CMakeLists.txt
+++ b/src/stream-kokkos/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
diff --git a/src/stream-kokkos/COPY-Kokkos.cpp b/src/stream-kokkos/COPY-Kokkos.cpp
index d363cd944..3312a57fa 100644
--- a/src/stream-kokkos/COPY-Kokkos.cpp
+++ b/src/stream-kokkos/COPY-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream-kokkos/DOT-Kokkos.cpp b/src/stream-kokkos/DOT-Kokkos.cpp
index ca6b0e304..ff1124068 100644
--- a/src/stream-kokkos/DOT-Kokkos.cpp
+++ b/src/stream-kokkos/DOT-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream-kokkos/MUL-Kokkos.cpp b/src/stream-kokkos/MUL-Kokkos.cpp
index aa53b0d66..e1f17be92 100644
--- a/src/stream-kokkos/MUL-Kokkos.cpp
+++ b/src/stream-kokkos/MUL-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
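// The seq_for helper added above expands a functor over every
// integral_constant in a camp::list, which is how the Suite can loop over
// its compile-time tuning lists (block sizes, work-group sizes).
// Hypothetical usage, names local to this note:
inline void sketch_seq_for_usage()
{
  using sizes = rajaperf::integer::list_type<128, 256, 512>;
  rajaperf::seq_for(sizes{}, [&](auto block_size) {
    // block_size.value is a compile-time constant: 128, then 256, then 512
    static_assert(decltype(block_size)::value > 0, "sizes are positive");
  });
}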
// diff --git a/src/stream-kokkos/TRIAD-Kokkos.cpp b/src/stream-kokkos/TRIAD-Kokkos.cpp index 3b897a46a..2d5465939 100644 --- a/src/stream-kokkos/TRIAD-Kokkos.cpp +++ b/src/stream-kokkos/TRIAD-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index e8e095665..7b79f1b10 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,9 +52,11 @@ void ADD::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - add<<>>( c, a, b, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (add), + grid_size, block_size, + shmem, res.get_stream(), + c, a, b, iend ); } stopTimer(); @@ -64,13 +66,18 @@ void ADD::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto add_lambda = [=] __device__ (Index_type i) { + ADD_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - ADD_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, add_lambda ); } stopTimer(); diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 50ab42466..fe470d391 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -51,9 +51,11 @@ void ADD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((add), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), c, a, b, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (add), + grid_size, block_size, + shmem, res.get_stream(), + c, a, b, iend ); } stopTimer(); @@ -69,9 +71,12 @@ void ADD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, add_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, add_lambda ); } stopTimer(); diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index ddd24eb30..22f850da3 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index 6e4302446..c1a1480cf 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index a07fe24d6..516fe61a6 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Sycl.cpp b/src/stream/ADD-Sycl.cpp new file mode 100644 index 000000000..483672cb1 --- /dev/null +++ b/src/stream/ADD-Sycl.cpp @@ -0,0 +1,80 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ADD.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template < size_t work_group_size >
+void ADD::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  ADD_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            ADD_BODY
+          }
+
+        });
+      });
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        ADD_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  ADD : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ADD, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp
index 02cf25107..510f39bb8 100644
--- a/src/stream/ADD.cpp
+++ b/src/stream/ADD.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
   setUsesFeature(Forall);
@@ -53,6 +54,9 @@
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
 
diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp
index 49e09a602..7b96dbf9e 100644
--- a/src/stream/ADD.hpp
+++ b/src/stream/ADD.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
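// With reads and writes now reported separately, ADD's traffic is the
// classic STREAM count: two reads plus one write per element. Turning that
// into achieved bandwidth, a sketch with invented names (assumes Real_type
// is 8-byte double):
inline double sketch_add_bandwidth_GBs(long long problem_size, double seconds)
{
  const double bytes = 3.0 * 8.0 * static_cast<double>(problem_size);
  return bytes / seconds / 1.0e9;
}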
// @@ -52,18 +52,24 @@ class ADD : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index 03351ff5d..bb4de4ce5 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -14,29 +14,34 @@ blt_add_library( ADD-Cuda.cpp ADD-OMP.cpp ADD-OMPTarget.cpp + ADD-Sycl.cpp COPY.cpp COPY-Seq.cpp COPY-Hip.cpp COPY-Cuda.cpp COPY-OMP.cpp COPY-OMPTarget.cpp + COPY-Sycl.cpp DOT.cpp DOT-Seq.cpp DOT-Hip.cpp DOT-Cuda.cpp DOT-OMP.cpp DOT-OMPTarget.cpp + DOT-Sycl.cpp MUL.cpp MUL-Seq.cpp MUL-Hip.cpp MUL-Cuda.cpp MUL-OMP.cpp MUL-OMPTarget.cpp + MUL-Sycl.cpp TRIAD.cpp TRIAD-Seq.cpp TRIAD-Hip.cpp TRIAD-Cuda.cpp TRIAD-OMPTarget.cpp TRIAD-OMP.cpp + TRIAD-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 3bea59764..a45d45a16 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -51,9 +51,11 @@ void COPY::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - copy<<>>( c, a, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (copy), + grid_size, block_size, + shmem, res.get_stream(), + c, a, iend ); } stopTimer(); @@ -63,13 +65,18 @@ void COPY::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto copy_lambda = [=] __device__ (Index_type i) { + COPY_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - COPY_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy_lambda ); } stopTimer(); diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 305892fdb..f5e19fac0 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -51,9 +51,11 @@ void COPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((copy), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - c, a, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (copy), + grid_size, block_size, + shmem, res.get_stream(), + c, a, iend ); } stopTimer(); @@ -69,9 +71,12 @@ void COPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy_lambda ); } stopTimer(); diff --git a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index d9a0aa2a9..1718ff5ac 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index a9250c4cd..f1dd5017d 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp
index 311f9754d..25b897707 100644
--- a/src/stream/COPY-Seq.cpp
+++ b/src/stream/COPY-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/COPY-Sycl.cpp b/src/stream/COPY-Sycl.cpp
new file mode 100644
index 000000000..4f1049a6e
--- /dev/null
+++ b/src/stream/COPY-Sycl.cpp
@@ -0,0 +1,81 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "COPY.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template <size_t work_group_size >
+void COPY::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  COPY_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            COPY_BODY
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false /*async*/> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        COPY_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  COPY : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(COPY, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
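Note: the Base_SYCL variant above sizes its nd_range by rounding the loop bound up to a whole number of work-groups, which is why the kernel body guards with i < iend. A plain-C++ sketch of that index mapping (the names and values here are illustrative only, not part of the suite):

    #include <cassert>
    #include <cstddef>

    int main()
    {
      const std::size_t iend = 1000;            // loop bound
      const std::size_t work_group_size = 256;  // SYCL work-group size
      // Round up so global_size is a whole number of work-groups.
      const std::size_t global_size =
          work_group_size * ((iend + work_group_size - 1) / work_group_size);
      assert(global_size == 1024);

      std::size_t executed = 0;
      for (std::size_t i = 0; i < global_size; ++i) { // each i is one work-item
        if (i < iend) {  // the guard in the kernel masks the padding items
          ++executed;
        }
      }
      assert(executed == iend);
      return 0;
    }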
diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp
index c92018c63..9cfce257a 100644
--- a/src/stream/COPY.cpp
+++ b/src/stream/COPY.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@ COPY::COPY(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
   setUsesFeature( Forall );
@@ -53,6 +54,9 @@ COPY::COPY(const RunParams& params)
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp
index 0544e0d2f..991406624 100644
--- a/src/stream/COPY.hpp
+++ b/src/stream/COPY.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,18 +51,24 @@ class COPY : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_a;
   Real_ptr m_c;
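Note: the COPY.cpp hunk above splits the old combined setBytesPerRep figure into separate read/write/atomic-modify counts; for COPY each repetition streams one read and one write of sizeof(Real_type) per element. A quick check that the split figures sum to the old combined figure (assuming the suite's default 8-byte Real_type):

    #include <cassert>
    #include <cstddef>

    int main()
    {
      using Real_type = double;        // assumed 8-byte suite default
      const std::size_t n = 1000000;   // stands in for getActualProblemSize()
      const std::size_t bytes_read    = 1 * sizeof(Real_type) * n; // c[i] = a[i] reads a
      const std::size_t bytes_written = 1 * sizeof(Real_type) * n; // ...and writes c
      assert(bytes_read == 8000000 && bytes_written == 8000000);
      // Old combined figure equals the new split figures summed.
      assert(bytes_read + bytes_written == (1 + 1) * sizeof(Real_type) * n);
      return 0;
    }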
diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp
index fbecc979f..031355a3e 100644
--- a/src/stream/DOT-Cuda.cpp
+++ b/src/stream/DOT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -15,6 +15,9 @@
 #include "common/CudaDataUtils.hpp"
 
 #include <iostream>
+#include <algorithm>
+#include <type_traits>
+#include <utility>
 
 
 namespace rajaperf
@@ -45,24 +48,16 @@ __global__ void dot(Real_ptr a, Real_ptr b,
     __syncthreads();
   }
 
-#if 1 // serialized access to shared data;
   if ( threadIdx.x == 0 ) {
     RAJA::atomicAdd<RAJA::cuda_atomic>( dprod, pdot[ 0 ] );
   }
-#else // this doesn't work due to data races
-  if ( threadIdx.x == 0 ) {
-    *dprod += pdot[ 0 ];
-  }
-#endif
-
 }
 
-template < size_t block_size >
-void DOT::runCudaVariantImpl(VariantID vid)
+template < size_t block_size, typename MappingHelper >
+void DOT::runCudaVariantBase(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
-  const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
 
   auto res{getCudaResource()};
@@ -71,40 +66,65 @@ void DOT::runCudaVariantImpl(VariantID vid)
 
   if ( vid == Base_CUDA ) {
 
-    Real_ptr dprod;
-    allocData(DataSpace::CudaDevice, dprod, 1);
+    RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1, 1);
+
+    constexpr size_t shmem = sizeof(Real_type)*block_size;
+    const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS(
+        MappingHelper, (dot<block_size>), block_size, shmem);
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      cudaErrchk( cudaMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type),
-                                   cudaMemcpyHostToDevice, res.get_stream() ) );
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1, 1);
+
+      const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = std::min(normal_grid_size, max_grid_size);
 
-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = sizeof(Real_type)*block_size;
-      dot<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(
-          a, b, dprod, m_dot_init, iend );
-      cudaErrchk( cudaGetLastError() );
+      RPlaunchCudaKernel( (dot<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          a, b, dprod, m_dot_init, iend );
 
-      Real_type lprod;
-      cudaErrchk( cudaMemcpyAsync( &lprod, dprod, sizeof(Real_type),
-                                   cudaMemcpyDeviceToHost, res.get_stream() ) );
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_dot += lprod;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(dprod, hdprod, 1, 1);
+      m_dot += hdprod[0];
 
     }
     stopTimer();
 
-    deallocData(DataSpace::CudaDevice, dprod);
+    RAJAPERF_CUDA_REDUCER_TEARDOWN(dprod, hdprod);
+
+  } else {
+    getCout() << "\n  DOT : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+void DOT::runCudaVariantRAJA(VariantID vid)
+{
+  using reduction_policy = std::conditional_t<AlgorithmHelper::atomic, RAJA::cuda_reduce_atomic, RAJA::cuda_reduce>;
+
+  using exec_policy = std::conditional_t<MappingHelper::direct, RAJA::cuda_exec<block_size, true /*async*/>,
+      RAJA::cuda_exec_occ_calc<block_size, true /*async*/>>;
 
-  } else if ( vid == RAJA_CUDA ) {
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  DOT_DATA_SETUP;
+
+  if ( vid == RAJA_CUDA ) {
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::ReduceSum<RAJA::cuda_reduce, Real_type> dot(m_dot_init);
+      RAJA::ReduceSum<reduction_policy, Real_type> dot(m_dot_init);
 
-      RAJA::forall< RAJA::cuda_exec<block_size, true /*async*/> >( res,
+      RAJA::forall<exec_policy>( res,
         RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
         DOT_BODY;
       });
@@ -119,7 +139,162 @@ void DOT::runCudaVariantImpl(VariantID vid)
   }
 }
 
-RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Cuda)
+template < size_t block_size, typename MappingHelper >
+void DOT::runCudaVariantRAJANewReduce(VariantID vid)
+{
+  using exec_policy = std::conditional_t<MappingHelper::direct, RAJA::cuda_exec<block_size, true /*async*/>,
+      RAJA::cuda_exec_occ_calc<block_size, true /*async*/>>;
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  DOT_DATA_SETUP;
+
+  if ( vid == RAJA_CUDA ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tdot = m_dot_init;
+
+      RAJA::forall<exec_policy>( res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+        [=] __device__ (Index_type i, Real_type& dot) {
+          DOT_BODY;
+        }
+      );
+
+      m_dot += static_cast<Real_type>(tdot);
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  DOT : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+void DOT::runCudaVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+      if (run_params.numValidGPUBlockSize() == 0u ||
+          run_params.validGPUBlockSize(block_size)) {
+
+        seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) {
+
+          if ( vid == Base_CUDA ) {
+
+            if (tune_idx == t) {
+
+              setBlockSize(block_size);
+              runCudaVariantBase<block_size, decltype(mapping_helper)>(vid);
+
+            }
+
+            t += 1;
+
+          } else if ( vid == RAJA_CUDA ) {
+
+            seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {
+
+              if (tune_idx == t) {
+
+                setBlockSize(block_size);
+                runCudaVariantRAJA<block_size, decltype(algorithm_helper), decltype(mapping_helper)>(vid);
+
+              }
+
+              t += 1;
+
+            });
+
+            if (tune_idx == t) {
+
+              setBlockSize(block_size);
+              runCudaVariantRAJANewReduce<block_size, decltype(mapping_helper)>(vid);
+
+            }
+
+            t += 1;
+
+          }
+
+        });
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  DOT : Unknown Cuda variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void DOT::setCudaTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+      if (run_params.numValidGPUBlockSize() == 0u ||
+          run_params.validGPUBlockSize(block_size)) {
+
+        seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) {
+
+          if ( vid == Base_CUDA ) {
+
+            auto algorithm_helper = gpu_algorithm::block_atomic_helper{};
+
+            addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                      decltype(mapping_helper)::get_name()+"_"+
+                                      std::to_string(block_size));
+            RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning
+
+          } else if ( vid == RAJA_CUDA ) {
+
+            seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {
+
+              addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                        decltype(mapping_helper)::get_name()+"_"+
+                                        std::to_string(block_size));
+
+            });
+
+            auto algorithm_helper = gpu_algorithm::block_device_helper{};
+
+            addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                      decltype(mapping_helper)::get_name()+"_"+
+                                      "new_"+std::to_string(block_size));
+            RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning
+
+          }
+
+        });
+
+      }
+
+    });
+
+  }
+
+}
 
 } // end namespace stream
 } // end namespace rajaperf
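Note: runCudaVariant above linearizes the tuning space: for each valid block size it visits each mapping helper, and for RAJA_CUDA each algorithm helper plus the trailing "new" reducer, bumping a counter t and running only the combination where t equals tune_idx; setCudaTuningDefinitions must walk the identical order so tuning names and indices agree. A stripped-down sketch of that enumeration scheme (plain C++ with illustrative names, not the suite's helpers):

    #include <cstddef>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main()
    {
      const std::vector<int> block_sizes = {128, 256, 512};
      const std::vector<std::string> mappings = {"direct", "occgs"};
      const std::vector<std::string> algorithms = {"atomic", "device"};
      const std::size_t tune_idx = 5; // runtime selection

      std::size_t t = 0;
      for (int bs : block_sizes) {
        for (const auto& map : mappings) {
          for (const auto& alg : algorithms) {
            if (t == tune_idx) {
              std::printf("run %s_%s_%d\n", alg.c_str(), map.c_str(), bs);
            }
            t += 1; // every combination advances the index, run or not
          }
        }
      }
      return 0;
    }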
diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp
index 7bd1ef277..0c3c914a9 100644
--- a/src/stream/DOT-Hip.cpp
+++ b/src/stream/DOT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -15,6 +15,9 @@
 #include "common/HipDataUtils.hpp"
 
 #include <iostream>
+#include <algorithm>
+#include <type_traits>
+#include <utility>
 
 
 namespace rajaperf
@@ -45,25 +48,16 @@ __global__ void dot(Real_ptr a, Real_ptr b,
     __syncthreads();
   }
 
-#if 1 // serialized access to shared data;
   if ( threadIdx.x == 0 ) {
-    //atomicAdd(dprod, pdot[ 0 ] );
-    RAJA::atomicAdd(RAJA::hip_atomic{}, dprod, pdot[ 0 ] );
+    RAJA::atomicAdd<RAJA::hip_atomic>( dprod, pdot[ 0 ] );
   }
-#else // this doesn't work due to data races
-  if ( threadIdx.x == 0 ) {
-    *dprod += pdot[ 0 ];
-  }
-#endif
-
 }
 
-template < size_t block_size >
-void DOT::runHipVariantImpl(VariantID vid)
+template < size_t block_size, typename MappingHelper >
+void DOT::runHipVariantBase(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
-  const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
 
   auto res{getHipResource()};
@@ -72,41 +66,65 @@ void DOT::runHipVariantImpl(VariantID vid)
 
   if ( vid == Base_HIP ) {
 
-    Real_ptr dprod;
-    allocData(DataSpace::HipDevice, dprod, 1);
+    RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1, 1);
+
+    constexpr size_t shmem = sizeof(Real_type)*block_size;
+    const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS(
+        MappingHelper, (dot<block_size>), block_size, shmem);
 
     startTimer();
    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      hipErrchk( hipMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type),
-                                 hipMemcpyHostToDevice, res.get_stream() ) );
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1, 1);
+
+      const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = std::min(normal_grid_size, max_grid_size);
 
-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = sizeof(Real_type)*block_size;
-      hipLaunchKernelGGL((dot<block_size>), dim3(grid_size), dim3(block_size),
-                         shmem, res.get_stream(),
+      RPlaunchHipKernel( (dot<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
                          a, b, dprod, m_dot_init, iend );
-      hipErrchk( hipGetLastError() );
 
-      Real_type lprod;
-      hipErrchk( hipMemcpyAsync( &lprod, dprod, sizeof(Real_type),
-                                 hipMemcpyDeviceToHost, res.get_stream() ) );
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_dot += lprod;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(dprod, hdprod, 1, 1);
+      m_dot += hdprod[0];
 
     }
     stopTimer();
 
-    deallocData(DataSpace::HipDevice, dprod);
+    RAJAPERF_HIP_REDUCER_TEARDOWN(dprod, hdprod);
 
-  } else if ( vid == RAJA_HIP ) {
+  } else {
+    getCout() << "\n  DOT : Unknown Hip variant id = " << vid << std::endl;
+  }
+}
+
+template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+void DOT::runHipVariantRAJA(VariantID vid)
+{
+  using reduction_policy = std::conditional_t<AlgorithmHelper::atomic, RAJA::hip_reduce_atomic, RAJA::hip_reduce>;
+
+  using exec_policy = std::conditional_t<MappingHelper::direct, RAJA::hip_exec<block_size, true /*async*/>,
+      RAJA::hip_exec_occ_calc<block_size, true /*async*/>>;
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getHipResource()};
+
+  DOT_DATA_SETUP;
+
+  if ( vid == RAJA_HIP ) {
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::ReduceSum<RAJA::hip_reduce, Real_type> dot(m_dot_init);
+      RAJA::ReduceSum<reduction_policy, Real_type> dot(m_dot_init);
 
-      RAJA::forall< RAJA::hip_exec<block_size, true /*async*/> >( res,
+      RAJA::forall<exec_policy>( res,
         RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
         DOT_BODY;
       });
@@ -121,7 +139,160 @@ void DOT::runHipVariantImpl(VariantID vid)
   }
 }
 
-RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Hip)
+template < size_t block_size, typename MappingHelper >
+void DOT::runHipVariantRAJANewReduce(VariantID vid)
+{
+  using exec_policy = std::conditional_t<MappingHelper::direct, RAJA::hip_exec<block_size, true /*async*/>,
+      RAJA::hip_exec_occ_calc<block_size, true /*async*/>>;
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getHipResource()};
+
+  DOT_DATA_SETUP;
+
+  if ( vid == RAJA_HIP ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tdot = m_dot_init;
+
+      RAJA::forall<exec_policy>( res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+        [=] __device__ (Index_type i, Real_type& dot) {
+          DOT_BODY;
+        }
+      );
+
+      m_dot += static_cast<Real_type>(tdot);
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  DOT : Unknown HIP variant id = " << vid << std::endl;
+  }
+}
+
+void DOT::runHipVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+      if (run_params.numValidGPUBlockSize() == 0u ||
+          run_params.validGPUBlockSize(block_size)) {
+
+        seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) {
+
+          if ( vid == Base_HIP ) {
+
+            if (tune_idx == t) {
+
+              setBlockSize(block_size);
+              runHipVariantBase<block_size, decltype(mapping_helper)>(vid);
+
+            }
+
+            t += 1;
+
+          } else if ( vid == RAJA_HIP ) {
+
+            seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {
+
+              if (tune_idx == t) {
+
+                setBlockSize(block_size);
+                runHipVariantRAJA<block_size, decltype(algorithm_helper), decltype(mapping_helper)>(vid);
+
+              }
+
+              t += 1;
+
+            });
+
+            if (tune_idx == t) {
+
+              setBlockSize(block_size);
+              runHipVariantRAJANewReduce<block_size, decltype(mapping_helper)>(vid);
+
+            }
+
+            t += 1;
+
+          }
+
+        });
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  DOT : Unknown Hip variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void DOT::setHipTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+      if (run_params.numValidGPUBlockSize() == 0u ||
+          run_params.validGPUBlockSize(block_size)) {
+
+        seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) {
+
+          if ( vid == Base_HIP ) {
+
+            auto algorithm_helper = gpu_algorithm::block_atomic_helper{};
+
+            addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                      decltype(mapping_helper)::get_name()+"_"+
+                                      std::to_string(block_size));
+
+          } else if ( vid == RAJA_HIP ) {
+
+            seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {
+
+              addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                        decltype(mapping_helper)::get_name()+"_"+
+                                        std::to_string(block_size));
+
+            });
+
+            auto algorithm_helper = gpu_algorithm::block_device_helper{};
+
+            addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                      decltype(mapping_helper)::get_name()+"_"+
+                                      "new_"+std::to_string(block_size));
+
+          }
+
+        });
+
+      }
+
+    });
+
+  }
+
+}
 
 } // end namespace stream
 } // end namespace rajaperf
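Note: the "new" tunings above swap the RAJA::ReduceSum object for the RAJA::expt::Reduce parameter interface: the caller passes a pointer to a plain Real_type, and the lambda receives the running sum by reference as an extra argument. A sequential stand-in showing the same call shape (this is not RAJA's implementation, only an illustration of the pattern):

    #include <cassert>

    using Index_type = long;
    using Real_type  = double;

    // Illustrative sequential stand-in for forall + expt::Reduce: the sum
    // accumulates across iterations and lands back in *target on return.
    template <typename Body>
    void forall_reduce(Index_type begin, Index_type end,
                       Real_type* target, Body body)
    {
      Real_type sum = *target;     // seed with the caller's initial value
      for (Index_type i = begin; i < end; ++i) {
        body(i, sum);              // lambda sees the reduction by reference
      }
      *target = sum;
    }

    int main()
    {
      const Index_type n = 4;
      Real_type a[] = {1.0, 2.0, 3.0, 4.0};
      Real_type b[] = {2.0, 2.0, 2.0, 2.0};

      Real_type tdot = 0.0;        // plays the role of tdot = m_dot_init
      forall_reduce(0, n, &tdot, [=](Index_type i, Real_type& dot) {
        dot += a[i] * b[i];        // DOT_BODY
      });
      assert(tdot == 20.0);
      return 0;
    }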
diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp
index 6b7d67e0e..d7112336a 100644
--- a/src/stream/DOT-OMP.cpp
+++ b/src/stream/DOT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -18,7 +18,7 @@ namespace stream
 {
 
-void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void DOT::runOpenMPVariant(VariantID vid, size_t tune_idx)
 {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
 
@@ -76,20 +76,46 @@ void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
     case RAJA_OpenMP : {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
 
-        RAJA::ReduceSum<RAJA::omp_reduce, Real_type> dot(m_dot_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-        RAJA::forall<RAJA::omp_parallel_for_exec>(
-          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-            DOT_BODY;
-        });
+          RAJA::ReduceSum<RAJA::omp_reduce, Real_type> dot(m_dot_init);
 
-        m_dot += dot;
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+              DOT_BODY;
+          });
+
+          m_dot += dot;
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          Real_type tdot = m_dot_init;
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+            [=] (Index_type i, Real_type& dot) {
+              DOT_BODY;
+            }
+          );
+
+          m_dot += static_cast<Real_type>(tdot);
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n  DOT : Unknown OpenMP tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();
 
       break;
     }
@@ -102,8 +128,17 @@ void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
 #else
   RAJA_UNUSED_VAR(vid);
+  RAJA_UNUSED_VAR(tune_idx);
 #endif
 }
 
+void DOT::setOpenMPTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_OpenMP) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace stream
 } // end namespace rajaperf
diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp
index 7ab5d578e..238f8fbae 100644
--- a/src/stream/DOT-OMPTarget.cpp
+++ b/src/stream/DOT-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -26,7 +26,7 @@ namespace stream
   //
   const size_t threads_per_team = 256;
 
-void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void DOT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -34,44 +34,89 @@ void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
 
   DOT_DATA_SETUP;
 
-  if ( vid == Base_OpenMPTarget ) {
+  switch ( vid ) {
 
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+    case Base_OpenMPTarget : {
 
-      Real_type dot = m_dot_init;
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      #pragma omp target is_device_ptr(a, b) device( did ) map(tofrom:dot)
-      #pragma omp teams distribute parallel for reduction(+:dot) \
-                  thread_limit(threads_per_team) schedule(static, 1)
-      for (Index_type i = ibegin; i < iend; ++i ) {
-        DOT_BODY;
-      }
+        Real_type dot = m_dot_init;
+
+        #pragma omp target is_device_ptr(a, b) device( did ) map(tofrom:dot)
+        #pragma omp teams distribute parallel for reduction(+:dot) \
+                    thread_limit(threads_per_team) schedule(static, 1)
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          DOT_BODY;
+        }
+
+        m_dot += dot;
 
-      m_dot += dot;
+      }
+      stopTimer();
 
+      break;
     }
-    stopTimer();
 
-  } else if ( vid == RAJA_OpenMPTarget ) {
+    case RAJA_OpenMPTarget : {
+
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          RAJA::ReduceSum<RAJA::omp_target_reduce, Real_type> dot(m_dot_init);
+
+          RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+            RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+              DOT_BODY;
+          });
+
+          m_dot += static_cast<Real_type>(dot.get());
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
 
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::ReduceSum<RAJA::omp_target_reduce, Real_type> dot(m_dot_init);
+          Real_type tdot = m_dot_init;
 
-      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-          DOT_BODY;
-      });
+          RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+            [=] (Index_type i, Real_type& dot) {
+              DOT_BODY;
+            }
+          );
 
-      m_dot += static_cast<Real_type>(dot.get());
+          m_dot += static_cast<Real_type>(tdot);
 
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n  DOT : Unknown OMP Target tuning index = " << tune_idx << std::endl;
+      }
+
+      break;
+    }
+
+    default : {
+      getCout() << "\n  DOT : Unknown OMP Target variant id = " << vid << std::endl;
     }
-    stopTimer();
 
-  } else {
-    getCout() << "\n  DOT : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+
+}
+
+void DOT::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_OpenMPTarget) {
+    addVariantTuningName(vid, "new");
   }
 }
diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp
index fe7568191..4d359775f 100644
--- a/src/stream/DOT-Seq.cpp
+++ b/src/stream/DOT-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -18,8 +18,11 @@ namespace stream
 {
 
-void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void DOT::runSeqVariant(VariantID vid, size_t tune_idx)
 {
+#if !defined(RUN_RAJA_SEQ)
+  RAJA_UNUSED_VAR(tune_idx);
+#endif
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
@@ -73,20 +76,45 @@ void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
     case RAJA_Seq : {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
 
-        RAJA::ReduceSum<RAJA::seq_reduce, Real_type> dot(m_dot_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-        RAJA::forall<RAJA::seq_exec>(
-          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-            DOT_BODY;
-        });
+          RAJA::ReduceSum<RAJA::seq_reduce, Real_type> dot(m_dot_init);
+
+          RAJA::forall<RAJA::seq_exec>(
+            RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+              DOT_BODY;
+          });
+
+          m_dot += static_cast<Real_type>(dot.get());
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          Real_type tdot = m_dot_init;
 
-        m_dot += static_cast<Real_type>(dot.get());
+          RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+            [=] (Index_type i, Real_type& dot) {
+              DOT_BODY;
+            }
+          );
 
+          m_dot += static_cast<Real_type>(tdot);
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n  DOT : Unknown Seq tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();
 
       break;
     }
@@ -100,5 +128,13 @@ void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
 }
 
+void DOT::setSeqTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_Seq) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace stream
 } // end namespace rajaperf
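Note: in the base GPU variants of DOT above, each block tree-reduces its partial products in shared memory and a single thread then atomically adds the block result to the global accumulator; the DOT-Sycl file below instead hands that job to sycl::reduction. A plain-C++ sketch of the per-block halving loop (block_size must be a power of two, as in the suite's block-size tunings; values are illustrative):

    #include <cassert>
    #include <vector>

    int main()
    {
      const unsigned block_size = 8;  // power of two, as in the suite
      std::vector<double> pdot = {1, 2, 3, 4, 5, 6, 7, 8}; // per-thread partials

      // Mirrors the shared-memory loop in the dot kernels: halve the active
      // range each step, adding the upper half into the lower half.
      for (unsigned s = block_size / 2; s > 0; s /= 2) {
        for (unsigned t = 0; t < s; ++t) {  // "threads" below the stride
          pdot[t] += pdot[t + s];
        }
        // __syncthreads() would go here on a real GPU
      }

      assert(pdot[0] == 36.0);  // the block's contribution to dprod
      return 0;
    }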
diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp
new file mode 100644
index 000000000..250f0b680
--- /dev/null
+++ b/src/stream/DOT-Sycl.cpp
@@ -0,0 +1,103 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "DOT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template <size_t work_group_size >
+void DOT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  DOT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    Real_ptr dot;
+    allocAndInitSyclDeviceData(dot, &m_dot_init, 1, qu);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      initSyclDeviceData(dot, &m_dot_init, 1, qu);
+
+      qu->submit([&] (sycl::handler& h) {
+
+        auto sumReduction = sycl::reduction(dot, sycl::plus<Real_type>());
+
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       sumReduction,
+                       [=] (sycl::nd_item<1> item, auto& dot) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            DOT_BODY;
+          }
+
+        });
+      });
+
+      Real_type ldot;
+      Real_ptr pldot = &ldot;
+      getSyclDeviceData(pldot, dot, 1, qu);
+      m_dot += ldot;
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tdot = m_dot_init;
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false /*async*/> >(
+        res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+        [=] (Index_type i, Real_type& dot) {
+          DOT_BODY;
+        }
+      );
+
+      m_dot += static_cast<Real_type>(tdot);
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  DOT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp
index cc32be5f2..5249c8ebd 100644
--- a/src/stream/DOT.cpp
+++ b/src/stream/DOT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,9 +28,10 @@ DOT::DOT(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) +
-                  (0*sizeof(Real_type) + 2*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) +
+                      2*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 
   setUsesFeature( Forall );
@@ -53,6 +54,9 @@ DOT::DOT(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp
index 5912c120a..2626dbc5e 100644
--- a/src/stream/DOT.hpp
+++ b/src/stream/DOT.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,18 +51,37 @@ class DOT : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
 
+  void setSeqTuningDefinitions(VariantID vid);
+  void setOpenMPTuningDefinitions(VariantID vid);
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
-  template < size_t block_size >
-  void runCudaVariantImpl(VariantID vid);
-  template < size_t block_size >
-  void runHipVariantImpl(VariantID vid);
+  void setOpenMPTargetTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
+  template < size_t block_size, typename MappingHelper >
+  void runCudaVariantBase(VariantID vid);
+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runCudaVariantRAJA(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runCudaVariantRAJANewReduce(VariantID vid);
+
+  template < size_t block_size, typename MappingHelper >
+  void runHipVariantBase(VariantID vid);
+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runHipVariantRAJA(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runHipVariantRAJANewReduce(VariantID vid);
+
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_a;
   Real_ptr m_b;
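Note: the gpu_block_sizes_type change above (gpu_block_size::make_list_type to integer::make_gpu_block_size_list_type) still yields a compile-time list of block sizes; seq_for then calls a generic lambda once per entry, so a runtime tune index can select a compile-time block-size constant. A minimal sketch of that dispatch idiom (for_each_block_size is a hypothetical reimplementation, not the suite's seq_for):

    #include <cstddef>
    #include <cstdio>
    #include <type_traits>

    // Hypothetical stand-in for the suite's seq_for over a compile-time list.
    template <std::size_t... Ns, typename F>
    void for_each_block_size(F&& f)
    {
      (f(std::integral_constant<std::size_t, Ns>{}), ...); // C++17 fold
    }

    template <std::size_t block_size>
    void run_impl() { std::printf("instantiated for block %zu\n", block_size); }

    int main()
    {
      std::size_t tune_idx = 1, t = 0;  // runtime tuning selection
      for_each_block_size<128, 256, 512>([&](auto block_size) {
        if (t == tune_idx) {
          // ::value recovers the compile-time constant from the tag object
          run_impl<decltype(block_size)::value>();
        }
        t += 1;
      });
      return 0;
    }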
diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp
index adfebfd01..55731255b 100644
--- a/src/stream/MUL-Cuda.cpp
+++ b/src/stream/MUL-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,9 +51,11 @@ void MUL::runCudaVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      mul<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( b, c, alpha,
-                                                                           iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (mul<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          b, c, alpha, iend );
 
     }
     stopTimer();
@@ -63,13 +65,18 @@ void MUL::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      auto mul_lambda = [=] __device__ (Index_type i) {
+        MUL_BODY;
+      };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      lambda_cuda_forall<<<grid_size, block_size, shmem, res.get_stream()>>>(
-        ibegin, iend, [=] __device__ (Index_type i) {
-          MUL_BODY;
-        });
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (lambda_cuda_forall<decltype(mul_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, mul_lambda );
 
     }
     stopTimer();
diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp
index 8a2394612..0990ac09b 100644
--- a/src/stream/MUL-Hip.cpp
+++ b/src/stream/MUL-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,9 +51,11 @@ void MUL::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((mul<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), b, c, alpha,
-                         iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (mul<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         b, c, alpha, iend );
 
     }
     stopTimer();
@@ -69,9 +71,12 @@ void MUL::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((lambda_hip_forall<decltype(mul_lambda)>),
-                         grid_size, block_size, shmem, res.get_stream(), ibegin, iend, mul_lambda);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (lambda_hip_forall<decltype(mul_lambda)>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend, mul_lambda );
 
     }
     stopTimer();
diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp
index 3369d0f3d..e5a17864e 100644
--- a/src/stream/MUL-OMP.cpp
+++ b/src/stream/MUL-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp
index c5f20d6b3..07edb732f 100644
--- a/src/stream/MUL-OMPTarget.cpp
+++ b/src/stream/MUL-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp
index 9107945fd..8bffdb3ca 100644
--- a/src/stream/MUL-Seq.cpp
+++ b/src/stream/MUL-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp
new file mode 100644
index 000000000..01be5d872
--- /dev/null
+++ b/src/stream/MUL-Sycl.cpp
@@ -0,0 +1,80 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "MUL.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template <size_t work_group_size >
+void MUL::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  MUL_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            MUL_BODY
+          }
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false /*async*/> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        MUL_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  MUL : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MUL, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp
index fba825bf6..eedea75c7 100644
--- a/src/stream/MUL.cpp
+++ b/src/stream/MUL.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@ MUL::MUL(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
   setUsesFeature( Forall );
@@ -53,6 +54,9 @@ MUL::MUL(const RunParams& params)
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp
index 3db59092a..6edd6381a 100644
--- a/src/stream/MUL.hpp
+++ b/src/stream/MUL.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -52,18 +52,24 @@ class MUL : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_b;
   Real_ptr m_c;
diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp
index af3af1c63..89f931f6c 100644
--- a/src/stream/TRIAD-Cuda.cpp
+++ b/src/stream/TRIAD-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,9 +51,11 @@ void TRIAD::runCudaVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
      constexpr size_t shmem = 0;
-      triad<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( a, b, c, alpha,
-                                                                             iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (triad<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          a, b, c, alpha, iend );
 
     }
     stopTimer();
@@ -63,13 +65,18 @@ void TRIAD::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      auto triad_lambda = [=] __device__ (Index_type i) {
+        TRIAD_BODY;
+      };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      lambda_cuda_forall<<<grid_size, block_size, shmem, res.get_stream()>>>(
-        ibegin, iend, [=] __device__ (Index_type i) {
-          TRIAD_BODY;
-        });
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (lambda_cuda_forall<decltype(triad_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, triad_lambda );
 
     }
     stopTimer();
diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp
index a8a5b9f99..aebaa3ec1 100644
--- a/src/stream/TRIAD-Hip.cpp
+++ b/src/stream/TRIAD-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,9 +51,11 @@ void TRIAD::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((triad<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), a, b, c, alpha,
-                         iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (triad<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         a, b, c, alpha, iend );
 
     }
     stopTimer();
@@ -69,9 +71,12 @@ void TRIAD::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((lambda_hip_forall<decltype(triad_lambda)>),
-                         grid_size, block_size, shmem, res.get_stream(), ibegin, iend, triad_lambda);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (lambda_hip_forall<decltype(triad_lambda)>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend, triad_lambda );
 
     }
     stopTimer();
diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp
index 5d9832d95..abbadb240 100644
--- a/src/stream/TRIAD-OMP.cpp
+++ b/src/stream/TRIAD-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp
index dfea3158d..5ec18d155 100644
--- a/src/stream/TRIAD-OMPTarget.cpp
+++ b/src/stream/TRIAD-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp
index 96ab6ccea..132892f76 100644
--- a/src/stream/TRIAD-Seq.cpp
+++ b/src/stream/TRIAD-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp
new file mode 100644
index 000000000..c8ecafdf7
--- /dev/null
+++ b/src/stream/TRIAD-Sycl.cpp
@@ -0,0 +1,82 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "TRIAD.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template <size_t work_group_size >
+void TRIAD::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  TRIAD_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            TRIAD_BODY
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false /*async*/> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        TRIAD_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  TRIAD : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRIAD, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp
index d9897618c..da6386755 100644
--- a/src/stream/TRIAD.cpp
+++ b/src/stream/TRIAD.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@ TRIAD::TRIAD(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 
   checksum_scale_factor = 0.001 *
@@ -57,6 +58,9 @@ TRIAD::TRIAD(const RunParams& params)
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp
index 3f65bf804..afb06cd3c 100644
--- a/src/stream/TRIAD.hpp
+++ b/src/stream/TRIAD.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -53,18 +53,24 @@ class TRIAD : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_a;
   Real_ptr m_b;
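Note: the TRIAD.cpp hunk above encodes the classic STREAM triad traffic: a[i] = b[i] + alpha * c[i] reads two words, writes one, and performs two flops per element. A quick check of those per-rep totals (8-byte Real_type assumed; the problem size is illustrative):

    #include <cassert>
    #include <cstddef>

    int main()
    {
      using Real_type = double;        // assumed 8-byte suite default
      const std::size_t n = 1000000;   // stands in for getActualProblemSize()
      // a[i] = b[i] + alpha * c[i];  reads b and c, writes a, 1 mul + 1 add
      const std::size_t bytes_read    = 2 * sizeof(Real_type) * n;
      const std::size_t bytes_written = 1 * sizeof(Real_type) * n;
      const std::size_t flops         = 2 * n;
      assert(bytes_read == 16000000);
      assert(bytes_written == 8000000);
      assert(flops == 2000000);
      return 0;
    }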
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 001c81190..88b61bbe5 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
@@ -13,13 +13,23 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS
       lcals
       polybench
       stream
-      algorithm)
+      algorithm
+      comm)
 
 list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS})
-
-raja_add_test(
-  NAME test-raja-perf-suite
-  SOURCES test-raja-perf-suite.cpp
-  DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS}
-  )
+
+if (RAJA_PERFSUITE_ENABLE_MPI)
+  raja_add_test(
+    NAME test-raja-perf-suite
+    SOURCES test-raja-perf-suite.cpp
+    DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS}
+    NUM_MPI_TASKS ${RAJA_PERFSUITE_NUM_MPI_TASKS}
+    )
+else()
+  raja_add_test(
+    NAME test-raja-perf-suite
+    SOURCES test-raja-perf-suite.cpp
+    DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS}
+    )
+endif()
 
 target_include_directories(test-raja-perf-suite.exe PRIVATE ${PROJECT_SOURCE_DIR}/src)
diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp
index 26ebcbda5..6f36958c0 100644
--- a/test/test-raja-perf-suite.cpp
+++ b/test/test-raja-perf-suite.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -8,6 +8,10 @@
 
 #include "gtest/gtest.h"
 
+#if defined(RUN_KOKKOS)
+#include <Kokkos_Core.hpp>
+#endif
+
 #include "common/Executor.hpp"
 #include "common/KernelBase.hpp"
 
@@ -16,6 +20,33 @@
 #include <string>
 #include <vector>
 
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+#include <mpi.h>
+#endif
+
+int main( int argc, char** argv )
+{
+  testing::InitGoogleTest(&argc, argv);
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+  MPI_Init(&argc, &argv);
+#endif
+#if defined(RUN_KOKKOS)
+  Kokkos::initialize(argc, argv);
+#endif
+
+  int res = RUN_ALL_TESTS();
+
+#if defined(RUN_KOKKOS)
+  Kokkos::finalize();
+#endif
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+  MPI_Finalize();
+#endif
+
+  return res;
+}
+
 TEST(ShortSuiteTest, Basic)
 {
 
@@ -32,7 +63,7 @@ TEST(ShortSuiteTest, Basic)
      (HIP_VERSION_MAJOR < 5 || \
      (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1))
   sargv.emplace_back(std::string("--exclude-kernels"));
-  sargv.emplace_back(std::string("HALOEXCHANGE_FUSED"));
+  sargv.emplace_back(std::string("HALO_PACKING_FUSED"));
 #endif
 
 #if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11)
diff --git a/tpl/RAJA b/tpl/RAJA
index 9b5f61edf..378199aac 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit 9b5f61edf3aa1e6fdbc9a4b30828c81504639963
+Subproject commit 378199aac342ee21c2ddfbcbb48413bd1dfac612