diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index abc7b14b9..ef751105d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,75 +1,91 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +# DESCRIPTION: ############################################################################### # General GitLab pipelines configurations for supercomputers and Linux clusters # at Lawrence Livermore National Laboratory (LLNL). -# # This entire pipeline is LLNL-specific # -# Important note: This file is a copy of the template provided by -# llnl/radiuss-shared-ci. It should not require any change from the project to -# get started but could feature project-specific stages. +# Important note: This file is a template provided by llnl/radiuss-shared-ci. +# It remains to set variable values, change the reference to the +# radiuss-shared-ci repo, and opt in or out of optional features. The project +# can then extend it with additional stages. # -# Instead, each project should provide: -# - .gitlab/subscribed-pipelines.yml +# In addition, each project should copy over and complete: # - .gitlab/custom-jobs-and-variables.yml -# - .gitlab/${MACHINE}-build-and-test-extra.yml +# - .gitlab/subscribed-pipelines.yml +# +# The jobs should be specified in a file local to the project, +# - .gitlab/jobs/${CI_MACHINE}.yml +# or generated (see LLNL/Umpire for an example). ############################################################################### # We define the following GitLab pipeline variables: variables: -# Required information about GitHub repository - GITHUB_PROJECT_NAME: "RAJAPerf" - GITHUB_PROJECT_ORG: "LLNL" -# Use the umdev service user to run CI. This prevents from running pipelines as -# an actual user. +##### LC GITLAB CONFIGURATION +# Use an LLNL service user to run CI. This prevents pipelines from running as +# an actual user. LLNL_SERVICE_USER: rajasa # Use the service user workspace. Solves permission issues, stores everything # at the same location whoever triggers a pipeline. -# CUSTOM_CI_BUILDS_DIR: "" +# CUSTOM_CI_BUILDS_DIR: "/usr/workspace/rajasa/gitlab-runner" # Tells Gitlab to recursively update the submodules when cloning the project. GIT_SUBMODULE_STRATEGY: recursive -# We build the projects in the CI clone directory. -# TODO: add a clean-up mechanism + +##### PROJECT VARIABLES +# We build the projects in the CI clone directory (used in the +# scripts/gitlab/build_and_test.sh script). +# TODO: add a clean-up mechanism. BUILD_ROOT: ${CI_PROJECT_DIR} + +##### SHARED_CI CONFIGURATION +# Required information about GitHub repository + GITHUB_PROJECT_NAME: "RAJAPerf" + GITHUB_PROJECT_ORG: "LLNL" # Set the build-and-test command. - BUILD_AND_TEST_CMD: "./scripts/gitlab/build_and_test.sh" -# Override the list of branch that will skip the "draft PR test". -# Add protected branches here. Defaults to "develop main master". -# ALWAYS_RUN_LIST: "develop main" + JOB_CMD: + value: "./scripts/gitlab/build_and_test.sh" + expand: false +# Override the pattern describing branches that will skip the "draft PR filter +# test". Add protected branches here. See default value in +# preliminary-ignore-draft-pr.yml.
+# ALWAYS_RUN_PATTERN: "^develop$|^main$|^v[0-9.]*-RC$" -# We organize the CI on Gitlab in sub-pipelines. Each sub-pipeline corresponds -# to a test phase on a given machine. +# We organize the build-and-test stage with sub-pipelines. Each sub-pipeline +# corresponds to a test batch on a given machine. # High level stages stages: - - machine-checks + - prerequisites - build-and-test -# Template for jobs triggering a build-and-test sub-pipelines: +# Template for jobs triggering a build-and-test sub-pipeline: .build-and-test: stage: build-and-test trigger: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: v2023.06.0 - file: '${CI_MACHINE}-build-and-test.yml' - - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' + ref: 'v2024.06.0' + file: 'pipelines/${CI_MACHINE}.yml' + - artifact: '${CI_MACHINE}-jobs.yml' + job: 'generate-job-lists' strategy: depend forward: pipeline_variables: true include: - # checks preliminary to running the actual CI test (optional) + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' + # [Optional] checks preliminary to running the actual CI test #- project: 'radiuss/radiuss-shared-ci' - # ref: v2023.03.1 - # file: 'preliminary-ignore-draft-pr.yml' + # ref: 'v2024.06.0' + # file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index a4081efe1..931d1961b 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -15,19 +15,30 @@ variables: # Ruby # Arguments for top level allocation - RUBY_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=45 --nodes=1" + RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --time=40 --nodes=1" # Arguments for job level allocation - RUBY_BUILD_AND_TEST_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=30 --nodes=1" +# Note: We repeat the reservation, necessary when jobs are manually re-triggered. 
+ RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby - PROJECT_RUBY_DEPS: "" + PROJECT_RUBY_DEPS: "^blt@develop " + +# Poodle +# Arguments for top level allocation + POODLE_SHARED_ALLOC: "--exclusive --time=40 --nodes=1" +# Arguments for job level allocation + POODLE_JOB_ALLOC: "--nodes=1" +# Project specific variants for poodle + PROJECT_POODLE_VARIANTS: "~shared +openmp" +# Project specific deps for poodle + PROJECT_POODLE_DEPS: "^blt@develop " # Corona # Arguments for top level allocation - CORONA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + CORONA_SHARED_ALLOC: "--exclusive --time-limit=12m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation - CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=30m --nodes=1 --begin-time=+5s" + CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_CORONA_VARIANTS: "~shared ~openmp" # Project specific deps for corona @@ -35,28 +46,28 @@ variables: # Tioga # Arguments for top level allocation - TIOGA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + TIOGA_SHARED_ALLOC: "--queue=pci --exclusive --time-limit=26m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation - TIOGA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1 --begin-time=+5s" -# Project specific variants for corona - PROJECT_TIOGA_VARIANTS: "~shared ~openmp" -# Project specific deps for corona + TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" +# Project specific variants for tioga + PROJECT_TIOGA_VARIANTS: "~shared +openmp" +# Project specific deps for tioga PROJECT_TIOGA_DEPS: "^blt@develop " # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 30" + LASSEN_JOB_ALLOC: "1 -W 20 -q pci" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70" # Project specific deps for lassen - PROJECT_LASSEN_DEPS: "" + PROJECT_LASSEN_DEPS: "^blt@develop " # Configuration shared by build and test jobs specific to this project. # Not all configuration can be shared. Here projects can fine tune the # CI behavior. # See Umpire for an example (export junit test reports). -.custom_build_and_test: +.custom_job: artifacts: reports: junit: junit.xml diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/jobs/corona.yml similarity index 76% rename from .gitlab/corona-build-and-test-extra.yml rename to .gitlab/jobs/corona.yml index 03d67218a..8fec233c5 100644 --- a/.gitlab/corona-build-and-test-extra.yml +++ b/.gitlab/jobs/corona.yml @@ -1,11 +1,18 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################# +# Override reproducer section to define project specific variables. 
+.corona_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/jobs/lassen.yml similarity index 50% rename from .gitlab/lassen-build-and-test-extra.yml rename to .gitlab/jobs/lassen.yml index 68850e5e8..c6eacf864 100644 --- a/.gitlab/lassen-build-and-test-extra.yml +++ b/.gitlab/jobs/lassen.yml @@ -1,11 +1,18 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## +# Override reproducer section to define project specific variables. +.lassen_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -16,18 +23,10 @@ # Overriding shared spec: Longer allocation + extra flags xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@=16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.2.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" - extends: .build_and_test_on_lassen - -# Overriding shared spec: Longer allocation + extra flags -xl_2022_08_19_gcc_8_3_1_cuda_11_7_0: - variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.7.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" - MODULE_LIST: "cuda/11.7.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" - extends: .build_and_test_on_lassen + LASSEN_JOB_ALLOC: "1 -W 60 -q pci" + extends: .job_on_lassen ############ @@ -37,16 +36,24 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_7_0: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. 
-########## -# CUDA -########## +gcc_8_3_1: + variables: + SPEC: " ~shared +openmp %gcc@=8.3.1 ${PROJECT_LASSEN_DEPS}" + extends: .job_on_lassen gcc_8_3_1_cuda_11_5_0_ats_disabled: - extends: .build_and_test_on_lassen + extends: .job_on_lassen + variables: + SPEC: " ~shared +openmp +cuda %gcc@=8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + MODULE_LIST: "cuda/11.5.0" + LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" + +gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi: + extends: .job_on_lassen variables: - SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers" + SPEC: " ~shared +openmp +cuda +mpi %gcc@=8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.5.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 30" + LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" ########## # OTHERS ########## @@ -54,18 +61,18 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled: clang_13_0_1_libcpp: variables: - SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\"" - extends: .build_and_test_on_lassen + SPEC: " ~shared +openmp %clang@=13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ${PROJECT_LASSEN_DEPS}" + extends: .job_on_lassen #clang_14_0_5_asan: # variables: -# SPEC: " ~shared +openmp %clang@14.0.5 cxxflags==\"-fsanitize=address\"" +# SPEC: " ~shared +openmp %clang@=14.0.5 cxxflags==\"-fsanitize=address\" ${PROJECT_LASSEN_DEPS}" # ASAN_OPTIONS: "detect_leaks=1" # LSAN_OPTIONS: "suppressions=${CI_PROJECT_DIR}/tpl/RAJA/suppressions.asan" -# extends: .build_and_test_on_lassen +# extends: .job_on_lassen # Activated in RAJA, but we don't use desul atomics here #gcc_8_3_1_cuda_10_1_168_desul_atomics: # variables: -# SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" -# extends: .build_and_test_on_lassen +# SPEC: "+openmp +cuda +desul %gcc@=8.3.1 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" +# extends: .job_on_lassen diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml new file mode 100644 index 000000000..ed18f60f5 --- /dev/null +++ b/.gitlab/jobs/poodle.yml @@ -0,0 +1,55 @@ +############################################################################## +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +# Override reproducer section to define project specific variables. +.poodle_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} so that +# the comparison with the original job is easier.
+ +clang_14_0_6: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@=14.0.6 ${PROJECT_POODLE_DEPS}" + extends: .job_on_poodle + +gcc_10_3_1: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@=10.3.1 ${PROJECT_POODLE_DEPS}" + extends: .job_on_poodle + +intel_19_1_2_gcc_10_3_1: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" + extends: .job_on_poodle + +intel_2022_1_0: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=2022.1.0 ${PROJECT_POODLE_DEPS}" + allow_failure: true + extends: .job_on_poodle + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. + +intel_2022_1_0_mpi: + variables: + SPEC: "~shared +openmp +mpi %intel@=2022.1.0 ^mvapich2 ${PROJECT_POODLE_DEPS}" + allow_failure: true + extends: .job_on_poodle diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/jobs/ruby.yml similarity index 51% rename from .gitlab/ruby-build-and-test-extra.yml rename to .gitlab/jobs/ruby.yml index da320f4f8..3502ed3fb 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/jobs/ruby.yml @@ -1,35 +1,46 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## -######################## +# Override reproducer section to define project specific variables. +.ruby_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + +######################## # Overridden shared jobs ######################## # We duplicate the shared jobs description and add necessary changes for RAJA. # We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that # the comparison with the original job is easier. -# Overriding shared config for longer run and algorithm variants clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask %clang@14.0.6" - extends: .build_and_test_on_ruby + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@=14.0.6 ${PROJECT_RUBY_DEPS}" + extends: .job_on_ruby gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask %gcc@10.3.1" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@=10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" - extends: .build_and_test_on_ruby + extends: .job_on_ruby -intel_19_1_2_gcc_8_5_0: +intel_19_1_2_gcc_10_3_1: variables: - SPEC: " +openmp %intel@19.1.2.gcc.8.5.0" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" - extends: .build_and_test_on_ruby + extends: .job_on_ruby + +intel_2022_1_0: + variables: + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}" + extends: .job_on_ruby ############ # Extra jobs ############ @@ -37,3 +48,8 @@ intel_19_1_2_gcc_8_5_0: # We do not recommend using ${PROJECT__VARIANTS} and # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here.
+ +intel_2022_1_0_mpi: + variables: + SPEC: "~shared +openmp +mpi %intel@=2022.1.0 ^mvapich2 ${PROJECT_RUBY_DEPS}" + extends: .job_on_ruby diff --git a/.gitlab/tioga-build-and-test-extra.yml b/.gitlab/jobs/tioga.yml similarity index 57% rename from .gitlab/tioga-build-and-test-extra.yml rename to .gitlab/jobs/tioga.yml index 02a2feef6..bcf9eccb8 100644 --- a/.gitlab/tioga-build-and-test-extra.yml +++ b/.gitlab/jobs/tioga.yml @@ -1,11 +1,18 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################# +# Override reproducer section to define project specific variables. +.tioga_reproducer_vars: + script: + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -15,8 +22,6 @@ # No overridden jobs so far. -# In post-build phase, deallocate resources. - ############ # Extra jobs ############ @@ -24,11 +29,13 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -# With GitLab CI, included files cannot be empty. -#variables: -# INCLUDED_FILE_CANNOT_BE_EMPTY: "True" +rocmcc_6_1_1_hip_openmp: + variables: + SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ${PROJECT_TIOGA_DEPS}" + extends: .job_on_tioga -rocmcc_5_4_3_hip_openmp: +rocmcc_6_1_1_hip_openmp_mpi: variables: - SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.4.3 ^hip@5.4.3 ^blt@develop" - extends: .build_and_test_on_tioga + SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ${PROJECT_TIOGA_DEPS}" + extends: .job_on_tioga + allow_failure: true diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index 108e84a54..7e60a05e9 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -9,7 +9,7 @@ # The template job to test whether a machine is up. # Expects CI_MACHINE defined to machine name. .machine-check: - stage: machine-checks + stage: prerequisites tags: [shell, oslic] variables: GIT_STRATEGY: none @@ -30,6 +30,30 @@ # Comment the jobs for machines you don’t need. 
### +# One job to generate the job list for all the subpipelines +generate-job-lists: + stage: prerequisites + tags: [shell, oslic] + variables: + GIT_SUBMODULE_DEPTH: 2 + GIT_SUBMODULE_STRATEGY: recursive + GIT_SUBMODULE_PATHS: tpl/RAJA + RADIUSS_JOBS_PATH: "tpl/RAJA/scripts/radiuss-spack-configs/gitlab/radiuss-jobs" + LOCAL_JOBS_PATH: ".gitlab/jobs" + script: + - cat ${RADIUSS_JOBS_PATH}/ruby.yml ${LOCAL_JOBS_PATH}/ruby.yml > ruby-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/poodle.yml ${LOCAL_JOBS_PATH}/poodle.yml > poodle-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/lassen.yml ${LOCAL_JOBS_PATH}/lassen.yml > lassen-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/corona.yml ${LOCAL_JOBS_PATH}/corona.yml > corona-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/tioga.yml ${LOCAL_JOBS_PATH}/tioga.yml > tioga-jobs.yml + artifacts: + paths: + - ruby-jobs.yml + - poodle-jobs.yml + - lassen-jobs.yml + - corona-jobs.yml + - tioga-jobs.yml + # RUBY ruby-up-check: variables: @@ -39,7 +63,19 @@ ruby-up-check: ruby-build-and-test: variables: CI_MACHINE: "ruby" - needs: [ruby-up-check] + needs: [ruby-up-check, generate-job-lists] + extends: [.build-and-test] + +# POODLE +poodle-up-check: + variables: + CI_MACHINE: "poodle" + extends: [.machine-check] + +poodle-build-and-test: + variables: + CI_MACHINE: "poodle" + needs: [poodle-up-check, generate-job-lists] extends: [.build-and-test] # CORONA @@ -51,7 +87,7 @@ corona-up-check: corona-build-and-test: variables: CI_MACHINE: "corona" - needs: [corona-up-check] + needs: [corona-up-check, generate-job-lists] extends: [.build-and-test] # TIOGA @@ -63,7 +99,7 @@ tioga-up-check: tioga-build-and-test: variables: CI_MACHINE: "tioga" - needs: [tioga-up-check] + needs: [tioga-up-check, generate-job-lists] extends: [.build-and-test] # LASSEN @@ -75,7 +111,7 @@ lassen-up-check: lassen-build-and-test: variables: CI_MACHINE: "lassen" - needs: [lassen-up-check] + needs: [lassen-up-check, generate-job-lists] extends: [.build-and-test] diff --git a/.uberenv_config.json b/.uberenv_config.json index e2353e1c9..fda595d3a 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,7 +4,7 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_branch": "e4s-23.02", +"spack_branch": "develop-2024-05-26", "spack_activate" : {}, "spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", "spack_packages_path": "tpl/RAJA/scripts/radiuss-spack-configs/packages", diff --git a/CMakeLists.txt b/CMakeLists.txt index 812d339b0..b9d0bd3c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -16,7 +16,7 @@ else() endif() option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable -this, and all other variants, to run _only_ raw C loops." On) +this, and all other variants, to run _only_ base variants." 
On) option(ENABLE_KOKKOS "Include Kokkos implementations of the kernels in the RAJA Perfsuite" Off) # @@ -27,7 +27,7 @@ if (PERFSUITE_ENABLE_WARNINGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") endif() -if (ENABLE_KOKKOS) +if (ENABLE_KOKKOS OR ENABLE_SYCL) set(CMAKE_CXX_STANDARD 17) set(BLT_CXX_STD c++17) else() @@ -51,6 +51,12 @@ if (ENABLE_TESTS) endif() cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) +if (RAJA_PERFSUITE_ENABLE_MPI) +set(RAJA_PERFSUITE_NUM_MPI_TASKS 4 CACHE STRING "Number of MPI tasks in tests") +else() +set(RAJA_PERFSUITE_NUM_MPI_TASKS 0 CACHE INTERNAL "Number of MPI tasks in tests") +endif() +message(STATUS "Using RAJA_PERFSUITE_NUM_MPI_TASKS: ${RAJA_PERFSUITE_NUM_MPI_TASKS}") cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) @@ -67,12 +73,33 @@ set(ENABLE_TBB Off CACHE BOOL "") set(RAJA_USE_CHRONO On CACHE BOOL "") +set(RAJA_PERFSUITE_TUNING_CUDA_ARCH "0" CACHE STRING "CUDA arch to tune the execution for, ex '700' for sm_70") +set(RAJA_PERFSUITE_TUNING_HIP_ARCH "0" CACHE STRING "HIP arch to tune the execution for, ex '910' for gfx90a, '942' for gfx942") + set(RAJA_PERFSUITE_GPU_BLOCKSIZES "" CACHE STRING "Comma separated list of GPU block sizes, ex '256,1024'") +set(RAJA_PERFSUITE_ATOMIC_REPLICATIONS "" CACHE STRING "Comma separated list of atomic replications, ex '1,256,4096'") + +set(RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD "" CACHE STRING "Comma separated list of GPU items per thread, ex '1,2,4,8'") + set(RAJA_RANGE_ALIGN 4) set(RAJA_RANGE_MIN_LENGTH 32) set(RAJA_DATA_ALIGN 64) +string(LENGTH "${RAJA_PERFSUITE_TUNING_CUDA_ARCH}" CUDA_ARCH_LENGTH) +if (CUDA_ARCH_LENGTH GREATER 1) + message(STATUS "Using cuda tunings for arch: ${RAJA_PERFSUITE_TUNING_CUDA_ARCH}") +else() + message(STATUS "Using default cuda arch tunings") +endif() + +string(LENGTH "${RAJA_PERFSUITE_TUNING_HIP_ARCH}" HIP_ARCH_LENGTH) +if (HIP_ARCH_LENGTH GREATER 1) + message(STATUS "Using hip tunings for arch: ${RAJA_PERFSUITE_TUNING_HIP_ARCH}") +else() + message(STATUS "Using default hip arch tunings") +endif() + string(LENGTH "${RAJA_PERFSUITE_GPU_BLOCKSIZES}" BLOCKSIZES_LENGTH) if (BLOCKSIZES_LENGTH GREATER 0) message(STATUS "Using gpu block size(s): ${RAJA_PERFSUITE_GPU_BLOCKSIZES}") @@ -80,6 +107,20 @@ else() message(STATUS "Using default gpu block size(s)") endif() +string(LENGTH "${RAJA_PERFSUITE_ATOMIC_REPLICATIONS}" ATOMIC_REPLICATIONS_LENGTH) +if (ATOMIC_REPLICATIONS_LENGTH GREATER 0) + message(STATUS "Using atomic replication(s): ${RAJA_PERFSUITE_ATOMIC_REPLICATIONS}") +else() + message(STATUS "Using default atomic replication(s)") +endif() + +string(LENGTH "${RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD}" GPU_ITEMS_PER_THREAD_LENGTH) +if (GPU_ITEMS_PER_THREAD_LENGTH GREATER 0) + message(STATUS "Using gpu items per thread(s): ${RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD}") +else() + message(STATUS "Using default gpu items per thread(s)") +endif() + # exclude RAJA make targets from top-level build...
add_subdirectory(tpl/RAJA) @@ -95,8 +136,8 @@ if (ENABLE_OPENMP) add_definitions(-DRUN_OPENMP) endif () -set(RAJA_PERFSUITE_VERSION_MAJOR 2023) -set(RAJA_PERFSUITE_VERSION_MINOR 06) +set(RAJA_PERFSUITE_VERSION_MAJOR 2024) +set(RAJA_PERFSUITE_VERSION_MINOR 07) set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0) set(RAJA_PERFSUITE_DEPENDS RAJA) @@ -110,6 +151,9 @@ endif() if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) endif() +if (ENABLE_SYCL) + list(APPEND RAJA_PERFSUITE_DEPENDS sycl) +endif() # Kokkos requires hipcc as the CMAKE_CXX_COMPILER for HIP AMD/VEGA GPU # platforms, whereas RAJAPerf Suite uses blt/CMake FindHIP to set HIP compiler. diff --git a/Dockerfile b/Dockerfile index 9d7f6b197..b15e3c102 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ ############################################################################## -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -111,12 +111,12 @@ RUN . /opt/spack/share/spack/setup-env.sh && \ ## make -j 6 && \ ## cd .. && rm -rf build -FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-23.04:intel-2023.2.1 AS sycl ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 -DENABLE_TESTS=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp" && \ + ./bin/raja-perf.exe --checkrun --exclude-variants Base_SYCL RAJA_SYCL -sp" && \ cd .. && rm -rf build diff --git a/LICENSE b/LICENSE index 039a20b01..27c1ef431 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2017-2023, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index bf2eee850..04aeea048 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [comment]: # (#################################################################) -[comment]: # (Copyright 2017-23, Lawrence Livermore National Security, LLC) +[comment]: # (Copyright 2017-24, Lawrence Livermore National Security, LLC) [comment]: # (and RAJA Performance Suite project contributors.) [comment]: # (See the RAJAPerf/LICENSE file for details.) [comment]: # diff --git a/RELEASE b/RELEASE index 4b8dcac50..61fc02251 100644 --- a/RELEASE +++ b/RELEASE @@ -2,7 +2,7 @@ RAJA Performance Suite -Copyright (c) 2017-23, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-24, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. All rights reserved. See details in the RAJAPerf/LICENSE file. diff --git a/TODO/WIP-COUPLE.cpp b/TODO/WIP-COUPLE.cpp index 2e7c70197..6f0feeed8 100644 --- a/TODO/WIP-COUPLE.cpp +++ b/TODO/WIP-COUPLE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -110,7 +110,7 @@ void COUPLE::runKernel(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( + RAJA::forall( RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { COUPLE_BODY; }); diff --git a/TODO/WIP-COUPLE.hpp b/TODO/WIP-COUPLE.hpp index 33faa85cc..bf29503f3 100644 --- a/TODO/WIP-COUPLE.hpp +++ b/TODO/WIP-COUPLE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/azure-pipelines.yml b/azure-pipelines.yml index da8637d19..41f9c0cd7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -56,8 +56,8 @@ jobs: ## docker_target: nvcc11.1.1-debug ## hip5.1.3: ## docker_target: hip5.1.3 -## sycl: -## docker_target: sycl + sycl: + docker_target: sycl pool: vmImage: 'ubuntu-latest' variables: diff --git a/blt b/blt index 5a792c177..9ff77344f 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 5a792c1775e7a7628d84dcde31652a689f1df7b5 +Subproject commit 9ff77344f0b2a6ee345e452bddd6bfd46cbbfa35 diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index ac86f5bcc..9b4df01d6 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/docs/conf.py b/docs/conf.py index 6673fa10f..ee3729c8f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,16 +79,16 @@ # General information about the project. project = u'RAJAPerf' -copyright = u'2017-2023, Lawrence Livermore National Security, LLNS' +copyright = u'2017-2024, Lawrence Livermore National Security, LLNS' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u'2022.10' +version = u'2024.07' # The full version, including alpha/beta/rc tags. -release = u'2022.10.0' +release = u'2024.07.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index 12ec445a5..438c89f82 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/requirements.txt b/docs/requirements.txt index 6b1d35172..0a42ee80c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,3 @@ -docutils<0.20 +docutils +sphinx==6.2.1 +sphinx-rtd-theme==1.2.2 diff --git a/docs/sphinx/dev_guide/branch_development.rst b/docs/sphinx/dev_guide/branch_development.rst index 8d2e04437..318076584 100644 --- a/docs/sphinx/dev_guide/branch_development.rst +++ b/docs/sphinx/dev_guide/branch_development.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. 
## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/build_configurations.rst b/docs/sphinx/dev_guide/build_configurations.rst index 7ce70decf..4972d85a2 100644 --- a/docs/sphinx/dev_guide/build_configurations.rst +++ b/docs/sphinx/dev_guide/build_configurations.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst index 231b00ee3..1fdd1a55f 100644 --- a/docs/sphinx/dev_guide/ci.rst +++ b/docs/sphinx/dev_guide/ci.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst index 74f86d3cd..bdac32a30 100644 --- a/docs/sphinx/dev_guide/contributing.rst +++ b/docs/sphinx/dev_guide/contributing.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index c2c976ff3..d04aa25ab 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/kernel_class.rst b/docs/sphinx/dev_guide/kernel_class.rst index 5d544dd68..015b7592f 100644 --- a/docs/sphinx/dev_guide/kernel_class.rst +++ b/docs/sphinx/dev_guide/kernel_class.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 38d8274a0..05271dd8e 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/release_process.rst b/docs/sphinx/dev_guide/release_process.rst index 8b1942758..0542ec08e 100644 --- a/docs/sphinx/dev_guide/release_process.rst +++ b/docs/sphinx/dev_guide/release_process.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. 
## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/structure.rst b/docs/sphinx/dev_guide/structure.rst index 5c25ef2a2..bc11f9941 100644 --- a/docs/sphinx/dev_guide/structure.rst +++ b/docs/sphinx/dev_guide/structure.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/rajaperf_license.rst b/docs/sphinx/rajaperf_license.rst index a7985861f..5233fff7b 100644 --- a/docs/sphinx/rajaperf_license.rst +++ b/docs/sphinx/rajaperf_license.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## @@ -12,7 +12,7 @@ RAJA Performance Suite Copyright and License Information ========================================================== -Copyright (c) 2017-23, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-24, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. diff --git a/docs/sphinx/user_guide/CMakeLists.txt b/docs/sphinx/user_guide/CMakeLists.txt index 912f38a7a..e084390e8 100644 --- a/docs/sphinx/user_guide/CMakeLists.txt +++ b/docs/sphinx/user_guide/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA erformance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/docs/sphinx/user_guide/build.rst b/docs/sphinx/user_guide/build.rst index 082fb9f4e..372f495e9 100644 --- a/docs/sphinx/user_guide/build.rst +++ b/docs/sphinx/user_guide/build.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## @@ -201,7 +201,7 @@ multiple versions of GPU kernels that will run with different GPU thread-block sizes. The CMake option for this is ``-DRAJA_PERFSUITE_GPU_BLOCKSIZES=``. For example:: - $ mkdir my-gnu-build + $ mkdir my-gpu-build $ cd my-gpu-build $ cmake \ -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ @@ -211,6 +211,41 @@ sizes. The CMake option for this is will build versions of GPU kernels that use 64, 128, 256, 512, and 1024 threads per GPU thread-block. +Building with specific GPU atomic replication tunings +----------------------------------------------------- + +If desired, you can build a version of the RAJA Performance Suite code with +multiple versions of GPU kernels that will run with different GPU atomic +replication amounts. The CMake option for this is +``-DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=``. For example:: + + $ mkdir my-gpu-build + $ cd my-gpu-build + $ cmake \ + -DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=1,256,4096 \ + .. + $ make -j + +will build versions of GPU kernels that use 1, 256, and 4096 atomic +replications. 
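+ +These tuning options are independent CMake cache variables, so they can be +combined in a single configure step; a minimal sketch (the values shown are +illustrative only, not recommendations):: + + $ cmake \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=256,1024 \ + -DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=1,256,4096 \ + .. + $ make -j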
+ +Building with specific GPU items per thread tunings +----------------------------------------------------- + +If desired, you can build a version of the RAJA Performance Suite code with +multiple versions of GPU kernels that will run with different GPU items per +thread amounts. The CMake option for this is +``-DRAJA_PERFSUITE_GPU_ITEMS_PER_THREAD=``. For example:: + + $ mkdir my-gpu-build + $ cd my-gpu-build + $ cmake \ + -DRAJA_PERFSUITE_GPU_ITEMS_PER_THREAD=1,2,4,8 \ + .. + $ make -j + +will build versions of GPU kernels that use 1, 2, 4, and 8 items per thread. + Building with Caliper --------------------- diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index 0bd7d5570..33475a6b9 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst index 3d0879278..2af530e9a 100644 --- a/docs/sphinx/user_guide/output.rst +++ b/docs/sphinx/user_guide/output.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index 19a8917bd..083263d61 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh new file mode 100755 index 000000000..f002631f3 --- /dev/null +++ b/scripts/alcf-builds/sycl.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + + +BUILD_SUFFIX=sycl +: ${BUILD_TYPE:=RelWithDebInfo} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/alcf-builds/sycl.cmake + +rm -rf build_${BUILD_SUFFIX}_${USER} >/dev/null +mkdir build_${BUILD_SUFFIX}_${USER} && cd build_${BUILD_SUFFIX}_${USER} + +cmake \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_OPENMP=Off \ + -DENABLE_CUDA=Off \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ + -DENABLE_TARGET_OPENMP=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DENABLE_SYCL=On \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_LINKER=icpx \ + "$@" \ + .. 
+ +make -j 18 diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index de837bed9..430d50b4b 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -7,7 +7,7 @@ then fi ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC and RAJA +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC and RAJA # project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -21,17 +21,26 @@ hostname="$(hostname)" truehostname=${hostname//[0-9]/} project_dir="$(pwd)" -build_root=${BUILD_ROOT:-""} hostconfig=${HOST_CONFIG:-""} spec=${SPEC:-""} +module_list=${MODULE_LIST:-""} job_unique_id=${CI_JOB_ID:-""} +use_dev_shm=${USE_DEV_SHM:-true} + raja_version=${UPDATE_RAJA:-""} sys_type=${SYS_TYPE:-""} -use_dev_shm=${USE_DEV_SHM:-true} spack_upstream_path=${SPACK_UPSTREAM_PATH:-"/usr/workspace/umdev/RAJAPerf/upstream"} update_spack_upstream=${UPDATE_SPACK_UPSTREAM:-false} +if [[ -n ${module_list} ]] +then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Modules to load: ${module_list}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + module load ${module_list} +fi + prefix="" if [[ ${update_spack_upstream} == true ]] @@ -55,8 +64,9 @@ then prefix="${prefix}-${job_unique_id}" mkdir -p ${prefix} else - prefix="spack-and-build-root" - mkdir ${prefix} + # We set the prefix in the parent directory so that spack dependencies are not installed inside the source tree. + prefix="$(pwd)/../spack-and-build-root" + mkdir -p ${prefix} fi # Dependencies @@ -131,17 +141,8 @@ fi hostconfig=$(basename ${hostconfig_path}) # Build Directory -if [[ -z ${build_root} ]] -then - if [[ -d /dev/shm && ${use_dev_shm} == true ]] - then - build_root="${prefix}" - else - build_root="$(pwd)" - fi -else - build_root="${build_root}" -fi +# When using /dev/shm, we use prefix for both spack builds and source build, unless BUILD_ROOT was defined +build_root=${BUILD_ROOT:-"${prefix}"} build_dir="${build_root}/build_${hostconfig//.cmake/}" @@ -162,7 +163,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" # Map CPU core allocations - declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32) + declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["poodle"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32) # If using Multi-project, set up the submodule if [[ -n ${raja_version} ]] @@ -186,25 +187,34 @@ then rm -rf ${build_dir} 2>/dev/null mkdir -p ${build_dir} && cd ${build_dir} - date + # We set the MPI tests command to allow overlapping. + # Shared allocation: Allows build_and_test.sh to run within a sub-allocation (see CI config). + # Use /dev/shm: Prevent MPI tests from running on a node where the build dir doesn't exist. + cmake_options="" + if [[ "${truehostname}" == "ruby" || "${truehostname}" == "poodle" ]] + then + cmake_options="-DBLT_MPI_COMMAND_APPEND:STRING=--overlap" + fi + date if [[ "${truehostname}" == "corona" || "${truehostname}" == "tioga" ]] then module unload rocm fi $cmake_exe \ -C ${hostconfig_path} \ + ${cmake_options} \ ${project_dir} if ! $cmake_exe --build . -j ${core_counts[$truehostname]} then echo "ERROR: compilation failed, building with verbose output..." $cmake_exe --build . 
--verbose -j 1 fi + date echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ RAJA Perf Suite Built" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - date fi if [[ ! -d ${build_dir} ]] @@ -214,6 +224,7 @@ fi cd ${build_dir} +date echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ TESTING RAJAPERF SUITE" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" @@ -226,33 +237,12 @@ then # in case we want to make them disctinct in the future. # - if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - then - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun -n1 ... ctest --output-on-failure -T test" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test - else - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun -n1 ... ctest --output-on-failure -T test" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test - fi - else - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - then - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - ctest --output-on-failure -T test 2>&1 | tee tests_output.txt - else - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - ctest --output-on-failure -T test 2>&1 | tee tests_output.txt - fi - fi + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + date + ctest --output-on-failure -T test 2>&1 | tee tests_output.txt + date no_test_str="No tests were found!!!" if [[ "$(tail -n 1 tests_output.txt)" == "${no_test_str}" ]] diff --git a/scripts/install_llvm.sh b/scripts/install_llvm.sh index f1a16dcfa..b264f59de 100755 --- a/scripts/install_llvm.sh +++ b/scripts/install_llvm.sh @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index a6fc06451..15fde9bf1 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -1,17 +1,17 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_clang.sh 11.0.1" - echo " -or - " + echo " - or - " echo " blueos_clang.sh ibm-10.0.1-gcc-8.3.1" exit fi diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index 2f7fdf5e9..67ffdcf91 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -1,17 +1,17 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_clang_omptarget.sh 10.0.1-gcc-8.3.1" - echo " - or - " + echo " - or -" echo " blueos_clang_omptarget.sh ibm-10.0.1-gcc-8.3.1" exit fi diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index b51ad749a..fe71ddf77 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_gcc.sh 8.3.1" diff --git a/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh new file mode 100755 index 000000000..14118494a --- /dev/null +++ b/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 5 ]]; then + echo + echo "You must pass 5 arguments to the script (in this order): " + echo " 1) compiler version number for nvcc" + echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)" + echo " 3) compiler version number for clang. 
" + echo " 4) path to caliper cmake directory" + echo " 5) path to adiak cmake directory" + echo + echo "For example: " + echo " blueos_nvcc_clang-mpi_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak" + exit +fi + +COMP_NVCC_VER=$1 +COMP_ARCH=$2 +COMP_CLANG_VER=$3 +CALI_DIR=$4 +ADIAK_DIR=$5 +shift 5 + +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-mpi-${COMP_CLANG_VER}-caliper +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.20.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=128,256,512,1024 \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to disable CUDA GPU hooks when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -1 --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 9801459b9..59b74d923 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_CLANG_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh index b121d68c2..238b9a30e 100755 --- a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh +++ b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 5 ]]; then +if [[ $# -lt 5 ]]; then echo echo "You must pass 5 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" @@ -17,7 +17,7 @@ if [[ $# -ne 5 ]]; then echo " 5) path to adiak cmake directory" echo echo "For example: " - echo " blueos_nvcc_clang_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/adiak-lassen/lib/cmake/adiak" + echo " blueos_nvcc_clang_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak" exit fi @@ -28,7 +28,7 @@ CALI_DIR=$4 ADIAK_DIR=$5 shift 5 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo @@ -49,6 +49,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh b/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh new file mode 100755 index 000000000..9fdcdb3a7 --- /dev/null +++ b/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 5 ]]; then + echo + echo "You must pass 5 arguments to the script (in this order): " + echo " 1) compiler version number for nvcc" + echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)" + echo " 3) compiler version number for gcc" + echo " 4) path to caliper cmake directory" + echo " 5) path to adiak cmake directory" + echo + echo "For example: " + echo " blueos_nvcc_gcc-mpi_caliper.sh 10.2.89 70 8.3.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak" + exit +fi + +COMP_NVCC_VER=$1 +COMP_ARCH=$2 +COMP_GCC_VER=$3 +CALI_DIR=$4 +ADIAK_DIR=$5 +shift 5 + +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER}-mpi-caliper +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.20.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=128,256,512,1024 \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to disable CUDA GPU hooks when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -1 --smpiargs=\"-disable_gpu_hooks\" ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 200e86f9b..d1e24fdac 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details.
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_GCC_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake echo @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 9f2489694..1950dcadc 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_XL_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-xl${COMP_XL_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-xl-${COMP_XL_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_xl_X.cmake echo @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index c715d1c25..09e192fa5 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_pgi.sh 21.1" diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh index 631f8ef5c..d8a718229 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -26,7 +26,7 @@ COMP_ARCH=$3 COMP_CLANG_VER=$4 shift 4 -BUILD_SUFFIX=lc_blueos-spectrum${COMP_MPI_VER}-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-spectrum-${COMP_MPI_VER}-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo @@ -49,6 +49,7 @@ cmake \ -DENABLE_MPI=On \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh b/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh new file mode 100755 index 000000000..dd71dcc62 --- /dev/null +++ b/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 4 ]]; then + echo + echo "You must pass 4 arguments to the script (in this order): " + echo " 1) compiler version number for spectrum mpi" + echo " 2) compiler version number for nvcc (number only, not 'sm_70' for example)" + echo " 3) CUDA compute architecture" + echo " 4) compiler version number for gcc. 
" + echo + echo "For example: " + echo " blueos_spectrum_nvcc_gcc.sh rolling-release 10.2.89 70 8.3.1" + exit +fi + +COMP_MPI_VER=$1 +COMP_NVCC_VER=$2 +COMP_ARCH=$3 +COMP_GCC_VER=$4 +shift 4 + +BUILD_SUFFIX=lc_blueos-spectrum-${COMP_MPI_VER}-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.20.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_CXX_COMPILER=/usr/tce/packages/spectrum-mpi/spectrum-mpi-${COMP_MPI_VER}-gcc-${COMP_GCC_VER}/bin/mpig++ \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=On \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -n4 ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 5d30ab1ea..9729db57e 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_xl.sh 2021.03.31" diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 5f972f0dc..559c59900 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_xl_omptarget.sh 2022.08.19" diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh new file mode 100755 index 000000000..6dbeb9ee5 --- /dev/null +++ b/scripts/lc-builds/corona_sycl.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 1 ]]; then + echo + echo "You must pass 1 argument to the script: " + echo " 1) SYCL compiler installation path" + echo + echo "For example: " + echo " corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1" + exit +fi + +SYCL_PATH=$1 +shift 1 + +BUILD_SUFFIX=corona-sycl +: ${BUILD_TYPE:=RelWithDebInfo} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/corona_sycl.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX}_${USER} >/dev/null +mkdir build_${BUILD_SUFFIX}_${USER} && cd build_${BUILD_SUFFIX}_${USER} + +DATE=$(printf '%(%Y-%m-%d)T\n' -1) + +export PATH=${SYCL_PATH}/bin:$PATH +export LD_LIBRARY_PATH=${SYCL_PATH}/lib:${SYCL_PATH}/lib64:$LD_LIBRARY_PATH + +## NOTE: RAJA tests are turned off due to compilation issues. + +cmake \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DSYCL_LIB_PATH:STRING="${SYCL_PATH}/lib" \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_OPENMP=Off \ + -DENABLE_CUDA=Off \ + -DRAJA_ENABLE_TARGET_OPENMP=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DRAJA_ENABLE_SYCL=On \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_LINKER=clang++ \ + -DBLT_CXX_STD=c++17 \ + -DENABLE_TESTS=On \ + -DENABLE_EXAMPLES=On \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX}_${USER} and run make to build RAJA" +echo +echo "To run RAJA tests, exercises, etc. with the build, please do the following:" +echo +echo " 1) Load the ROCm module version matching the version in the compiler path" +echo " you passed to this script." +echo +echo " 2) Prefix the LD_LIBRARY_PATH environment variable with " +echo " SYCL_PATH/lib:SYCL_PATH/lib64" +echo +echo " where SYCL_PATH is set to the compiler installation path you passed" +echo " to this script (using the proper command for your shell)." +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_hipcc.sh b/scripts/lc-builds/toss3_hipcc.sh deleted file mode 100755 index b5d9b2760..000000000 --- a/scripts/lc-builds/toss3_hipcc.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env bash - -############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
-# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### - -if [[ $# -ne 2 ]]; then - echo - echo "You must pass 2 arguments to the script (in this order): " - echo " 1) compiler version number" - echo " 2) HIP compute architecture" - echo - echo "For example: " - echo " toss3_hipcc.sh 5.1.0 gfx906" - exit -fi - -COMP_VER=$1 -COMP_ARCH=$2 -shift 2 - -HIP_CLANG_FLAGS="--offload-arch=${COMP_ARCH}" -HOSTCONFIG="hip_3_X" - -if [[ ${COMP_VER} == 4.5.* ]] -then - HIP_CLANG_FLAGS="${HIP_CLANG_FLAGS} -mllvm -amdgpu-fixed-function-abi=1" - HOSTCONFIG="hip_4_5_link_X" -elif [[ ${COMP_VER} == 4.* ]] -then - HOSTCONFIG="hip_4_link_X" -elif [[ ${COMP_VER} == 3.* ]] -then - HOSTCONFIG="hip_3_X" -else - echo "Unknown hip version, using ${HOSTCONFIG} host-config" -fi - -BUILD_SUFFIX=lc_toss3-hipcc-${COMP_VER}-${COMP_ARCH} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/hip_link_X.cmake - -echo -echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" -echo "Configuration extra arguments:" -echo " $@" -echo - -rm -rf build_${BUILD_SUFFIX} >/dev/null -mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} - - -module load cmake/3.23.1 - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ - -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ - -DHIP_CLANG_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ - -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang \ - -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang++ \ - -DHIP_CLANG_FLAGS="${HIP_CLANG_FLAGS}" \ - -DBLT_CXX_STD=c++14 \ - -C ${RAJA_HOSTCONFIG} \ - -DENABLE_HIP=ON \ - -DENABLE_OPENMP=OFF \ - -DENABLE_CUDA=OFF \ - -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ - "$@" \ - .. - -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_pgi.sh b/scripts/lc-builds/toss3_pgi.sh deleted file mode 100755 index 9967dd769..000000000 --- a/scripts/lc-builds/toss3_pgi.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJAPERF/COPYRIGHT file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### - -if [ "$1" == "" ]; then - echo - echo "You must pass a compiler version number to script. For example," - echo " toss3_pgi.sh 20.1" - exit -fi - -COMP_VER=$1 -shift 1 - -BUILD_SUFFIX=lc_toss3-pgi-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/pgi_X.cmake - -echo -echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" -echo "Configuration extra arguments:" -echo " $@" -echo - -rm -rf build_${BUILD_SUFFIX} 2>/dev/null -mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} - -module load cmake/3.20.2 - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgc++ \ - -DCMAKE_C_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgcc \ - -DBLT_CXX_STD=c++14 \ - -C ${RAJA_HOSTCONFIG} \ - -DENABLE_OPENMP=On \ - -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ - "$@" \ - .. 
- -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 7d2de5397..c571e568d 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -15,7 +15,7 @@ if [[ $# -lt 2 ]]; then echo " 3...) optional arguments to cmake" echo echo "For example: " - echo " toss4_amdclang.sh 5.1.0 gfx906" + echo " toss4_amdclang.sh 5.7.0 gfx906" exit fi @@ -44,6 +44,12 @@ echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in i echo "Configuration extra arguments:" echo " $@" echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo rm -rf build_${BUILD_SUFFIX} >/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} @@ -53,23 +59,28 @@ module load cmake/3.23.1 # unload rocm to avoid configuration problems where the loaded rocm and COMP_VER # are inconsistent causing the rocprim from the module to be used unexpectedly -module unload rocm +# module unload rocm +if [[ ${COMP_VER} =~ .*magic.* ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" +else + ROCM_PATH="/usr/tce/packages/rocmcc-tce/rocmcc-${COMP_VER}" +fi cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ - -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ - -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ - -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ - -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH=${ROCM_PATH}/llvm/bin \ + -DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/amdclang \ + -DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/amdclang++ \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ - -DENABLE_OPENMP=OFF \ + -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ @@ -78,7 +89,7 @@ cmake \ echo echo "***********************************************************************" echo -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJAPerf" echo echo " Please note that you have to have a consistent build environment" echo " when you make RAJA as cmake may reconfigure; unload the rocm module" diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh new file mode 100755 index 000000000..015416e8e --- /dev/null +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, 
Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP compute architecture" + echo " 3...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_amdclang_asan.sh 5.7.0 gfx90a" + exit +fi + +COMP_VER=$1 +COMP_ARCH=$2 +shift 2 + +HOSTCONFIG="hip_3_X" + +if [[ ${COMP_VER} == 4.* ]] +then +##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1" + HOSTCONFIG="hip_4_link_X" +elif [[ ${COMP_VER} == 3.* ]] +then + HOSTCONFIG="hip_3_X" +else + echo "Unknown hip version, using ${HOSTCONFIG} host-config" +fi + +BUILD_SUFFIX=lc_toss4-amdclang-${COMP_VER}-${COMP_ARCH}-asan +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.23.1 + +# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER +# are inconsistent causing the rocprim from the module to be used unexpectedly +# module unload rocm + +if [[ ${COMP_VER} =~ .*magic.* ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" +else + ROCM_PATH="/usr/tce/packages/rocmcc-tce/rocmcc-${COMP_VER}" +fi + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH=${ROCM_PATH}/llvm/bin \ + -DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/amdclang \ + -DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/amdclang++ \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}:xnack+" \ + -DGPU_TARGETS="${COMP_ARCH}:xnack+" \ + -DAMDGPU_TARGETS="${COMP_ARCH}:xnack+" \ + -DCMAKE_C_FLAGS="-fsanitize=address -shared-libsan" \ + -DCMAKE_CXX_FLAGS="-fsanitize=address -shared-libsan" \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=ON \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJAPerf" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; load the appropriate" +echo " rocm and rocmcc modules (${COMP_VER}) when building." 
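An aside on the ROCM_PATH selection that both toss4 amdclang scripts above now share: the regex test keys off a "magic" suffix in the compiler version string to choose between the rocmcc and rocmcc-tce package trees. A minimal sketch of how the branch resolves, assuming a hypothetical version string:

    # Illustrative only: how the ROCM_PATH branch above resolves.
    COMP_VER="5.7.0-magic"   # hypothetical version string
    if [[ ${COMP_VER} =~ .*magic.* ]]; then
      ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}"
    else
      ROCM_PATH="/usr/tce/packages/rocmcc-tce/rocmcc-${COMP_VER}"
    fi
    echo "${ROCM_PATH}"   # -> /usr/tce/packages/rocmcc/rocmcc-5.7.0-magic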
+echo +echo " module load rocm/COMP_VER rocmcc/COMP_VER" +echo " srun -n1 make" +echo +echo " Run with these environment options when using asan" +echo " ASAN_OPTIONS=print_suppressions=0:detect_leaks=0" +echo " HSA_XNACK=1" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh new file mode 100755 index 000000000..072443ff8 --- /dev/null +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 3 ]]; then + echo + echo "You must pass 3 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP version" + echo " 3) HIP compute architecture" + echo " 4...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_cce_hip.sh 14.0.3 5.2.3 gfx90a" + exit +fi + +COMP_VER=$1 +HIP_VER=$2 +HIP_ARCH=$3 +shift 3 + +HOSTCONFIG="hip_3_X" + +BUILD_SUFFIX=lc_toss4-cce-${COMP_VER}-hip-${HIP_VER}-${HIP_ARCH} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.24.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER="/usr/tce/packages/cce-tce/cce-${COMP_VER}/bin/craycc" \ + -DCMAKE_CXX_COMPILER="/usr/tce/packages/cce-tce/cce-${COMP_VER}/bin/crayCC" \ + -DHIP_PATH=/opt/rocm-${HIP_VER}/hip \ + -DCMAKE_HIP_ARCHITECTURES=${HIP_ARCH} \ + -DGPU_TARGETS=${HIP_ARCH} \ + -DAMDGPU_TARGETS=${HIP_ARCH} \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=ON \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; load the appropriate" +echo " cce module (${COMP_VER}) when building." +echo +echo " module load cce-tce/${COMP_VER}" +echo " srun -n1 make" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_clang-mpi_caliper.sh b/scripts/lc-builds/toss4_clang-mpi_caliper.sh new file mode 100755 index 000000000..d3f4eb4bf --- /dev/null +++ b/scripts/lc-builds/toss4_clang-mpi_caliper.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 3 ]]; then + echo + echo "You must pass 3 arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) path to caliper cmake directory" + echo " 3) path to adiak cmake directory" + echo + echo "For example: " + echo " toss4_clang-mpi_caliper.sh 14.0.6 /usr/workspace/wsb/asde/caliper-quartz/share/cmake/caliper /usr/workspace/wsb/asde/caliper-quartz/lib/cmake/adiak" + exit +fi + +COMP_VER=$1 +CALI_DIR=$2 +ADIAK_DIR=$3 +shift 3 + +BUILD_SUFFIX=lc_toss4-clang-mpi-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DCMAKE_C_FLAGS="-g -O0" \ + -DCMAKE_CXX_FLAGS="-g -O0" \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_clang.sh b/scripts/lc-builds/toss4_clang.sh similarity index 65% rename from scripts/lc-builds/toss3_clang.sh rename to scripts/lc-builds/toss4_clang.sh index 7406363bc..64b11c012 100755 --- a/scripts/lc-builds/toss3_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -1,24 +1,24 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," - echo " toss3_clang.sh 10.0.1" + echo " toss4_clang.sh 10.3.1" exit fi COMP_VER=$1 shift 1 -BUILD_SUFFIX=lc_toss3-clang-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake +BUILD_SUFFIX=lc_toss4-clang-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -29,7 +29,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ @@ -40,8 +40,3 @@ cmake \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
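For orientation, a typical end-to-end use of the new toss4_clang-mpi_caliper.sh script might look like the sketch below; the Caliper and Adiak install prefixes are placeholders, and CALI_CONFIG=runtime-report is ordinary Caliper usage rather than something this patch adds:

    # Illustrative only: configure, build, and run with a Caliper report.
    ./scripts/lc-builds/toss4_clang-mpi_caliper.sh 14.0.6 \
        ${CALIPER_PREFIX}/share/cmake/caliper ${ADIAK_PREFIX}/lib/cmake/adiak
    cd build_lc_toss4-clang-mpi-14.0.6 && make -j
    CALI_CONFIG=runtime-report srun -n2 ./bin/raja-perf.exe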
- -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh index 273390561..89ece7b23 100755 --- a/scripts/lc-builds/toss4_clang_caliper.sh +++ b/scripts/lc-builds/toss4_clang_caliper.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number" @@ -25,7 +25,7 @@ ADIAK_DIR=$3 shift 3 BUILD_SUFFIX=lc_toss4-clang-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -36,7 +36,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.21.1 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 614f2caec..db9cafa5c 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) @@ -52,6 +52,25 @@ echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in i echo "Configuration extra arguments:" echo " $@" echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo +echo "To work around some issues where *_FUSED kernels crash add these options" +echo " -DCMAKE_CXX_FLAGS=\"-fgpu-rdc\"" +echo " -DCMAKE_EXE_LINKER_FLAGS=\"-fgpu-rdc\"" +echo +echo "To work around some issues where *_FUSED kernels perform poorly use this environment variable" +echo " env HSA_SCRATCH_SINGLE_LIMIT=4000000000" +echo +echo "To work around some issues where the build fails with a weird error about max or fmax add these options" +echo " -DCMAKE_CXX_FLAGS=\"--hip-version={hip_version:ex=6.1.2}\"" +echo " -DCMAKE_EXE_LINKER_FLAGS=\"--hip-version={hip_version:ex=6.1.2}\"" +echo + + rm -rf build_${BUILD_SUFFIX} >/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} @@ -61,18 +80,27 @@ module load cmake/3.23.1 # unload rocm to avoid configuration problems where the loaded rocm and COMP_VER # are inconsistent causing the rocprim from the module to be used unexpectedly -module unload rocm +module unload rocm rocmcc +if [[ "${COMP_VER}" == *-magic ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" + MPI_ROCM_PATH="/usr/tce/packages/cray-mpich/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}" +else + ROCM_PATH="/opt/rocm-${COMP_VER}" + MPI_ROCM_PATH=/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER} +fi cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DMPI_C_COMPILER="/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}/bin/mpiamdclang" \ - -DMPI_CXX_COMPILER="/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}/bin/mpiamdclang++" \ - -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ - -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ - -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ - -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ - -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ + -DMPI_C_COMPILER="${MPI_ROCM_PATH}/bin/mpiamdclang" \ + -DMPI_CXX_COMPILER="${MPI_ROCM_PATH}/bin/mpiamdclang++" \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/lib/cmake" \ + -DHIP_PLATFORM=amd \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH="${ROCM_PATH}/llvm/bin" \ + -DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/amdclang" \ + -DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/amdclang++" \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ @@ -98,10 +126,10 @@ echo echo " module unload rocm" echo " srun -n1 make" echo -echo " Please note that cray-mpich requires libmodules.so.1 from cce to run." +echo " Please note that rocm requires libpgmath.so from rocm/llvm to run." echo " Until this is handled transparently in the build system you may add " -echo " cce to your LD_LIBRARY_PATH." +echo " rocm/llvm to your LD_LIBRARY_PATH." 
echo -echo " export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/cce-tce/cce-13.0.2/cce/x86_64/lib/" +echo " export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm-${COMP_VER}/llvm/lib" echo echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_mvapich2_gcc.sh b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh similarity index 54% rename from scripts/lc-builds/toss3_mvapich2_gcc.sh rename to scripts/lc-builds/toss4_gcc-mpi_caliper.sh index 8c9e0662c..62389ea73 100755 --- a/scripts/lc-builds/toss3_mvapich2_gcc.sh +++ b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh @@ -1,25 +1,31 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 3 ]]; then echo - echo "You must pass a compiler version number to script. For example," - echo " toss3_mvapich2_gcc.sh 2.3 10.2.1" + echo "You must pass 3 arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) path to caliper cmake directory" + echo " 3) path to adiak cmake directory" + echo + echo "For example: " + echo " toss4_gcc-mpi_caliper.sh 10.3.1 /usr/workspace/wsb/asde/caliper-quartz/share/cmake/caliper /usr/workspace/wsb/asde/caliper-quartz/lib/cmake/adiak" exit fi -MPI_VER=$1 -COMP_VER=$2 -shift 2 +COMP_VER=$1 +CALI_DIR=$2 +ADIAK_DIR=$3 +shift 3 -BUILD_SUFFIX=lc_toss3-mvapich2-${MPI_VER}-gcc-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake +BUILD_SUFFIX=lc_toss4-gcc-mpi-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/gcc_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -30,28 +36,26 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DMPI_CXX_COMPILER=/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-gcc-${COMP_VER}/bin/mpic++ \ + -DCMAKE_C_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/gcc \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ - -DENABLE_MPI=On \ + -DENABLE_MPI=ON \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DCMAKE_C_FLAGS="-g -O0" \ + -DCMAKE_CXX_FLAGS="-g -O0" \ "$@" \ .. 
echo echo "***********************************************************************" -echo echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo -echo " Please note that you have to run with mpi when you run" -echo " the RAJA Perf Suite; for example," -echo -echo " srun -n2 ./bin/raja-perf.exe" -echo echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_gcc.sh b/scripts/lc-builds/toss4_gcc.sh similarity index 65% rename from scripts/lc-builds/toss3_gcc.sh rename to scripts/lc-builds/toss4_gcc.sh index 4e7bf6bc1..1d0a98af7 100755 --- a/scripts/lc-builds/toss3_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -1,24 +1,24 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," - echo " toss3_gcc.sh 8.3.1" + echo " toss4_gcc.sh 10.3.1" exit fi COMP_VER=$1 shift 1 -BUILD_SUFFIX=lc_toss3-gcc-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake +BUILD_SUFFIX=lc_toss4-gcc-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/gcc_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -29,7 +29,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ @@ -40,8 +40,3 @@ cmake \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. - -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_gcc_caliper.sh b/scripts/lc-builds/toss4_gcc_caliper.sh index 11fd22605..dad854b59 100755 --- a/scripts/lc-builds/toss4_gcc_caliper.sh +++ b/scripts/lc-builds/toss4_gcc_caliper.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number" @@ -36,7 +36,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.21.1 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh new file mode 100755 index 000000000..71642e1f1 --- /dev/null +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP compute architecture" + echo " 3...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_hipcc.sh 4.1.0 gfx906" + exit +fi + +COMP_VER=$1 +COMP_ARCH=$2 +shift 2 + +HOSTCONFIG="hip_3_X" + +if [[ ${COMP_VER} == 4.* ]] +then +##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1" + HOSTCONFIG="hip_4_link_X" +elif [[ ${COMP_VER} == 3.* ]] +then + HOSTCONFIG="hip_3_X" +else + echo "Unknown hip version, using ${HOSTCONFIG} host-config" +fi + +BUILD_SUFFIX=lc_toss4-hipcc-${COMP_VER}-${COMP_ARCH} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.23.1 + +# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER +# are inconsistent causing the rocprim from the module to be used unexpectedly +module unload rocm + + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ + -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ + -DHIP_PATH=/opt/rocm-${COMP_VER}/bin \ + -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \ + -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ + -DGPU_TARGETS="${COMP_ARCH}" \ + -DAMDGPU_TARGETS="${COMP_ARCH}" \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=ON \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; unload the rocm module" +echo " or load the appropriate rocm module (${COMP_VER}) when building."
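Stepping back to the version switch near the top of toss4_hipcc.sh: the fall-through default means an unrecognized ROCm version still gets the hip_3_X host-config, just with a warning. A small runnable sketch of the selection for a few sample (hypothetical) versions:

    # Illustrative only: replicate the script's host-config selection.
    for v in 3.9.0 4.1.0 5.7.0; do
      HC="hip_3_X"
      if [[ ${v} == 4.* ]]; then
        HC="hip_4_link_X"
      elif [[ ${v} == 3.* ]]; then
        HC="hip_3_X"
      else
        echo "Unknown hip version, using ${HC} host-config"
      fi
      echo "${v} -> ${HC}"   # 3.9.0 -> hip_3_X, 4.1.0 -> hip_4_link_X, 5.7.0 -> hip_3_X
    done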
+echo +echo " module unload rocm" +echo " srun -n1 make" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh new file mode 100755 index 000000000..dc042a369 --- /dev/null +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 1 ]]; then + echo + echo "You must pass a compiler version number to script. For example," + echo " toss4_icpc-classic.sh 19.1.2" + exit +fi + +COMP_VER=$1 +shift 1 + +BUILD_SUFFIX=lc_toss4-icpc-classic-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpc-classic_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. +## + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you may need to add some intel openmp libraries to your" +echo " LD_LIBRARY_PATH to run with openmp." +echo +echo " LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel/intel-${COMP_VER}/compiler/lib/intel64_lin" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss4_icpc.sh similarity index 66% rename from scripts/lc-builds/toss3_icpc.sh rename to scripts/lc-builds/toss4_icpc.sh index a8b7de2b9..77d81605f 100755 --- a/scripts/lc-builds/toss3_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -1,32 +1,24 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," - echo " toss3_icpc.sh 19.1.0" + echo " toss4_icpc.sh 2022.3" exit fi COMP_VER=$1 shift 1 -COMP_MAJOR_VER=${COMP_VER:0:2} -GCC_HEADER_VER=7 - -if [ ${COMP_MAJOR_VER} -gt 18 ] -then - GCC_HEADER_VER=8 -fi - -BUILD_SUFFIX=lc_toss3-icpc-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/icpc_X_gcc${GCC_HEADER_VER}headers.cmake +BUILD_SUFFIX=lc_toss4-icpc-${COMP_VER} +RAJA_HOST_CONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpc_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -37,10 +29,10 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 ## -# CMake option -DENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile # times at a potential cost of slower 'forall' execution. ## @@ -50,6 +42,7 @@ cmake \ -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ @@ -57,5 +50,12 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you may need to add some intel openmp libraries to your" +echo " LD_LIBRARY_PATH to run with openmp." +echo +echo " LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel/intel-${COMP_VER}/compiler/lib/intel64_lin" +echo echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh new file mode 100755 index 000000000..0a89683c3 --- /dev/null +++ b/scripts/lc-builds/toss4_icpx.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 1 ]]; then + echo + echo "You must pass a compiler version number to script. For example," + echo " toss4_icpx.sh 2022.1.0" + exit +fi + +COMP_VER=$1 +shift 1 + +BUILD_SUFFIX=lc_toss4-icpx-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpx_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. 
+## + +source /usr/tce/packages/intel/intel-${COMP_VER}/setvars.sh + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icx \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. diff --git a/scripts/lc-builds/toss4_mvapich2_icpx.sh b/scripts/lc-builds/toss4_mvapich2_icpx.sh new file mode 100755 index 000000000..def610fb2 --- /dev/null +++ b/scripts/lc-builds/toss4_mvapich2_icpx.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) mvapich2 compiler version number" + echo " 2) icpx compiler version number" + echo " 3...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_mvapich2_icpx.sh 2.3.7 2022.1.0" + exit +fi + +MPI_VER=$1 +COMP_VER=$2 +shift 2 + +BUILD_SUFFIX=lc_toss4-mvapich2-${MPI_VER}-icpx-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpx_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. +## + +source /usr/tce/packages/intel/intel-${COMP_VER}/setvars.sh + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_C_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicc" \ + -DMPI_CXX_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicxx" \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icx \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + ..
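Before building, it can be worth confirming that the mvapich2 wrapper and the icpx install the script points at actually agree; mpicxx -show (a standard MPICH-family option) prints the underlying compiler command. The versions below are hypothetical:

    # Illustrative only: sanity-check the toolchain pairing configured above.
    MPI_VER=2.3.7
    COMP_VER=2022.1.0
    /usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicxx -show
    /usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx --version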
+ +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " srun -n2 ./bin/raja-perf.exe" +echo +echo "***********************************************************************" diff --git a/scripts/make_release_tarball.sh b/scripts/make_release_tarball.sh index cd86cdc80..1956d0436 100755 --- a/scripts/make_release_tarball.sh +++ b/scripts/make_release_tarball.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/travis_build_and_test.sh b/scripts/travis_build_and_test.sh index 5ca692a49..027d41ed7 100755 --- a/scripts/travis_build_and_test.sh +++ b/scripts/travis_build_and_test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/ubuntu-builds/ubuntu_clang.sh b/scripts/ubuntu-builds/ubuntu_clang.sh index 68b722774..7ddba9a7d 100755 --- a/scripts/ubuntu-builds/ubuntu_clang.sh +++ b/scripts/ubuntu-builds/ubuntu_clang.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " ubuntu_clang.sh 10" @@ -22,6 +22,8 @@ RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/ubuntu-builds/clang_X.cmake echo echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null @@ -39,5 +41,5 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory ${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" echo "***********************************************************************" diff --git a/scripts/ubuntu-builds/ubuntu_gcc.sh b/scripts/ubuntu-builds/ubuntu_gcc.sh index 04c57fce7..e40c65482 100755 --- a/scripts/ubuntu-builds/ubuntu_gcc.sh +++ b/scripts/ubuntu-builds/ubuntu_gcc.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " ubuntu_gcc.sh 8" @@ -22,6 +22,8 @@ RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/ubuntu-builds/gcc_X.cmake echo echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null @@ -39,5 +41,5 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory ${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" echo "***********************************************************************" diff --git a/scripts/update_copyright.sh b/scripts/update_copyright.sh index d3bdeb170..527e42d43 100755 --- a/scripts/update_copyright.sh +++ b/scripts/update_copyright.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -17,7 +17,8 @@ # as well. # # IMPORTANT: Since this file is not modified (it is running the shell -# script commands), you must EDIT THE COPYRIGHT DATES ABOVE MANUALLY. +# script commands), you must EDIT THE COPYRIGHT DATES IN THE HEADER ABOVE +# MANUALLY. # # Edit the 'find' command below to change the set of files that will be # modified. @@ -46,18 +47,18 @@ for i in `cat files2change` do echo $i cp $i $i.sed.bak - sed "s/Copyright (c) 2017-22/Copyright (c) 2017-23/" $i.sed.bak > $i + sed "s/Copyright (c) 2017-23/Copyright (c) 2017-24/" $i.sed.bak > $i done echo LICENSE cp LICENSE LICENSE.sed.bak -sed "s/Copyright (c) 2017-2022/Copyright (c) 2017-2023/" LICENSE.sed.bak > LICENSE +sed "s/Copyright (c) 2017-2023/Copyright (c) 2017-2024/" LICENSE.sed.bak > LICENSE -for i in RELEASE README.md +for i in RELEASE README.md docs/conf.py do echo $i cp $i $i.sed.bak - sed "s/2017-22/2017-23/" $i.sed.bak > $i + sed "s/2017-23/2017-24/" $i.sed.bak > $i done #============================================================================= diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 24e0a0815..f60d14744 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# @@ -18,8 +18,9 @@ add_subdirectory(polybench) add_subdirectory(stream) add_subdirectory(stream-kokkos) add_subdirectory(algorithm) +add_subdirectory(comm) -set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS +set(RAJA_PERFSUITE_LIBS common apps basic @@ -29,7 +30,9 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS polybench stream stream-kokkos - algorithm) + algorithm + comm) +set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_LIBS}) list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) if(RAJA_ENABLE_TARGET_OPENMP) @@ -57,12 +60,6 @@ blt_add_executable( apps/PRESSURE.cpp apps/PRESSURE-Seq.cpp apps/PRESSURE-OMPTarget.cpp - apps/HALOEXCHANGE.cpp - apps/HALOEXCHANGE-Seq.cpp - apps/HALOEXCHANGE-OMPTarget.cpp - apps/HALOEXCHANGE_FUSED.cpp - apps/HALOEXCHANGE_FUSED-Seq.cpp - apps/HALOEXCHANGE_FUSED-OMPTarget.cpp apps/LTIMES.cpp apps/LTIMES-Seq.cpp apps/LTIMES-OMPTarget.cpp @@ -75,6 +72,9 @@ blt_add_executable( apps/MASS3DPA.cpp apps/MASS3DPA-Seq.cpp apps/MASS3DPA-OMPTarget.cpp + apps/MATVEC_3D_STENCIL.cpp + apps/MATVEC_3D_STENCIL-Seq.cpp + apps/MATVEC_3D_STENCIL-OMPTarget.cpp apps/NODAL_ACCUMULATION_3D.cpp apps/NODAL_ACCUMULATION_3D-Seq.cpp apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -138,6 +138,9 @@ blt_add_executable( basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp + basic/MULTI_REDUCE.cpp + basic/MULTI_REDUCE-Seq.cpp + basic/MULTI_REDUCE-OMPTarget.cpp lcals/DIFF_PREDICT.cpp lcals/DIFF_PREDICT-Seq.cpp lcals/DIFF_PREDICT-OMPTarget.cpp @@ -248,6 +251,28 @@ blt_add_executable( algorithm/MEMCPY.cpp algorithm/MEMCPY-Seq.cpp algorithm/MEMCPY-OMPTarget.cpp + algorithm/ATOMIC.cpp + algorithm/ATOMIC-Seq.cpp + algorithm/ATOMIC-OMPTarget.cpp + algorithm/HISTOGRAM.cpp + algorithm/HISTOGRAM-Seq.cpp + algorithm/HISTOGRAM-OMPTarget.cpp + comm/HALO_base.cpp + comm/HALO_PACKING.cpp + comm/HALO_PACKING-Seq.cpp + comm/HALO_PACKING-OMPTarget.cpp + comm/HALO_PACKING_FUSED.cpp + comm/HALO_PACKING_FUSED-Seq.cpp + comm/HALO_PACKING_FUSED-OMPTarget.cpp + comm/HALO_SENDRECV.cpp + comm/HALO_SENDRECV-Seq.cpp + comm/HALO_SENDRECV-OMPTarget.cpp + comm/HALO_EXCHANGE.cpp + comm/HALO_EXCHANGE-Seq.cpp + comm/HALO_EXCHANGE-OMPTarget.cpp + comm/HALO_EXCHANGE_FUSED.cpp + comm/HALO_EXCHANGE_FUSED-Seq.cpp + comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS} ) install( TARGETS raja-perf-omptarget.exe @@ -264,4 +289,7 @@ blt_add_executable( install( TARGETS raja-perf.exe RUNTIME DESTINATION bin ) +install( TARGETS ${RAJA_PERFSUITE_LIBS} + LIBRARY DESTINATION lib + ) endif() diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index 3ce688d29..7aa549262 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/ATOMIC-Cuda.cpp b/src/algorithm/ATOMIC-Cuda.cpp new file mode 100644 index 000000000..a286c60d2 --- /dev/null +++ b/src/algorithm/ATOMIC-Cuda.cpp @@ -0,0 +1,338 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include +#include + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +const size_t warp_size = 32; + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_thread(Real_ptr atomic, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i, ATOMIC_VALUE); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_warp(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage warp_reduce_storage; + val = WarpReduce(warp_reduce_storage).Sum(val); + if ((threadIdx.x % warp_size) == 0) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i/warp_size, val); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_block(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage block_reduce_storage; + val = BlockReduce(block_reduce_storage).Sum(val); + if (threadIdx.x == 0) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, blockIdx.x, val); + } +} + + +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_thread), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i, ATOMIC_VALUE); + }); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateWarp(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_warp), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << 
std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_block), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +void ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + if ( vid == Base_CUDA ) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateWarp(vid); + + } + + t += 1; + + } + + }); + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateBlock(vid); + + } + + t += 1; + + } + + }); + + } + + } + + }); + + } else { + + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void ATOMIC::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + if ( vid == Base_CUDA ) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_warp_"+std::to_string(block_size)); + + } + + }); + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_block_"+std::to_string(block_size)); + + } + + }); + + } + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/algorithm/ATOMIC-Hip.cpp 
b/src/algorithm/ATOMIC-Hip.cpp new file mode 100644 index 000000000..fbb103596 --- /dev/null +++ b/src/algorithm/ATOMIC-Hip.cpp @@ -0,0 +1,338 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include +#include + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +const size_t warp_size = 64; + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_thread(Real_ptr atomic, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i, ATOMIC_VALUE); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_warp(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using WarpReduce = rocprim::warp_reduce; + __shared__ typename WarpReduce::storage_type warp_reduce_storage; + WarpReduce().reduce(val, val, warp_reduce_storage); + if ((threadIdx.x % warp_size) == 0) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i/warp_size, val); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_block(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using BlockReduce = rocprim::block_reduce; + __shared__ typename BlockReduce::storage_type block_reduce_storage; + BlockReduce().reduce(val, val, block_reduce_storage); + if (threadIdx.x == 0) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, blockIdx.x, val); + } +} + + +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_thread), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i, ATOMIC_VALUE); + }); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateWarp(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + 
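+      // Warp-aggregated tuning: each in-bounds thread contributes ATOMIC_VALUE,
+      // the warp-level reduction sums those values across the 64-lane wavefront,
+      // and only lane 0 issues a global atomicAdd, cutting atomic traffic by
+      // roughly a factor of warp_size.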
startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_warp), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_block), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +void ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + if ( vid == Base_HIP ) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateWarp(vid); + + } + + t += 1; + + } + + }); + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateBlock(vid); + + } + + t += 1; + + } + + }); + + } + + } + + }); + + } else { + + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void ATOMIC::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + if ( vid == Base_HIP ) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_warp_"+std::to_string(block_size)); + + } + + }); + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() 
== 0u ||
+                run_params.validAtomicReplication(replication)) {
+
+              addVariantTuningName(vid, "replicate_"+std::to_string(replication)+
+                                        "_block_"+std::to_string(block_size));
+
+            }
+
+          });
+
+        }
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_HIP
diff --git a/src/algorithm/ATOMIC-OMP.cpp b/src/algorithm/ATOMIC-OMP.cpp
new file mode 100644
index 000000000..ae3863bb1
--- /dev/null
+++ b/src/algorithm/ATOMIC-OMP.cpp
@@ -0,0 +1,153 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+
+template < size_t replication >
+void ATOMIC::runOpenMPVariantReplicate(VariantID vid)
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  switch ( vid ) {
+
+    case Base_OpenMP : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          #pragma omp atomic
+          ATOMIC_BODY(i, ATOMIC_VALUE);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case Lambda_OpenMP : {
+
+      auto atomic_base_lam = [=](Index_type i) {
+        #pragma omp atomic
+        ATOMIC_BODY(i, ATOMIC_VALUE);
+      };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          atomic_base_lam(i);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_OpenMP : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::forall<RAJA::omp_parallel_for_exec>(
+          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+          ATOMIC_RAJA_BODY(RAJA::omp_atomic, i, ATOMIC_VALUE);
+        });
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    default : {
+      getCout() << "\n  ATOMIC : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+#else
+  RAJA_UNUSED_VAR(vid);
+#endif
+}
+
+
+void ATOMIC::runOpenMPVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runOpenMPVariantReplicate<replication>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown OpenMP variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setOpenMPTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
diff --git a/src/algorithm/ATOMIC-OMPTarget.cpp b/src/algorithm/ATOMIC-OMPTarget.cpp
new file mode 100644
index 000000000..2c7bb7203
--- /dev/null
+++ b/src/algorithm/ATOMIC-OMPTarget.cpp
@@ -0,0 +1,127 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+
+#include "common/OpenMPTargetDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+  //
+  // Define threads per team for target execution
+  //
+  const size_t threads_per_team = 256;
+
+template < size_t replication >
+void ATOMIC::runOpenMPTargetVariantReplicate(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  if ( vid == Base_OpenMPTarget ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      #pragma omp target is_device_ptr(atomic)
+      #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1)
+      for (Index_type i = ibegin; i < iend; ++i ) {
+        #pragma omp atomic
+        ATOMIC_BODY(i, ATOMIC_VALUE);
+      }
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_OpenMPTarget ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+        ATOMIC_RAJA_BODY(RAJA::omp_atomic, i, ATOMIC_VALUE);
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  ATOMIC : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+}
+
+void ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) {
+
+    seq_for(gpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runOpenMPTargetVariantReplicate<replication>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown OMP Target variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) {
+
+    seq_for(gpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_TARGET_OPENMP
diff --git a/src/algorithm/ATOMIC-Seq.cpp b/src/algorithm/ATOMIC-Seq.cpp
new file mode 100644
index 000000000..1cccb8a6b
--- /dev/null
+++ b/src/algorithm/ATOMIC-Seq.cpp
@@ -0,0 +1,146 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+
+template < size_t replication >
+void ATOMIC::runSeqVariantReplicate(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          ATOMIC_BODY(i, ATOMIC_VALUE);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto atomic_base_lam = [=](Index_type i) {
+        ATOMIC_BODY(i, ATOMIC_VALUE);
+      };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          atomic_base_lam(i);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+          [=](Index_type i) {
+          ATOMIC_RAJA_BODY(RAJA::seq_atomic, i, ATOMIC_VALUE);
+        });
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif
+
+    default : {
+      getCout() << "\n  ATOMIC : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+}
+
+
+void ATOMIC::runSeqVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_Seq || vid == Lambda_Seq || vid == RAJA_Seq ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runSeqVariantReplicate<replication>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown Seq variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setSeqTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_Seq || vid == Lambda_Seq || vid == RAJA_Seq ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp
new file mode 100644
index 000000000..8da1c2421
--- /dev/null
+++ b/src/algorithm/ATOMIC.cpp
@@ -0,0 +1,80 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace algorithm +{ + + +ATOMIC::ATOMIC(const RunParams& params) + : KernelBase(rajaperf::Algorithm_ATOMIC, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setFLOPsPerRep(getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( Lambda_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( Lambda_HIP ); + setVariantDefined( RAJA_HIP ); +} + +ATOMIC::~ATOMIC() +{ +} + +void ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + m_init = 0; + m_final = -static_cast(vid); +} + +void ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += static_cast(m_final); +} + +void ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/ATOMIC.hpp b/src/algorithm/ATOMIC.hpp new file mode 100644 index 000000000..55ab41ad8 --- /dev/null +++ b/src/algorithm/ATOMIC.hpp @@ -0,0 +1,113 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// ATOMIC kernel reference implementation: +/// Test atomic throughput with an amount of replication known at compile time. 
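+/// Updates are spread across 'replication' counters, so contention can be
+/// tuned from a single hot location (replication == 1) down to nearly none;
+/// the per-counter totals are summed back into the kernel checksum at
+/// teardown. The RAJA variants express the same update as, for example:
+///
+///   RAJA::atomicAdd<RAJA::seq_atomic>(&atomic[i % replication], 1.0);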
+/// +/// for (Index_type i = 0; i < N; ++i ) { +/// atomic[i%replication] += 1; +/// } +/// + +#ifndef RAJAPerf_Algorithm_ATOMIC_HPP +#define RAJAPerf_Algorithm_ATOMIC_HPP + +#define ATOMIC_DATA_SETUP(replication) \ + Real_type init = m_init; \ + Real_ptr atomic; \ + allocAndInitDataConst(atomic, replication, init, vid); + +#define ATOMIC_DATA_TEARDOWN(replication) \ + { \ + auto reset_atomic = scopedMoveData(atomic, replication, vid); \ + m_final = init; \ + for (size_t r = 0; r < replication; ++r ) { \ + m_final += atomic[r]; \ + } \ + } \ + deallocData(atomic, vid); + +#define ATOMIC_VALUE 1.0 + +#define ATOMIC_BODY(i, val) \ + atomic[(i)%replication] += (val) + +#define ATOMIC_RAJA_BODY(policy, i, val) \ + RAJA::atomicAdd(&atomic[(i)%replication], (val)) + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class ATOMIC : public KernelBase +{ +public: + + ATOMIC(const RunParams& params); + + ~ATOMIC(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + template < size_t replication > + void runSeqVariantReplicate(VariantID vid); + template < size_t replication > + void runOpenMPVariantReplicate(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateGlobal(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateGlobal(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateWarp(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateWarp(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateBlock(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateBlock(VariantID vid); + template < size_t replication > + void runOpenMPTargetVariantReplicate(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + static const size_t default_cpu_atomic_replication = 64; + using cpu_atomic_replications_type = integer::make_atomic_replication_list_type; + static const size_t default_atomic_replication = 4096; + using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; + + Real_type m_init; + Real_type m_final; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt index 54334242e..515c35baa 100644 --- a/src/algorithm/CMakeLists.txt +++ b/src/algorithm/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, 
Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -30,6 +30,7 @@ blt_add_library( REDUCE_SUM-Cuda.cpp REDUCE_SUM-OMP.cpp REDUCE_SUM-OMPTarget.cpp + REDUCE_SUM-Sycl.cpp MEMSET.cpp MEMSET-Seq.cpp MEMSET-Hip.cpp @@ -42,5 +43,17 @@ blt_add_library( MEMCPY-Cuda.cpp MEMCPY-OMP.cpp MEMCPY-OMPTarget.cpp + ATOMIC.cpp + ATOMIC-Seq.cpp + ATOMIC-Hip.cpp + ATOMIC-Cuda.cpp + ATOMIC-OMP.cpp + ATOMIC-OMPTarget.cpp + HISTOGRAM.cpp + HISTOGRAM-Seq.cpp + HISTOGRAM-Hip.cpp + HISTOGRAM-Cuda.cpp + HISTOGRAM-OMP.cpp + HISTOGRAM-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp new file mode 100644 index 000000000..0bc363ee3 --- /dev/null +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -0,0 +1,397 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "cub/device/device_histogram.cuh" +#include "cub/util_allocator.cuh" + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +constexpr Index_type warp_size = 32; + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) +{ + if (shared_replication > 0) { + + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_counts[t] = HISTOGRAM::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); + RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_counts[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], block_sum); + } + } + + } else { + + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], HISTOGRAM::Data_type(1)); + } + } +} + + +void HISTOGRAM::runCudaVariantLibrary(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + HISTOGRAM_DATA_SETUP; + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, 
num_bins, 1); + + RAJAPERF_UNUSED_VAR(counts_init); + + if ( vid == Base_CUDA ) { + + cudaStream_t stream = res.get_stream(); + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); + + // Allocate temporary storage + unsigned char* temp_storage; + allocData(DataSpace::CudaDevice, temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, 1); + HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, 1); + + } + stopTimer(); + + // Free temporary storage + deallocData(DataSpace::CudaDevice, temp_storage); + + } else { + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + +} + + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > +void HISTOGRAM::runCudaVariantAtomicRuntime(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + HISTOGRAM_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + auto* func = &histogram_atomic_runtime; + + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; + + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + + const Index_type max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + + RPlaunchCudaKernel( func, + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type count_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + count_final += hcounts[offset]; + } + counts_final[bin] = count_final; + } + + } + stopTimer(); + + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + + } else if ( vid == 
RAJA_CUDA ) { + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< + RAJA::cuda::MultiReduceTuning< + RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + } + +} + + +void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + runCudaVariantLibrary(vid); + + } + + t += 1; + + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + } + + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + }); + + } + + }); + + }); + + } + + }); + + } else { + + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + + addVariantTuningName(vid, "cub"); + + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); + + }); + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // 
RAJA_ENABLE_CUDA diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp new file mode 100644 index 000000000..5a25bca5c --- /dev/null +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -0,0 +1,426 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#if defined(__HIPCC__) +#define ROCPRIM_HIP_API 1 +#include "rocprim/device/device_histogram.hpp" +#elif defined(__CUDACC__) +#include "cub/device/device_histogram.cuh" +#include "cub/util_allocator.cuh" +#endif + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +constexpr Index_type warp_size = 64; + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) +{ + if (shared_replication > 0) { + + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_counts[t] = HISTOGRAM::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); + RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_counts[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], block_sum); + } + } + + } else { + + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], HISTOGRAM::Data_type(1)); + } + } +} + + +void HISTOGRAM::runHipVariantLibrary(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + HISTOGRAM_DATA_SETUP; + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, 1); + + RAJAPERF_UNUSED_VAR(counts_init); + + if ( vid == Base_HIP ) { + + hipStream_t stream = res.get_stream(); + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) + hipErrchk(::rocprim::histogram_even(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + len, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + stream)); +#elif defined(__CUDACC__) + 
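+      // On an NVIDIA back end, HIP builds fall back to CUB. Note the argument
+      // order: rocprim::histogram_even takes the sample count right after the
+      // input pointer, while cub's HistogramEven takes it after the bin bounds.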
cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); +#endif + + // Allocate temporary storage + unsigned char* temp_storage; + allocData(DataSpace::HipDevice, temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run +#if defined(__HIPCC__) + hipErrchk(::rocprim::histogram_even(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + len, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + stream)); +#elif defined(__CUDACC__) + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); +#endif + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, 1); + HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, 1); + + } + stopTimer(); + + // Free temporary storage + deallocData(DataSpace::HipDevice, temp_storage); + + } else { + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + } + + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + +} + + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > +void HISTOGRAM::runHipVariantAtomicRuntime(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + HISTOGRAM_DATA_SETUP; + + if ( vid == Base_HIP ) { + + auto* func = &histogram_atomic_runtime; + + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; + + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + + const Index_type max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + + RPlaunchHipKernel( func, + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type count_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + count_final += hcounts[offset]; + } + counts_final[bin] = count_final; + } + + } + stopTimer(); + + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + + } else if ( vid == RAJA_HIP ) { + + using exec_policy = 
std::conditional_t, + RAJA::hip_exec_occ_calc>; + + using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< + RAJA::hip::MultiReduceTuning< + RAJA::hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + } + +} + + +void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + runHipVariantLibrary(vid); + + } + + t += 1; + + } + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + } + + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + }); + + } + + }); + + }); + + } + + }); + + } else { + + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void HISTOGRAM::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP ) { + + addVariantTuningName(vid, "rocprim"); + + } + + if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); + + }); + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git 
a/src/algorithm/HISTOGRAM-OMP.cpp b/src/algorithm/HISTOGRAM-OMP.cpp new file mode 100644 index 000000000..87b554b47 --- /dev/null +++ b/src/algorithm/HISTOGRAM-OMP.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void HISTOGRAM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + HISTOGRAM_SETUP_COUNTS; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + HISTOGRAM_BODY; + } + + HISTOGRAM_FINALIZE_COUNTS; + + } + stopTimer(); + + HISTOGRAM_TEARDOWN_COUNTS; + + break; + } + + case Lambda_OpenMP : { + + HISTOGRAM_SETUP_COUNTS; + + auto histogram_base_lam = [=](Index_type i) { + #pragma omp atomic + HISTOGRAM_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + histogram_base_lam(i); + } + + HISTOGRAM_FINALIZE_COUNTS; + + } + stopTimer(); + + HISTOGRAM_TEARDOWN_COUNTS; + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(RAJA::omp_multi_reduce); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(RAJA::omp_multi_reduce); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HISTOGRAM : Unknown variant id = " << vid << std::endl; + } + + } + + HISTOGRAM_DATA_TEARDOWN; + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM-OMPTarget.cpp b/src/algorithm/HISTOGRAM-OMPTarget.cpp new file mode 100644 index 000000000..033f309c9 --- /dev/null +++ b/src/algorithm/HISTOGRAM-OMPTarget.cpp @@ -0,0 +1,68 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
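The OpenMP variants above put #pragma omp atomic on the increment because several threads can land in the same bin at once. A minimal standalone sketch of the same pattern, outside the Suite's macro and timer scaffolding (names here are illustrative):

    #include <vector>
    #include <cstdio>

    int main()
    {
      const int N = 1000000, num_bins = 64;

      std::vector<int> bins(N);
      for (int i = 0; i < N; ++i) bins[i] = i % num_bins;  // a simple bin assignment

      std::vector<unsigned long long> counts(num_bins, 0);
      unsigned long long* c = counts.data();
      const int* b = bins.data();

      #pragma omp parallel for
      for (int i = 0; i < N; ++i) {
        // without the atomic, threads hitting the same bin race and drop counts
        #pragma omp atomic
        c[b[i]] += 1;
      }

      std::printf("counts[0] = %llu (expected %d)\n", c[0], N / num_bins);
      return 0;
    }

Dropping the atomic leaves the loop racing on c[b[i]], and the totals come out short under contention.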
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void HISTOGRAM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(counts, counts_init, num_bins); + + #pragma omp target is_device_ptr(counts, bins) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + HISTOGRAM_BODY; + } + + getOpenMPDeviceData(counts_final, counts, num_bins); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown OMP Target variant id = " << vid << std::endl; + } + + HISTOGRAM_DATA_TEARDOWN; + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/algorithm/HISTOGRAM-Seq.cpp b/src/algorithm/HISTOGRAM-Seq.cpp new file mode 100644 index 000000000..e41ab171e --- /dev/null +++ b/src/algorithm/HISTOGRAM-Seq.cpp @@ -0,0 +1,114 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
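The target variant needs the same atomic on the device; is_device_ptr tells the compiler the pointers are already device-resident. A self-contained sketch that maps the arrays with plain OpenMP map clauses instead of the Suite's initOpenMPDeviceData/getOpenMPDeviceData helpers (a simplification, not the Suite's memory management):

    #include <vector>
    #include <cstdio>

    int main()
    {
      const int N = 100000, num_bins = 16;

      std::vector<int> bins(N);
      for (int i = 0; i < N; ++i) bins[i] = i % num_bins;
      std::vector<unsigned long long> counts(num_bins, 0);

      int* b = bins.data();
      unsigned long long* c = counts.data();

      // map the inputs to the device and the counts back; the Suite instead
      // keeps device copies alive across reps via its own data helpers
      #pragma omp target teams distribute parallel for \
          map(to: b[0:N]) map(tofrom: c[0:num_bins]) thread_limit(256)
      for (int i = 0; i < N; ++i) {
        #pragma omp atomic
        c[b[i]] += 1;
      }

      std::printf("counts[0] = %llu (expected %d)\n", c[0], N / num_bins);
      return 0;
    }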
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void HISTOGRAM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + HISTOGRAM_SETUP_COUNTS; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS; + + for (Index_type i = ibegin; i < iend; ++i ) { + HISTOGRAM_BODY; + } + + HISTOGRAM_FINALIZE_COUNTS; + + } + stopTimer(); + + HISTOGRAM_TEARDOWN_COUNTS; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + HISTOGRAM_SETUP_COUNTS; + + auto histogram_base_lam = [=](Index_type i) { + HISTOGRAM_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS; + + for (Index_type i = ibegin; i < iend; ++i ) { + histogram_base_lam(i); + } + + HISTOGRAM_FINALIZE_COUNTS; + + } + stopTimer(); + + HISTOGRAM_TEARDOWN_COUNTS; + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(RAJA::seq_multi_reduce); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(RAJA::seq_multi_reduce); + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n HISTOGRAM : Unknown variant id = " << vid << std::endl; + } + + } + + HISTOGRAM_DATA_TEARDOWN; + +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp new file mode 100644 index 000000000..60ad2975e --- /dev/null +++ b/src/algorithm/HISTOGRAM.cpp @@ -0,0 +1,151 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
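Across all of these backends, the RAJA variants funnel through RAJA::MultiReduceSum, seeded from counts_init and drained with get_all, per the HISTOGRAM_INIT_COUNTS_RAJA/HISTOGRAM_FINALIZE_COUNTS_RAJA macros defined later in HISTOGRAM.hpp. A sequential sketch, assuming the multi-reduce interface behaves as this diff exercises it:

    #include "RAJA/RAJA.hpp"
    #include <vector>
    #include <cstdio>

    int main()
    {
      const RAJA::Index_type N = 1000;
      const int num_bins = 8;

      std::vector<int> bins(N);
      for (RAJA::Index_type i = 0; i < N; ++i) bins[i] = static_cast<int>(i % num_bins);
      const int* b = bins.data();

      std::vector<unsigned long long> counts_init(num_bins, 0);
      std::vector<unsigned long long> counts_final(num_bins, 0);

      // one reducer object managing num_bins running sums, seeded from a container
      RAJA::MultiReduceSum<RAJA::seq_multi_reduce, unsigned long long> counts(counts_init);

      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
        [=](RAJA::Index_type i) {
          counts[b[i]] += 1;   // the same shape as HISTOGRAM_BODY
        });

      counts.get_all(counts_final);  // drain all bins at once, as the macro does

      for (int k = 0; k < num_bins; ++k) {
        std::printf("bin %d: %llu\n", k, counts_final[k]);
      }
      return 0;
    }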
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +HISTOGRAM::HISTOGRAM(const RunParams& params) + : KernelBase(rajaperf::Algorithm_HISTOGRAM, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + m_num_bins = params.getMultiReduceNumBins(); + m_bin_assignment_algorithm = params.getMultiReduceBinAssignmentAlgorithm(); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins + + 1*sizeof(Index_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(1 * getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); +} + +HISTOGRAM::~HISTOGRAM() +{ +} + +void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocData(m_bins, getActualProblemSize(), vid); + { + auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); + + const bool init_random_per_iterate = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Random); + const bool init_random_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsRandomSizes); + const bool init_even_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsEvenSizes); + const bool init_all_one = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Single); + + if (init_even_sizes || init_random_sizes || init_all_one) { + Real_ptr data = nullptr; + if (init_even_sizes) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(b+1) / m_num_bins; + } + } else if (init_random_sizes) { + allocAndInitDataRandValue(data, m_num_bins, Base_Seq); + std::sort(data, data+m_num_bins); + } else if (init_all_one) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(0); + } + } + + Index_type actual_prob_size = getActualProblemSize(); + Index_type bin = 0; + for (Index_type i = 0; i < actual_prob_size; ++i) { + Real_type pos = static_cast(i) / actual_prob_size; + while (bin+1 < m_num_bins && pos >= data[bin]) { + bin += 1; + } + m_bins[i] = bin; + } + + deallocData(data, Base_Seq); + + } else if (init_random_per_iterate) { + Real_ptr data; + allocAndInitDataRandValue(data, getActualProblemSize(), Base_Seq); + + for (Index_type i = 0; i < getActualProblemSize(); ++i) { + m_bins[i] = static_cast(data[i] * m_num_bins); + if (m_bins[i] >= m_num_bins) { + m_bins[i] = m_num_bins - 1; + } + if (m_bins[i] < 0) { + m_bins[i] = 0; + } + } + + deallocData(data, Base_Seq); + } else { + throw 1; + } + } + + m_counts_init.resize(m_num_bins, 0); + m_counts_final.resize(m_num_bins, 0); +} + +void HISTOGRAM::updateChecksum(VariantID vid, size_t 
tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_counts_final.data(), m_num_bins, vid); +} + +void HISTOGRAM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_bins, vid); + m_counts_init.clear(); m_counts_init.shrink_to_fit(); + m_counts_final.clear(); m_counts_final.shrink_to_fit(); +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp new file mode 100644 index 000000000..2752f2c92 --- /dev/null +++ b/src/algorithm/HISTOGRAM.hpp @@ -0,0 +1,146 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HISTOGRAM kernel reference implementation: +/// +/// Index_type* counts = calloc(num_bins, sizeof(Index_type)); +/// for (Index_type i = 0; i < N; ++i ) { +/// counts[bins[i]] += 1; +/// } +/// + +#ifndef RAJAPerf_Algorithm_HISTOGRAM_HPP +#define RAJAPerf_Algorithm_HISTOGRAM_HPP + +#define HISTOGRAM_DATA_SETUP \ + Index_type num_bins = m_num_bins; \ + Index_ptr bins = m_bins; \ + std::vector& counts_init = m_counts_init; \ + std::vector& counts_final = m_counts_final; + +#define HISTOGRAM_DATA_TEARDOWN + + +#define HISTOGRAM_SETUP_COUNTS \ + Data_ptr counts; \ + allocData(getReductionDataSpace(vid), counts, num_bins); + +#define HISTOGRAM_TEARDOWN_COUNTS \ + deallocData(counts, vid); + +#define HISTOGRAM_INIT_COUNTS \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + counts[b] = counts_init[b]; \ + } + +#define HISTOGRAM_FINALIZE_COUNTS \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + counts_final[b] = counts[b]; \ + } + +#define HISTOGRAM_INIT_COUNTS_RAJA(policy) \ + RAJA::MultiReduceSum counts(counts_init); + +#define HISTOGRAM_FINALIZE_COUNTS_RAJA(policy) \ + counts.get_all(counts_final); + +#define HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, replication) \ + for (Index_type b = 0; b < (num_bins); ++b) { \ + Data_type count_final = 0; \ + for (size_t r = 0; r < (replication); ++r) { \ + count_final += (hcounts)[HISTOGRAM_GPU_BIN_INDEX(b, r, replication)]; \ + } \ + counts_final[b] = count_final; \ + } + + +#define HISTOGRAM_BODY \ + counts[bins[i]] += static_cast(1); + +#define HISTOGRAM_RAJA_BODY(policy) \ + RAJA::atomicAdd(&counts[bins[i]], static_cast(1)); + +#define HISTOGRAM_GPU_BIN_INDEX(bin, offset, replication) \ + ((bin)*(replication) + ((offset)%(replication))) + +#define HISTOGRAM_GPU_RAJA_BODY(policy, counts, index, value) \ + RAJA::atomicAdd(&(counts)[(index)], (value)); + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class HISTOGRAM : public KernelBase +{ +public: + using Data_type = unsigned long long; + using Data_ptr = Data_type*; + + HISTOGRAM(const RunParams& params); + + ~HISTOGRAM(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void 
runKokkosVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + void runCudaVariantLibrary(VariantID vid); + void runHipVariantLibrary(VariantID vid); + + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > + void runCudaVariantAtomicRuntime(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > + void runHipVariantAtomicRuntime(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + static const size_t default_cuda_atomic_global_replication = 2; + static const size_t default_cuda_atomic_shared_replication = 16; + using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + + static const size_t default_hip_atomic_global_replication = 32; + static const size_t default_hip_atomic_shared_replication = 4; + using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + + Index_type m_num_bins; + RunParams::BinAssignmentAlgorithm m_bin_assignment_algorithm; + Index_ptr m_bins; + std::vector m_counts_init; + std::vector m_counts_final; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/algorithm/MEMCPY-Cuda.cpp b/src/algorithm/MEMCPY-Cuda.cpp index fca6848f8..9f0fda034 100644 --- a/src/algorithm/MEMCPY-Cuda.cpp +++ b/src/algorithm/MEMCPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
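HISTOGRAM_GPU_BIN_INDEX lays the counts array out as num_bins groups of `replication` slots, so that concurrent atomics on one logical bin spread over distinct addresses; HISTOGRAM_GPU_FINALIZE_COUNTS then folds each group back into a single count. A host-only demonstration of that layout and the fold (note the atomic-runtime path in HISTOGRAM-Hip.cpp instead folds a replica-major layout, bin + r*num_bins, and uses RAJA::power_of_2_mod rather than plain %):

    #include <vector>
    #include <cstdio>

    // mirrors HISTOGRAM_GPU_BIN_INDEX from HISTOGRAM.hpp:
    // replica r of bin b lives at b*replication + (r % replication)
    inline int bin_index(int bin, int offset, int replication)
    {
      return bin * replication + (offset % replication);
    }

    int main()
    {
      const int num_bins = 4, replication = 8, N = 10000;
      std::vector<unsigned long long> slots(num_bins * replication, 0);

      // simulate "threads" scattering increments across replicas;
      // on the GPU the offset is derived from the thread index
      for (int i = 0; i < N; ++i) {
        int bin = i % num_bins;
        int offset = i;  // stand-in for a per-thread id
        slots[bin_index(bin, offset, replication)] += 1;
      }

      // finalize: fold the replicas of each bin, as in HISTOGRAM_GPU_FINALIZE_COUNTS
      for (int b = 0; b < num_bins; ++b) {
        unsigned long long total = 0;
        for (int r = 0; r < replication; ++r) {
          total += slots[bin_index(b, r, replication)];
        }
        std::printf("bin %d: %llu\n", b, total);
      }
      return 0;
    }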
// @@ -48,7 +48,9 @@ void MEMCPY::runCudaVariantLibrary(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync(MEMCPY_STD_ARGS, cudaMemcpyDefault, res.get_stream()) ); + cudaErrchk( cudaMemcpyAsync(MEMCPY_STD_ARGS, + cudaMemcpyDefault, + res.get_stream()) ); } stopTimer(); @@ -89,9 +91,11 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - memcpy<<>>( - x, y, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (memcpy), + grid_size, block_size, + shmem, res.get_stream(), + x, y, iend ); } stopTimer(); @@ -107,9 +111,12 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, memcpy_lambda ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memcpy_lambda ); } stopTimer(); diff --git a/src/algorithm/MEMCPY-Hip.cpp b/src/algorithm/MEMCPY-Hip.cpp index d0c239a67..0e880c1b4 100644 --- a/src/algorithm/MEMCPY-Hip.cpp +++ b/src/algorithm/MEMCPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -48,7 +48,9 @@ void MEMCPY::runHipVariantLibrary(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync(MEMCPY_STD_ARGS, hipMemcpyDefault, res.get_stream()) ); + hipErrchk( hipMemcpyAsync(MEMCPY_STD_ARGS, + hipMemcpyDefault, + res.get_stream()) ); } stopTimer(); @@ -89,10 +91,11 @@ void MEMCPY::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL( (memcpy), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, y, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (memcpy), + grid_size, block_size, + shmem, res.get_stream(), + x, y, iend ); } stopTimer(); @@ -108,10 +111,12 @@ void MEMCPY::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, memcpy_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memcpy_lambda ); } stopTimer(); diff --git a/src/algorithm/MEMCPY-OMP.cpp b/src/algorithm/MEMCPY-OMP.cpp index 55b63afd6..184f897bf 100644 --- a/src/algorithm/MEMCPY-OMP.cpp +++ b/src/algorithm/MEMCPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
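These MEMCPY hunks swap raw <<<...>>> and hipLaunchKernelGGL launches for RPlaunchCudaKernel/RPlaunchHipKernel, folding the per-call-site cudaGetLastError/hipGetLastError check into one place. The wrapper's definition is not part of this diff; a hypothetical HIP-side equivalent might look like the following (launchHipKernel, hipCheck, and their signatures are assumptions, not the Suite's actual helpers):

    #include <hip/hip_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define hipCheck(call)                                                    \
      do {                                                                    \
        hipError_t err = (call);                                              \
        if (err != hipSuccess) {                                              \
          std::fprintf(stderr, "HIP error: %s\n", hipGetErrorString(err));    \
          std::exit(1);                                                       \
        }                                                                     \
      } while (0)

    // hypothetical wrapper in the spirit of RPlaunchHipKernel: one place to
    // launch and to check for launch errors, instead of a check at every site
    template <typename Kernel, typename... Args>
    void launchHipKernel(Kernel kernel, size_t grid_size, size_t block_size,
                         size_t shmem, hipStream_t stream, Args... args)
    {
      hipLaunchKernelGGL(kernel, dim3(grid_size), dim3(block_size),
                         shmem, stream, args...);
      hipCheck(hipGetLastError());
    }

    __global__ void fill(double* x, double v, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] = v;  // tail guard for the rounded-up grid
    }

    int main()
    {
      const int n = 1 << 20;
      double* x = nullptr;
      hipCheck(hipMalloc(&x, n * sizeof(double)));

      launchHipKernel(fill, (n + 255) / 256, 256, 0, hipStream_t{}, x, 2.0, n);

      hipCheck(hipDeviceSynchronize());
      hipCheck(hipFree(x));
      return 0;
    }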
// diff --git a/src/algorithm/MEMCPY-OMPTarget.cpp b/src/algorithm/MEMCPY-OMPTarget.cpp index 4f4932793..0b3536d42 100644 --- a/src/algorithm/MEMCPY-OMPTarget.cpp +++ b/src/algorithm/MEMCPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMCPY-Seq.cpp b/src/algorithm/MEMCPY-Seq.cpp index 02a24668f..57c3f219f 100644 --- a/src/algorithm/MEMCPY-Seq.cpp +++ b/src/algorithm/MEMCPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index 49446a265..f8ced7ac7 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ MEMCPY::MEMCPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp index 9fa46ae9e..b6cd49038 100644 --- a/src/algorithm/MEMCPY.hpp +++ b/src/algorithm/MEMCPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -71,7 +71,7 @@ class MEMCPY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/algorithm/MEMSET-Cuda.cpp b/src/algorithm/MEMSET-Cuda.cpp index bca349509..d0c60e97d 100644 --- a/src/algorithm/MEMSET-Cuda.cpp +++ b/src/algorithm/MEMSET-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
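Splitting the old setBytesPerRep into read/written/atomic components makes the traffic model explicit: MEMCPY moves each element once in and once out, so a rep touches 2*sizeof(Real_type)*N bytes with no atomic traffic. A back-of-the-envelope check, assuming Real_type is double, a representative N, and a made-up 1 ms rep time:

    #include <cstdio>

    int main()
    {
      const long long N = 1000000;  // a representative problem size

      // MEMCPY per-rep traffic under the new accounting
      const long long bytes_read    = static_cast<long long>(sizeof(double)) * N;  // x read once
      const long long bytes_written = static_cast<long long>(sizeof(double)) * N;  // y written once
      const long long bytes_atomic  = 0;                                           // none

      // the bandwidth this implies for a hypothetical 1 ms rep
      const double seconds = 1.0e-3;
      const double gb_per_s =
          double(bytes_read + bytes_written + bytes_atomic) / seconds / 1.0e9;

      std::printf("traffic/rep = %lld bytes -> %.1f GB/s at 1 ms/rep\n",
                  bytes_read + bytes_written, gb_per_s);
      return 0;
    }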
// @@ -89,11 +89,11 @@ void MEMSET::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - memset<<>>( x, - val, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (memset), + grid_size, block_size, + shmem, res.get_stream(), + x, val, iend ); } stopTimer(); @@ -109,9 +109,12 @@ void MEMSET::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, memset_lambda ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memset_lambda ); } stopTimer(); diff --git a/src/algorithm/MEMSET-Hip.cpp b/src/algorithm/MEMSET-Hip.cpp index d0dacd545..c838aed28 100644 --- a/src/algorithm/MEMSET-Hip.cpp +++ b/src/algorithm/MEMSET-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -89,10 +89,11 @@ void MEMSET::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL( (memset), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, val, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (memset), + grid_size, block_size, + shmem, res.get_stream(), + x, val, iend ); } stopTimer(); @@ -108,10 +109,12 @@ void MEMSET::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, memset_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memset_lambda ); } stopTimer(); diff --git a/src/algorithm/MEMSET-OMP.cpp b/src/algorithm/MEMSET-OMP.cpp index ebd931e4d..66a6e027c 100644 --- a/src/algorithm/MEMSET-OMP.cpp +++ b/src/algorithm/MEMSET-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET-OMPTarget.cpp b/src/algorithm/MEMSET-OMPTarget.cpp index ec6d9c716..cee5a8577 100644 --- a/src/algorithm/MEMSET-OMPTarget.cpp +++ b/src/algorithm/MEMSET-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
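Every launch in these hunks sizes the grid with RAJA_DIVIDE_CEILING_INT(iend, block_size), rounding up so a partial final block still covers the tail, which the kernels then guard with an i < iend test. The macro's definition lives in the Suite's common headers, not in this diff; the idiom itself is just:

    // ceiling division for non-negative sizes, in the spirit of RAJA_DIVIDE_CEILING_INT
    constexpr long long divide_ceiling(long long n, long long d)
    {
      return (n + d - 1) / d;
    }

    static_assert(divide_ceiling(1000000, 256) == 3907,
                  "3906 full blocks plus one partial block");
    static_assert(divide_ceiling(1024, 256) == 4,
                  "an exact multiple adds no extra block");

    int main() { return 0; }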
// diff --git a/src/algorithm/MEMSET-Seq.cpp b/src/algorithm/MEMSET-Seq.cpp index 145fd462e..3064e7cb1 100644 --- a/src/algorithm/MEMSET-Seq.cpp +++ b/src/algorithm/MEMSET-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index 95d3d5321..04ad4f52c 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,8 +28,9 @@ MEMSET::MEMSET(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (0*sizeof(Real_type) + 1*sizeof(Real_type)) + - (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/algorithm/MEMSET.hpp b/src/algorithm/MEMSET.hpp index ebf2f867b..0266c9e1a 100644 --- a/src/algorithm/MEMSET.hpp +++ b/src/algorithm/MEMSET.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -71,7 +71,7 @@ class MEMSET : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_type m_val; diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index d36614f9e..302ab35d6 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -18,6 +18,10 @@ #include "cub/util_allocator.cuh" #include +#include +#include +#include + namespace rajaperf { @@ -26,7 +30,7 @@ namespace algorithm template < size_t block_size > __launch_bounds__(block_size) -__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, +__global__ void reduce_sum(Real_ptr x, Real_ptr sum, Real_type sum_init, Index_type iend) { extern __shared__ Real_type psum[ ]; @@ -46,15 +50,9 @@ __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, __syncthreads(); } -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dsum, psum[ 0 ] ); - } -#else // this doesn't work due to data races if ( threadIdx.x == 0 ) { - *dsum += psum[ 0 ]; + RAJA::atomicAdd( sum, psum[ 0 ] ); } -#endif } @@ -74,8 +72,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) int len = iend - ibegin; - Real_type* sum_storage; - allocData(DataSpace::CudaPinned, sum_storage, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -83,7 +80,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, len, ::cub::Sum(), m_sum_init, @@ -102,21 +99,21 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, len, ::cub::Sum(), m_sum_init, stream)); - cudaErrchk(cudaStreamSynchronize(stream)); - m_sum = *sum_storage; + RAJAPERF_CUDA_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); // Free temporary storage deallocData(DataSpace::CudaDevice, temp_storage); - deallocData(DataSpace::CudaPinned, sum_storage); + RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum); } else { @@ -126,11 +123,10 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) } -template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlock(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -139,40 +135,68 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - Real_ptr dsum; - allocData(DataSpace::CudaDevice, dsum, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce_sum), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1, 1); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - reduce_sum<<>>( x, - dsum, m_sum_init, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - cudaErrchk( cudaMemcpyAsync( &m_sum, dsum, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + RPlaunchCudaKernel( (reduce_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, 
sum, m_sum_init, iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); - deallocData(DataSpace::CudaDevice, dsum); + RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_SUM::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_SUM_DATA_SETUP; - } else if ( vid == RAJA_CUDA ) { + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_SUM_BODY; }); @@ -190,11 +214,54 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) } +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] __device__ (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) { - if ( vid == Base_CUDA ) { + size_t t = 0; - size_t t = 0; + if ( vid == Base_CUDA ) { if (tune_idx == t) { @@ -204,39 +271,59 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) t += 1; + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - setBlockSize(block_size); - runCudaVariantBlock(vid); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_CUDA ) { - t += 1; + if (tune_idx == t) { - } + setBlockSize(block_size); + runCudaVariantBase(vid); - }); + } - } else if ( vid == RAJA_CUDA ) { + t += 1; - size_t t = 0; + } else if ( vid == RAJA_CUDA ) { - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runCudaVariantRAJA(vid); - runCudaVariantBlock(vid); + } - } + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } - t += 1; + t += 1; + + } + + }); } @@ -256,31 +343,53 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, "cub"); + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { 
+ seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_CUDA ) { - }); + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - } else if ( vid == RAJA_CUDA ) { + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + } else if ( vid == RAJA_CUDA ) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); } }); } + } } // end namespace algorithm diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 88a16f331..831978015 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
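The reduce_sum kernel in both the CUDA and HIP files is a two-phase scheme: a shared-memory tree reduction inside each block, then thread 0 publishes the block's partial with RAJA::atomicAdd. The deleted #else branch shows the motivation, since an unguarded *dsum += psum[0] races across blocks. A host-only model of the two phases, with std::atomic standing in for the device atomic:

    #include <atomic>
    #include <vector>
    #include <cstdio>

    int main()
    {
      const int num_blocks = 64, block_size = 256;
      std::vector<double> x(num_blocks * block_size, 1.0);

      std::atomic<long long> total{0};  // scaled to integers so fetch_add is exact

      for (int b = 0; b < num_blocks; ++b) {
        // phase 1: intra-block tree reduction; active "threads" pairwise-combine,
        // halving the stride each step, as in the shared-memory loop
        std::vector<double> psum(x.begin() + b * block_size,
                                 x.begin() + (b + 1) * block_size);
        for (int stride = block_size / 2; stride > 0; stride /= 2) {
          for (int t = 0; t < stride; ++t) psum[t] += psum[t + stride];
        }
        // phase 2: one "thread" per block publishes atomically,
        // mirroring RAJA::atomicAdd(sum, psum[0])
        total.fetch_add(static_cast<long long>(psum[0]));
      }

      std::printf("sum = %lld (expected %d)\n", total.load(), num_blocks * block_size);
      return 0;
    }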
// @@ -23,6 +23,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -31,7 +35,7 @@ namespace algorithm template < size_t block_size > __launch_bounds__(block_size) -__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, +__global__ void reduce_sum(Real_ptr x, Real_ptr sum, Real_type sum_init, Index_type iend) { HIP_DYNAMIC_SHARED(Real_type, psum); @@ -51,15 +55,9 @@ __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, __syncthreads(); } -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dsum, psum[ 0 ] ); - } -#else // this doesn't work due to data races if ( threadIdx.x == 0 ) { - *dsum += psum[ 0 ]; + RAJA::atomicAdd( sum, psum[ 0 ] ); } -#endif } @@ -79,8 +77,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) int len = iend - ibegin; - Real_type* sum_storage; - allocData(DataSpace::HipPinned, sum_storage, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -89,7 +86,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::rocprim::reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, m_sum_init, len, rocprim::plus(), @@ -98,7 +95,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, len, ::cub::Sum(), m_sum_init, @@ -119,7 +116,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::rocprim::reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, m_sum_init, len, rocprim::plus(), @@ -128,22 +125,22 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + sum, len, ::cub::Sum(), m_sum_init, stream)); #endif - hipErrchk(hipStreamSynchronize(stream)); - m_sum = *sum_storage; + RAJAPERF_HIP_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); // Free temporary storage deallocData(DataSpace::HipDevice, temp_storage); - deallocData(DataSpace::HipPinned, sum_storage); + RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum); } else { @@ -153,11 +150,10 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) } -template < size_t block_size > -void REDUCE_SUM::runHipVariantBlock(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -166,39 +162,68 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - Real_ptr dsum; - allocData(DataSpace::HipDevice, dsum, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce_sum), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - 
constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - x, dsum, m_sum_init, iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (reduce_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, sum, m_sum_init, iend ); - hipErrchk( hipMemcpyAsync( &m_sum, dsum, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); - deallocData(DataSpace::HipDevice, dsum); + RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_SUM::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_SUM_DATA_SETUP; - } else if ( vid == RAJA_HIP ) { + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); - RAJA::forall< RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_SUM_BODY; }); @@ -216,11 +241,54 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) } +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] __device__ (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) { - if ( vid == Base_HIP ) { + size_t t = 0; - size_t t = 0; + if ( vid == Base_HIP ) { if (tune_idx == t) { @@ -230,39 +298,59 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) t += 1; + } + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - setBlockSize(block_size); - runHipVariantBlock(vid); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_HIP ) { - t += 1; + if (tune_idx == t) { - } + setBlockSize(block_size); + runHipVariantBase(vid); - }); + } - } else if ( vid == RAJA_HIP ) { + t += 1; - size_t t = 0; + } else if ( vid == RAJA_HIP ) { - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + if 
(tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runHipVariantRAJA(vid); - runHipVariantBlock(vid); + } - } + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); - t += 1; + } + + t += 1; + + } + + }); } @@ -286,25 +374,45 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) addVariantTuningName(vid, "cub"); #endif + } + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_HIP ) { - }); + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - } else if ( vid == RAJA_HIP ) { + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + } else if ( vid == RAJA_HIP ) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + }); } diff --git a/src/algorithm/REDUCE_SUM-OMP.cpp b/src/algorithm/REDUCE_SUM-OMP.cpp index 49d0d766e..1295887f5 100644 --- a/src/algorithm/REDUCE_SUM-OMP.cpp +++ b/src/algorithm/REDUCE_SUM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
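The tuning machinery in runHipVariant/setHipTuningDefinitions walks compile-time lists (block sizes, mapping helpers, replication factors) with seq_for, registering one tuning name per combination and instantiating the matching template when tune_idx lines up. seq_for itself is a Suite utility; a stand-in for the pattern using std::index_sequence and a C++17 fold:

    #include <cstdio>
    #include <cstddef>
    #include <type_traits>
    #include <utility>

    // minimal stand-in for the Suite's seq_for over a compile-time value list
    template <typename F, std::size_t... Vs>
    void for_each_value(std::index_sequence<Vs...>, F&& f)
    {
      (f(std::integral_constant<std::size_t, Vs>{}), ...);  // invoked in order
    }

    template <std::size_t block_size>
    void run_variant()
    {
      std::printf("running tuning with block_size=%zu\n", block_size);
    }

    int main()
    {
      using block_sizes = std::index_sequence<128, 256, 512>;

      std::size_t t = 0;
      const std::size_t tune_idx = 1;  // pretend the harness asked for tuning 1

      for_each_value(block_sizes{}, [&](auto bs) {
        if (t == tune_idx) {
          run_variant<decltype(bs)::value>();  // instantiate exactly one combination
        }
        t += 1;  // every combination advances the index, chosen or not
      });
      return 0;
    }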
//
@@ -18,7 +18,7 @@
 namespace algorithm
 {
 
-void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t tune_idx)
 {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
@@ -76,21 +76,48 @@ void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune
 
     case RAJA_OpenMP : {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          RAJA::ReduceSum<RAJA::omp_reduce, Real_type> sum(m_sum_init);
+
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            [=](Index_type i) {
+              REDUCE_SUM_BODY;
+          });
+
+          m_sum = sum.get();
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
 
-      RAJA::ReduceSum<RAJA::omp_reduce, Real_type> sum(m_sum_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::forall<RAJA::omp_parallel_for_exec>(
-        RAJA::RangeSegment(ibegin, iend),
-        [=](Index_type i) {
-          REDUCE_SUM_BODY;
-      });
+          Real_type tsum = m_sum_init;
 
-      m_sum = sum.get();
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
+            [=] (Index_type i, Real_type& sum) {
+              REDUCE_SUM_BODY;
+            }
+          );
+          m_sum = static_cast<Real_type>(tsum);
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n REDUCE_SUM : Unknown OpenMP tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();
+
       break;
     }
@@ -103,8 +130,17 @@ void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune
 
 #else
   RAJA_UNUSED_VAR(vid);
+  RAJA_UNUSED_VAR(tune_idx);
 #endif
 }
 
+void REDUCE_SUM::setOpenMPTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_OpenMP) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf
diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp
index a8652099e..1c1be1ab7 100644
--- a/src/algorithm/REDUCE_SUM-OMPTarget.cpp
+++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
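Tuning 0 ("default") is the long-standing RAJA::ReduceSum object; tuning 1 ("new") passes RAJA::expt::Reduce so forall hands the lambda a reference to a private running sum. A sequential side-by-side, assuming the expt interface behaves as this diff uses it:

    #include "RAJA/RAJA.hpp"
    #include <vector>
    #include <cstdio>

    int main()
    {
      const RAJA::Index_type N = 1000;
      std::vector<double> x(N, 1.0);
      const double* xp = x.data();

      // tuning "default": reducer object captured by value, read back with get()
      RAJA::ReduceSum<RAJA::seq_reduce, double> sum(0.0);
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
        [=](RAJA::Index_type i) {
          sum += xp[i];
        });
      std::printf("default: %f\n", sum.get());

      // tuning "new": a plain variable reduced through an extra lambda argument
      double tsum = 0.0;
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
        RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
        [=](RAJA::Index_type i, double& s) {
          s += xp[i];
        });
      std::printf("new: %f\n", tsum);

      return 0;
    }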
// @@ -27,7 +27,7 @@ namespace algorithm const size_t threads_per_team = 256; -void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -56,21 +56,47 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum sum(m_sum_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE_SUM_BODY; - }); + Real_type tsum = m_sum_init; - m_sum = sum.get(); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_SUM : Unknown OMP Target tuning index = " << tune_idx << std::endl; } - stopTimer(); } else { getCout() << "\n REDUCE_SUM : Unknown OMP Target variant id = " << vid << std::endl; @@ -78,6 +104,14 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } +void REDUCE_SUM::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM-Seq.cpp b/src/algorithm/REDUCE_SUM-Seq.cpp index 8c7086057..8d4fdacb2 100644 --- a/src/algorithm/REDUCE_SUM-Seq.cpp +++ b/src/algorithm/REDUCE_SUM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
//
@@ -18,8 +18,11 @@
 namespace algorithm
 {
 
-void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void REDUCE_SUM::runSeqVariant(VariantID vid, size_t tune_idx)
 {
+#if !defined(RUN_RAJA_SEQ)
+  RAJA_UNUSED_VAR(tune_idx);
+#endif
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
@@ -73,23 +76,48 @@ void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id
 
     case RAJA_Seq : {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          RAJA::ReduceSum<RAJA::seq_reduce, Real_type> sum(m_sum_init);
+
+          RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+            [=](Index_type i) {
+              REDUCE_SUM_BODY;
+          });
+
+          m_sum = sum.get();
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
 
-      RAJA::ReduceSum<RAJA::seq_reduce, Real_type> sum(m_sum_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
-        [=](Index_type i) {
-          REDUCE_SUM_BODY;
-      });
+          Real_type tsum = m_sum_init;
 
-      m_sum = sum.get();
+          RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
+            [=] (Index_type i, Real_type& sum) {
+              REDUCE_SUM_BODY;
+            }
+          );
+          m_sum = static_cast<Real_type>(tsum);
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n REDUCE_SUM : Unknown Seq tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();
 
       break;
-    }
+    }
 #endif
 
     default : {
@@ -100,5 +128,13 @@ void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id
 
 }
 
+void REDUCE_SUM::setSeqTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_Seq) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf
diff --git a/src/algorithm/REDUCE_SUM-Sycl.cpp b/src/algorithm/REDUCE_SUM-Sycl.cpp
new file mode 100644
index 000000000..516048863
--- /dev/null
+++ b/src/algorithm/REDUCE_SUM-Sycl.cpp
@@ -0,0 +1,103 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + + +namespace rajaperf +{ +namespace algorithm +{ + +template +void REDUCE_SUM::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + Real_ptr sum; + allocAndInitSyclDeviceData(sum, &m_sum_init, 1, qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(sum, &m_sum_init, 1, qu); + + qu->submit([&] (sycl::handler& h) { + + auto sumReduction = sycl::reduction(sum, sycl::plus()); + + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sumReduction, + [=] (sycl::nd_item<1> item, auto& sum) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + REDUCE_SUM_BODY; + } + + }); + }); + + Real_type lsum; + Real_ptr plsum = &lsum; + getSyclDeviceData(plsum, sum, 1, qu); + m_sum = lsum; + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + std::cout << "\n REDUCE_SUM : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_SUM, Sycl) + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 3712f5ffa..4aebb5b0f 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
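The Base_SYCL path builds a sycl::reduction over the device-visible sum and lets parallel_for combine per-item contributions; the global range is padded up to a work-group multiple, so the kernel guards the tail with an i < iend test. A standalone USM sketch of the same shape (default-selected device, double data; details differ from the Suite's helper-managed version):

    #include <sycl/sycl.hpp>
    #include <cstdio>

    int main()
    {
      sycl::queue q;  // default device
      const size_t N = 1 << 20, wg = 256;
      const size_t global = wg * ((N + wg - 1) / wg);  // pad to a work-group multiple

      double* x   = sycl::malloc_shared<double>(N, q);
      double* sum = sycl::malloc_shared<double>(1, q);
      for (size_t i = 0; i < N; ++i) x[i] = 1.0;
      *sum = 0.0;  // the Suite reinitializes this per rep

      q.submit([&](sycl::handler& h) {
        auto red = sycl::reduction(sum, sycl::plus<double>());
        h.parallel_for(sycl::nd_range<1>(global, wg), red,
                       [=](sycl::nd_item<1> item, auto& s) {
                         size_t i = item.get_global_id(0);
                         if (i < N) s += x[i];  // guard the padded tail
                       });
      }).wait();

      std::printf("sum = %f (expected %zu)\n", *sum, N);
      sycl::free(x, q);
      sycl::free(sum, q);
      return 0;
    }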
diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp
index 3712f5ffa..4aebb5b0f 100644
--- a/src/algorithm/REDUCE_SUM.cpp
+++ b/src/algorithm/REDUCE_SUM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) +
-                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(getActualProblemSize());
 
   setUsesFeature(Forall);
@@ -51,6 +52,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 REDUCE_SUM::~REDUCE_SUM()
diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp
index ba9e9308b..c9f1a3c74 100644
--- a/src/algorithm/REDUCE_SUM.hpp
+++ b/src/algorithm/REDUCE_SUM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -58,19 +58,37 @@ class REDUCE_SUM : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
+  void setSeqTuningDefinitions(VariantID vid);
+  void setOpenMPTuningDefinitions(VariantID vid);
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setOpenMPTargetTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
+  void runCudaVariantCub(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runCudaVariantBase(VariantID vid);
+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runCudaVariantRAJA(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runCudaVariantRAJANewReduce(VariantID vid);
+
+  void runHipVariantRocprim(VariantID vid);
-  template < size_t block_size >
-  void runCudaVariantBlock(VariantID vid);
-  template < size_t block_size >
-  void runHipVariantBlock(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runHipVariantBase(VariantID vid);
+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runHipVariantRAJA(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runHipVariantRAJANewReduce(VariantID vid);
+
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Real_type m_sum_init;
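The REDUCE_SUM hunk above splits the old combined setBytesPerRep estimate into separate read/write/atomic figures. A small standalone sketch of the arithmetic, assuming a hypothetical problem size of one million doubles:

#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t N = 1000000;
  const std::size_t bytes_read    = sizeof(double) * (1 + N); // m_sum_init + x[i]
  const std::size_t bytes_written = sizeof(double) * 1;       // the single m_sum result
  std::printf("read: %zu B, written: %zu B\n", bytes_read, bytes_written);
  return 0;
}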
diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp
index 674e25f5a..977c91e24 100644
--- a/src/algorithm/SCAN-Cuda.cpp
+++ b/src/algorithm/SCAN-Cuda.cpp
@@ -16,6 +16,7 @@
 #include "cub/util_allocator.cuh"
 
 #include "common/CudaDataUtils.hpp"
+#include "common/CudaGridScan.hpp"
 
 #include <iostream>
 
@@ -24,8 +25,52 @@ namespace rajaperf
 namespace algorithm
 {
 
+template < size_t block_size >
+using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type<
+    detail::cuda::grid_scan_max_items_per_thread<Real_type, block_size>::value+1,
+    integer::LessEqual<detail::cuda::grid_scan_default_items_per_thread<Real_type, block_size>::value>>;
 
-void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+
+template < size_t block_size, size_t items_per_thread >
+__launch_bounds__(block_size)
+__global__ void scan(Real_ptr x,
+                     Real_ptr y,
+                     Real_ptr block_counts,
+                     Real_ptr grid_counts,
+                     unsigned* block_readys,
+                     Index_type iend)
+{
+  // blocks do start running in order in cuda, so a block with a higher
+  // index can wait on a block with a lower index without deadlocking
+  // (replace with an atomicInc if this changes)
+  const int block_id = blockIdx.x;
+
+  Real_type vals[items_per_thread];
+
+  for (size_t ti = 0; ti < items_per_thread; ++ti) {
+    Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x;
+    if (i < iend) {
+      vals[ti] = x[i];
+    } else {
+      vals[ti] = 0;
+    }
+  }
+
+  Real_type exclusives[items_per_thread];
+  Real_type inclusives[items_per_thread];
+  detail::cuda::GridScan<Real_type, block_size, items_per_thread>::grid_scan(
+      block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys);
+
+  for (size_t ti = 0; ti < items_per_thread; ++ti) {
+    Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x;
+    if (i < iend) {
+      y[i] = exclusives[ti];
+    }
+  }
+}
+
+
+void SCAN::runCudaVariantLibrary(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -85,16 +130,175 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::exclusive_scan< RAJA::cuda_exec<default_gpu_block_size, true /*async*/> >(res, RAJA_SCAN_ARGS);
+      RAJA::exclusive_scan< RAJA::cuda_exec<0, true /*async*/> >(res, RAJA_SCAN_ARGS);
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  SCAN : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+template < size_t block_size, size_t items_per_thread >
+void SCAN::runCudaVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  SCAN_DATA_SETUP;
+
+  if ( vid == Base_CUDA ) {
+
+    const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread);
+    const size_t shmem_size = 0;
+
+    Real_ptr block_counts;
+    allocData(DataSpace::CudaDevice, block_counts, grid_size);
+    Real_ptr grid_counts;
+    allocData(DataSpace::CudaDevice, grid_counts, grid_size);
+    unsigned* block_readys;
+    allocData(DataSpace::CudaDevice, block_readys, grid_size);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size,
+                                  res.get_stream()) );
+
+      RPlaunchCudaKernel( (scan<block_size, items_per_thread>),
+                          grid_size, block_size,
+                          shmem_size, res.get_stream(),
+                          x+ibegin, y+ibegin,
+                          block_counts, grid_counts, block_readys,
+                          iend-ibegin );
 
     }
     stopTimer();
 
+    deallocData(DataSpace::CudaDevice, block_counts);
+    deallocData(DataSpace::CudaDevice, grid_counts);
+    deallocData(DataSpace::CudaDevice, block_readys);
+
   } else {
     getCout() << "\n  SCAN : Unknown Cuda variant id = " << vid << std::endl;
   }
 }
+
+
+void SCAN::runCudaVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+
+    if (tune_idx == t) {
+
+      runCudaVariantLibrary(vid);
+
+    }
+
+    t += 1;
+
+    if ( vid == Base_CUDA ) {
+
+      seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+        if (run_params.numValidGPUBlockSize() == 0u ||
+            run_params.validGPUBlockSize(block_size)) {
+
+          using cuda_items_per_thread =
+              cuda_items_per_thread_type<block_size>;
+
+          if (camp::size<cuda_items_per_thread>::value == 0) {
+
+            if (tune_idx == t) {
+
+              runCudaVariantImpl<block_size, detail::cuda::grid_scan_default_items_per_thread<Real_type, block_size>::value
+                                >(vid);
+
+            }
+
+            t += 1;
+
+          }
+
+          seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) {
+
+            if (run_params.numValidItemsPerThread() == 0u ||
+                run_params.validItemsPerThread(block_size)) {
+
+              if (tune_idx == t) {
+
+                runCudaVariantImpl<block_size, items_per_thread>(vid);
+
+              }
+
+              t += 1;
+
+            }
+
+          });
+
+        }
+
+      });
+
+    }
+
+  } else {
+
+    getCout() << "\n  SCAN : Unknown Cuda variant id = " << vid << std::endl;
+
+  }
+}
+
+void SCAN::setCudaTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+
+    addVariantTuningName(vid, "cub");
+
+    if ( vid == Base_CUDA ) {
+
+      seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+        if (run_params.numValidGPUBlockSize() == 0u ||
+            run_params.validGPUBlockSize(block_size)) {
+
+          using cuda_items_per_thread = cuda_items_per_thread_type<block_size>;
+
+          if (camp::size<cuda_items_per_thread>::value == 0) {
+
+            addVariantTuningName(vid, "block_"+std::to_string(block_size));
+
+          }
+
+          seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) {
+
+            if (run_params.numValidItemsPerThread() == 0u ||
+                run_params.validItemsPerThread(block_size)) {
+
+              addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_"
+                                        "block_"+std::to_string(block_size));
+
+            }
+
+          });
+
+        }
+
+      });
+
+    }
+
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf
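The CUDA scan kernel above assigns each thread items_per_thread elements spaced block_size apart, so consecutive threads touch consecutive addresses (coalesced), unlike a contiguous per-thread chunk. A standalone sketch of that index mapping with tiny illustrative sizes:

#include <cstdio>

int main()
{
  const int block_size = 4, items_per_thread = 2, block_id = 1;
  for (int tid = 0; tid < block_size; ++tid) {      // stands in for threadIdx.x
    for (int ti = 0; ti < items_per_thread; ++ti) {
      int i = block_id * block_size * items_per_thread + ti * block_size + tid;
      std::printf("thread %d, item %d -> i = %d\n", tid, ti, i);
    }
  }
  return 0;  // thread 0 -> 8,12; thread 1 -> 9,13; thread 2 -> 10,14; ...
}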
diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp
index 6e7135188..22f0bea57 100644
--- a/src/algorithm/SCAN-Hip.cpp
+++ b/src/algorithm/SCAN-Hip.cpp
@@ -21,6 +21,7 @@
 #endif
 
 #include "common/HipDataUtils.hpp"
+#include "common/HipGridScan.hpp"
 
 #include <iostream>
 
@@ -29,8 +30,52 @@ namespace rajaperf
namespace algorithm
 {
 
+template < size_t block_size >
+using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type<
+    detail::hip::grid_scan_max_items_per_thread<Real_type, block_size>::value+1,
+    integer::LessEqual<detail::hip::grid_scan_default_items_per_thread<Real_type, block_size>::value>>;
 
-void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+
+template < size_t block_size, size_t items_per_thread >
+__launch_bounds__(block_size)
+__global__ void scan(Real_ptr x,
+                     Real_ptr y,
+                     Real_ptr block_counts,
+                     Real_ptr grid_counts,
+                     unsigned* block_readys,
+                     Index_type iend)
+{
+  // It looks like blocks do not start running in order in hip, so a block
+  // with a higher index can't wait on a block with a lower index without
+  // deadlocking (have to replace with an atomicInc)
+  const int block_id = blockIdx.x;
+
+  Real_type vals[items_per_thread];
+
+  for (size_t ti = 0; ti < items_per_thread; ++ti) {
+    Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x;
+    if (i < iend) {
+      vals[ti] = x[i];
+    } else {
+      vals[ti] = 0;
+    }
+  }
+
+  Real_type exclusives[items_per_thread];
+  Real_type inclusives[items_per_thread];
+  detail::hip::GridScan<Real_type, block_size, items_per_thread>::grid_scan(
+      block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys);
+
+  for (size_t ti = 0; ti < items_per_thread; ++ti) {
+    Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x;
+    if (i < iend) {
+      y[i] = exclusives[ti];
+    }
+  }
+}
+
+
+void SCAN::runHipVariantLibrary(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -112,7 +157,7 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::exclusive_scan< RAJA::hip_exec<default_gpu_block_size, true /*async*/> >(res, RAJA_SCAN_ARGS);
+      RAJA::exclusive_scan< RAJA::hip_exec<0, true /*async*/> >(res, RAJA_SCAN_ARGS);
 
     }
     stopTimer();
@@ -122,6 +167,164 @@
   }
 }
 
+template < size_t block_size, size_t items_per_thread >
+void SCAN::runHipVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getHipResource()};
+
+  SCAN_DATA_SETUP;
+
+  if ( vid == Base_HIP ) {
+
+    const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread);
+    const size_t shmem_size = 0;
+
+    Real_ptr block_counts;
+    allocData(DataSpace::HipDevice, block_counts, grid_size);
+    Real_ptr grid_counts;
+    allocData(DataSpace::HipDevice, grid_counts, grid_size);
+    unsigned* block_readys;
+    allocData(DataSpace::HipDevice, block_readys, grid_size);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size,
+                                res.get_stream()) );
+
+      RPlaunchHipKernel( (scan<block_size, items_per_thread>),
+                         grid_size, block_size,
+                         shmem_size, res.get_stream(),
+                         x+ibegin, y+ibegin,
+                         block_counts, grid_counts, block_readys,
+                         iend-ibegin );
+
+    }
+    stopTimer();
+
+    deallocData(DataSpace::HipDevice, block_counts);
+    deallocData(DataSpace::HipDevice, grid_counts);
+    deallocData(DataSpace::HipDevice, block_readys);
+
+  } else {
+    getCout() << "\n  SCAN : Unknown Hip variant id = " << vid << std::endl;
+  }
+}
+
+
+void SCAN::runHipVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    if (tune_idx == t) {
+
+      runHipVariantLibrary(vid);
+
+    }
+
+    t += 1;
+
+    if ( vid == Base_HIP ) {
+
+      seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+        if (run_params.numValidGPUBlockSize() == 0u ||
+            run_params.validGPUBlockSize(block_size)) {
+
+          using hip_items_per_thread = hip_items_per_thread_type<block_size>;
+
+          if (camp::size<hip_items_per_thread>::value == 0) {
+
+            if (tune_idx == t) {
+
+              runHipVariantImpl<block_size, detail::hip::grid_scan_default_items_per_thread<Real_type, block_size>::value
+                               >(vid);
+
+            }
+
+            t += 1;
+
+          }
+
+          seq_for(hip_items_per_thread{}, [&](auto items_per_thread) {
+
+            if (run_params.numValidItemsPerThread() == 0u ||
+                run_params.validItemsPerThread(block_size)) {
+
+              if (tune_idx == t) {
+
+                runHipVariantImpl<block_size, items_per_thread>(vid);
+
+              }
+
+              t += 1;
+
+            }
+
+          });
+
+        }
+
+      });
+    }
+
+  } else {
+
+    getCout() << "\n  SCAN : Unknown Hip variant id = " << vid << std::endl;
+
+  }
+}
+
+void SCAN::setHipTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    addVariantTuningName(vid, "rocprim");
+
+    if ( vid == Base_HIP ) {
+
+      seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+        if (run_params.numValidGPUBlockSize() == 0u ||
+            run_params.validGPUBlockSize(block_size)) {
+
+          using hip_items_per_thread = hip_items_per_thread_type<block_size>;
+
+          if (camp::size<hip_items_per_thread>::value == 0) {
+
+            addVariantTuningName(vid, "block_"+std::to_string(block_size));
+
+          }
+
+          seq_for(hip_items_per_thread{}, [&](auto items_per_thread) {
+
+            if (run_params.numValidItemsPerThread() == 0u ||
+                run_params.validItemsPerThread(block_size)) {
+
+              addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_"
+                                        "block_"+std::to_string(block_size));
+
+            }
+
+          });
+
+        }
+
+      });
+
+    }
+
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf
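runHipVariant and setHipTuningDefinitions above (and their CUDA counterparts) must enumerate candidates in exactly the same order so that tune_idx lines up with the registered tuning names. A standalone sketch of that convention; the candidate sizes are illustrative, not the suite's actual lists:

#include <cstdio>
#include <string>
#include <vector>

int main()
{
  std::vector<std::string> names;
  names.push_back("rocprim");            // tune_idx 0: library scan
  for (int block_size : {256, 512}) {    // mirrors the seq_for over block sizes
    for (int ipt : {2, 4}) {             // mirrors the seq_for over items per thread
      names.push_back("itemsPerThread<" + std::to_string(ipt) + ">_block_" +
                      std::to_string(block_size));
    }
  }
  // the runner walks the same loops, bumping t, and fires when t == tune_idx
  for (size_t t = 0; t < names.size(); ++t) {
    std::printf("tune_idx %zu -> %s\n", t, names[t].c_str());
  }
  return 0;
}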
diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp
index 30cb534df..a5c04abc4 100644
--- a/src/algorithm/SCAN.cpp
+++ b/src/algorithm/SCAN.cpp
@@ -28,7 +28,9 @@ SCAN::SCAN(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
   checksum_scale_factor = 1e-2 *
diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp
index 519789a55..f55381d21 100644
--- a/src/algorithm/SCAN.hpp
+++ b/src/algorithm/SCAN.hpp
@@ -10,9 +10,10 @@
 /// SCAN kernel reference implementation:
 ///
 /// // exclusive scan
-/// y[ibegin] = 0;
-/// for (Index_type i = ibegin+1; i < iend; ++i) {
-///   y[i] = y[i-1] + x[i-1];
+/// Real_type scan_var = 0;
+/// for (Index_type i = ibegin; i < iend; ++i) {
+///   y[i] = scan_var;
+///   scan_var += x[i];
 /// }
 ///
@@ -62,8 +63,18 @@ class SCAN : public KernelBase
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
 
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
+
+  void runCudaVariantLibrary(VariantID vid);
+  void runHipVariantLibrary(VariantID vid);
+
+  template < size_t block_size, size_t items_per_thread >
+  void runCudaVariantImpl(VariantID vid);
+  template < size_t block_size, size_t items_per_thread >
+  void runHipVariantImpl(VariantID vid);
+
 private:
-  static const size_t default_gpu_block_size = 0;
+  static const size_t default_gpu_block_size = 256;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Real_ptr m_y;
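The SCAN.hpp reference rewrite above replaces the y[i-1] recurrence with a running variable; both forms produce the same exclusive scan, but the new one reads only x and never re-reads y. A standalone sketch showing the new form on a tiny array:

#include <cstdio>

int main()
{
  const int n = 5;
  double x[n] = {1, 2, 3, 4, 5};
  double y[n];

  double scan_var = 0;
  for (int i = 0; i < n; ++i) {
    y[i] = scan_var;   // exclusive: y[i] excludes x[i] itself
    scan_var += x[i];
  }

  for (int i = 0; i < n; ++i) { std::printf("%g ", y[i]); }  // 0 1 3 6 10
  std::printf("\n");
  return 0;
}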
diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp
index 45cd40d63..4d77667d7 100644
--- a/src/algorithm/SORT-Cuda.cpp
+++ b/src/algorithm/SORT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp
index d87445413..c464bae4e 100644
--- a/src/algorithm/SORT-Hip.cpp
+++ b/src/algorithm/SORT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp
index 05b885d50..133b00a88 100644
--- a/src/algorithm/SORT-OMP.cpp
+++ b/src/algorithm/SORT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index c5e1503af..2d458ff4d 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index b7738f264..bc99df634 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ SORT::SORT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); // touched data size, not actual number of stores and loads + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Sort); diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index b51bf12f9..9df61e411 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index 57176e3db..1f102eb91 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index aece079d4..467a3cbf4 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index 39705af9a..cdf0f044a 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index 91c094ce9..320e307f4 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index a07f1e79b..6315970b2 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ SORTPAIRS::SORTPAIRS(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); // touched data size, not actual number of stores and loads + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Sort); diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index 4cfc3eb36..fa53a15c3 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index bade73b59..facb3d592 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -16,6 +16,40 @@ namespace rajaperf namespace apps { + +std::ostream& operator<<(std::ostream& stream, const ADomain& domain) +{ + return stream + + << "ADomain" + + << " ndims " << domain.ndims + << " NPNL " << domain.NPNL + << " NPNR " << domain.NPNR + + << " imin " << domain.imin + << " jmin " << domain.jmin + << " kmin " << domain.kmin + << " imax " << domain.imax + << " jmax " << domain.jmax + << " kmax " << domain.kmax + + << " jp " << domain.jp + << " kp " << domain.kp + << " nnalls " << domain.nnalls + + << " fpn " << domain.fpn + << " lpn " << domain.lpn + << " frn " << domain.frn + << " lrn " << domain.lrn + + << " fpz " << domain.fpz + << " lpz " << domain.lpz + + << " n_real_zones " << domain.n_real_zones + << " n_real_nodes " << domain.n_real_nodes ; +} + // // Set zone indices for 2d mesh. // @@ -38,10 +72,10 @@ void setRealZones_2d(Index_type* real_zones, for (Index_type j = jmin; j < jmax; j++) { for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp ; + Index_type iz = i + j*jp ; - Index_type id = (i-imin) + (j-jmin)*j_stride ; - real_zones[id] = ip; + Index_type il = (i-imin) + (j-jmin)*j_stride ; + real_zones[il] = iz; } } } @@ -73,10 +107,10 @@ void setRealZones_3d(Index_type* real_zones, for (Index_type k = kmin; k < kmax; k++) { for (Index_type j = jmin; j < jmax; j++) { for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp + k*kp ; + Index_type iz = i + j*jp + k*kp ; - Index_type id = (i-imin) + (j-jmin)*j_stride + (k-kmin)*k_stride ; - real_zones[id] = ip; + Index_type il = (i-imin) + (j-jmin)*j_stride + (k-kmin)*k_stride ; + real_zones[il] = iz; } } } @@ -104,20 +138,13 @@ void setMeshPositions_2d(Real_ptr x, Real_type dx, Index_type npnl = domain.NPNL; Index_type npnr = domain.NPNR; - Real_ptr x1, x2, x3, x4; - Real_ptr y1, y2, y3, y4; - NDSET2D(domain.jp, x, x1,x2,x3,x4) ; - NDSET2D(domain.jp, y, y1,y2,y3,y4) ; + for (Index_type j = jmin - npnl; j < jmax+1 + npnr; j++) { + for (Index_type i = imin - npnl; i < imax+1 + npnr; i++) { + Index_type in = i + j*jp ; - for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { - for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp ; - - x3[iz] = x4[iz] = i*dx; - x1[iz] = x2[iz] = (i+1)*dx; + x[in] = i*dx; - y1[iz] = y4[iz] = j*dy; - y2[iz] = y3[iz] = (j+1)*dy; + y[in] = j*dy; } } @@ -150,26 +177,16 @@ void setMeshPositions_3d(Real_ptr x, Real_type dx, Index_type npnl = domain.NPNL; Index_type npnr = domain.NPNR; - Real_ptr x0, x1, x2, x3, x4, x5, x6, x7; - Real_ptr y0, y1, y2, y3, y4, y5, y6, y7; - Real_ptr z0, z1, z2, z3, z4, z5, z6, z7; - NDPTRSET(domain.jp, domain.kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - NDPTRSET(domain.jp, domain.kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; - NDPTRSET(domain.jp, domain.kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - - for (Index_type k = kmin - npnl; k < kmax + npnr; k++) { - for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { - for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp + k*kp ; + for (Index_type k = kmin - npnl; k < kmax+1 + npnr; k++) { + for (Index_type j = jmin - npnl; j < jmax+1 + npnr; j++) { + for (Index_type i = imin - npnl; i < imax+1 + npnr; i++) { + Index_type in = i + j*jp + k*kp ; - x0[iz] = x2[iz] = x4[iz] = x6[iz] = i*dx; - x1[iz] = x3[iz] = x5[iz] = x7[iz] = (i+1)*dx; + x[in] = i*dx; - y0[iz] = y1[iz] = y4[iz] = y5[iz] = j*dy; - y2[iz] = y3[iz] = y6[iz] = y7[iz] = (j+1)*dy; + y[in] = j*dy; - z0[iz] = z1[iz] = z2[iz] = z3[iz] = k*dz; - z4[iz] = z5[iz] = z6[iz] = z7[iz] = (k+1)*dz; + z[in] 
= k*dz;
 
       }
     }
diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp
index a4b566c6b..b1908b7a5 100644
--- a/src/apps/AppsData.hpp
+++ b/src/apps/AppsData.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -9,6 +9,8 @@
 #ifndef RAJAPerf_AppsData_HPP
 #define RAJAPerf_AppsData_HPP
 
+#include <iostream>
+
 #include "common/RPTypes.hpp"
 
 namespace rajaperf
@@ -47,40 +49,57 @@ class ADomain
 
   ADomain() = delete;
 
-  ADomain( Index_type rzmax, Index_type ndims )
+  ADomain( Index_type real_nodes_per_dim, Index_type ndims )
     : ndims(ndims), NPNL(2), NPNR(1)
   {
-    imin = NPNL;
-    jmin = NPNL;
-    imax = rzmax + NPNR;
-    jmax = rzmax + NPNR;
-    jp = imax - imin + 1 + NPNL + NPNR;
-    n_real_zones = (imax - imin);
-    n_real_nodes = (imax+1 - imin);
-
-    if ( ndims == 2 ) {
-      kmin = 0;
-      kmax = 0;
-      kp = 0;
-      nnalls = jp * (jmax - jmin + 1 + NPNL + NPNR) ;
+    int NPZL = NPNL - 1;
+    int NPZR = NPNR+1 - 1;
+
+    if ( ndims >= 1 ) {
+      imin = NPNL;
+      imax = NPNL + real_nodes_per_dim-1;
+      nnalls = (imax+1 - imin + NPNL + NPNR);
+      n_real_zones = (imax - imin);
+      n_real_nodes = (imax+1 - imin);
+    } else {
+      imin = 0;
+      imax = 0;
+      nnalls = 0;
+    }
+
+    if ( ndims >= 2 ) {
+      jmin = NPNL;
+      jmax = NPNL + real_nodes_per_dim-1;
+      jp = nnalls;
+      nnalls *= (jmax+1 - jmin + NPNL + NPNR);
       n_real_zones *= (jmax - jmin);
       n_real_nodes *= (jmax+1 - jmin);
-    } else if ( ndims == 3 ) {
+    } else {
+      jmin = 0;
+      jmax = 0;
+      jp = 0;
+    }
+
+    if ( ndims >= 3 ) {
       kmin = NPNL;
-      kmax = rzmax + NPNR;
-      kp = jp * (jmax - jmin + 1 + NPNL + NPNR);
-      nnalls = kp * (kmax - kmin + 1 + NPNL + NPNR) ;
-      n_real_zones *= (jmax - jmin) * (kmax - kmin);
-      n_real_nodes *= (jmax+1 - jmin) * (kmax+1 - kmin);
+      kmax = NPNL + real_nodes_per_dim-1;
+      kp = nnalls;
+      nnalls *= (kmax+1 - kmin + NPNL + NPNR);
+      n_real_zones *= (kmax - kmin);
+      n_real_nodes *= (kmax+1 - kmin);
+    } else {
+      kmin = 0;
+      kmax = 0;
+      kp = 0;
     }
 
-    fpn = 0;
-    lpn = nnalls - 1;
-    frn = fpn + NPNL * (kp + jp) + NPNL;
-    lrn = lpn - NPNR * (kp + jp) - NPNR;
+    frn = kmin*kp + jmin*jp + imin;
+    lrn = kmax*kp + jmax*jp + imax;
+    fpn = (kmin - NPNL)*kp + (jmin - NPNL)*jp + (imin - NPNL);
+    lpn = (kmax + NPNR)*kp + (jmax + NPNR)*jp + (imax + NPNR);
 
-    fpz = frn - jp - kp - 1;
-    lpz = lrn;
+    fpz = (kmin - NPZL)*kp + (jmin - NPZL)*jp + (imin - NPZL);
+    lpz = (kmax-1 + NPZR)*kp + (jmax-1 + NPZR)*jp + (imax-1 + NPZR);
   }
 
   ~ADomain()
@@ -114,6 +133,8 @@ class ADomain
   Index_type n_real_nodes;
 };
 
+std::ostream& operator<<(std::ostream& stream, const ADomain& domain);
+
 //
 // Routines for initializing real zone indices for 2d/3d domains.
 //
diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt
index dc3485354..4a0584e96 100644
--- a/src/apps/CMakeLists.txt
+++ b/src/apps/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
# @@ -15,95 +15,103 @@ blt_add_library( CONVECTION3DPA-Seq.cpp CONVECTION3DPA-OMP.cpp CONVECTION3DPA-OMPTarget.cpp - DEL_DOT_VEC_2D.cpp - DEL_DOT_VEC_2D-Seq.cpp - DEL_DOT_VEC_2D-Hip.cpp - DEL_DOT_VEC_2D-Cuda.cpp - DEL_DOT_VEC_2D-OMP.cpp - DEL_DOT_VEC_2D-OMPTarget.cpp + CONVECTION3DPA-Sycl.cpp + DEL_DOT_VEC_2D.cpp + DEL_DOT_VEC_2D-Seq.cpp + DEL_DOT_VEC_2D-Hip.cpp + DEL_DOT_VEC_2D-Cuda.cpp + DEL_DOT_VEC_2D-OMP.cpp + DEL_DOT_VEC_2D-OMPTarget.cpp + DEL_DOT_VEC_2D-Sycl.cpp DIFFUSION3DPA.cpp DIFFUSION3DPA-Cuda.cpp DIFFUSION3DPA-Hip.cpp DIFFUSION3DPA-Seq.cpp DIFFUSION3DPA-OMP.cpp DIFFUSION3DPA-OMPTarget.cpp + DIFFUSION3DPA-Sycl.cpp EDGE3D.cpp EDGE3D-Cuda.cpp EDGE3D-Hip.cpp EDGE3D-Seq.cpp EDGE3D-OMP.cpp EDGE3D-OMPTarget.cpp + EDGE3D-Sycl.cpp ENERGY.cpp ENERGY-Seq.cpp - ENERGY-Hip.cpp - ENERGY-Cuda.cpp - ENERGY-OMP.cpp - ENERGY-OMPTarget.cpp + ENERGY-Hip.cpp + ENERGY-Cuda.cpp + ENERGY-OMP.cpp + ENERGY-OMPTarget.cpp + ENERGY-Sycl.cpp FIR.cpp FIR-Seq.cpp FIR-Hip.cpp FIR-Cuda.cpp FIR-OMP.cpp FIR-OMPTarget.cpp - HALOEXCHANGE.cpp - HALOEXCHANGE-Seq.cpp - HALOEXCHANGE-Hip.cpp - HALOEXCHANGE-Cuda.cpp - HALOEXCHANGE-OMP.cpp - HALOEXCHANGE-OMPTarget.cpp - HALOEXCHANGE_FUSED.cpp - HALOEXCHANGE_FUSED-Seq.cpp - HALOEXCHANGE_FUSED-Hip.cpp - HALOEXCHANGE_FUSED-Cuda.cpp - HALOEXCHANGE_FUSED-OMP.cpp - HALOEXCHANGE_FUSED-OMPTarget.cpp + FIR-Sycl.cpp LTIMES.cpp LTIMES-Seq.cpp LTIMES-Hip.cpp LTIMES-Cuda.cpp LTIMES-OMP.cpp LTIMES-OMPTarget.cpp + LTIMES-Sycl.cpp LTIMES_NOVIEW.cpp LTIMES_NOVIEW-Seq.cpp LTIMES_NOVIEW-Hip.cpp LTIMES_NOVIEW-Cuda.cpp LTIMES_NOVIEW-OMP.cpp LTIMES_NOVIEW-OMPTarget.cpp + LTIMES_NOVIEW-Sycl.cpp MASS3DEA.cpp MASS3DEA-Cuda.cpp MASS3DEA-Hip.cpp MASS3DEA-Seq.cpp MASS3DEA-OMP.cpp - MASS3DEA-OMPTarget.cpp + MASS3DEA-OMPTarget.cpp + MASS3DEA-Sycl.cpp MASS3DPA.cpp MASS3DPA-Cuda.cpp MASS3DPA-Hip.cpp MASS3DPA-Seq.cpp MASS3DPA-OMP.cpp MASS3DPA-OMPTarget.cpp + MASS3DPA-Sycl.cpp + MATVEC_3D_STENCIL.cpp + MATVEC_3D_STENCIL-Seq.cpp + MATVEC_3D_STENCIL-Hip.cpp + MATVEC_3D_STENCIL-Cuda.cpp + MATVEC_3D_STENCIL-OMP.cpp + MATVEC_3D_STENCIL-OMPTarget.cpp + MATVEC_3D_STENCIL-Sycl.cpp NODAL_ACCUMULATION_3D.cpp NODAL_ACCUMULATION_3D-Seq.cpp NODAL_ACCUMULATION_3D-Hip.cpp NODAL_ACCUMULATION_3D-Cuda.cpp NODAL_ACCUMULATION_3D-OMP.cpp NODAL_ACCUMULATION_3D-OMPTarget.cpp - PRESSURE.cpp - PRESSURE-Seq.cpp - PRESSURE-Hip.cpp - PRESSURE-Cuda.cpp - PRESSURE-OMP.cpp - PRESSURE-OMPTarget.cpp + PRESSURE.cpp + PRESSURE-Seq.cpp + PRESSURE-Hip.cpp + PRESSURE-Cuda.cpp + PRESSURE-OMP.cpp + PRESSURE-OMPTarget.cpp + PRESSURE-Sycl.cpp VOL3D.cpp VOL3D-Seq.cpp - VOL3D-Hip.cpp - VOL3D-Cuda.cpp - VOL3D-OMP.cpp - VOL3D-OMPTarget.cpp + VOL3D-Hip.cpp + VOL3D-Cuda.cpp + VOL3D-OMP.cpp + VOL3D-OMPTarget.cpp + VOL3D-Sycl.cpp ZONAL_ACCUMULATION_3D.cpp ZONAL_ACCUMULATION_3D-Seq.cpp ZONAL_ACCUMULATION_3D-Hip.cpp ZONAL_ACCUMULATION_3D-Cuda.cpp ZONAL_ACCUMULATION_3D-OMP.cpp ZONAL_ACCUMULATION_3D-OMPTarget.cpp + ZONAL_ACCUMULATION_3D-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/apps/CONVECTION3DPA-Cuda.cpp b/src/apps/CONVECTION3DPA-Cuda.cpp index 5b5e5f1f4..6160430c0 100644 --- a/src/apps/CONVECTION3DPA-Cuda.cpp +++ b/src/apps/CONVECTION3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
@@ -138,16 +138,16 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {
 
   case Base_CUDA: {
 
-    dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
-
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
       constexpr size_t shmem = 0;
 
-      Convection3DPA<CPA_D1D, CPA_Q1D><<<NE, nthreads_per_block, shmem, res.get_stream()>>>
-        (Basis, tBasis, dBasis, D, X, Y);
-      cudaErrchk(cudaGetLastError());
+      RPlaunchCudaKernel( (Convection3DPA<CPA_D1D, CPA_Q1D>),
+                          NE, nthreads_per_block,
+                          shmem, res.get_stream(),
+                          Basis, tBasis, dBasis, D, X, Y );
 
     }
     stopTimer();
diff --git a/src/apps/CONVECTION3DPA-Hip.cpp b/src/apps/CONVECTION3DPA-Hip.cpp
index ed0eef3e4..12300f940 100644
--- a/src/apps/CONVECTION3DPA-Hip.cpp
+++ b/src/apps/CONVECTION3DPA-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -138,18 +138,16 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) {
 
   case Base_HIP: {
 
-    dim3 nblocks(NE);
-    dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
-
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((Convection3DPA<CPA_D1D, CPA_Q1D>),
-                         dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         Basis, tBasis, dBasis, D, X, Y);
-
-      hipErrchk(hipGetLastError());
+
+      RPlaunchHipKernel( (Convection3DPA<CPA_D1D, CPA_Q1D>),
+                         NE, nthreads_per_block,
+                         shmem, res.get_stream(),
+                         Basis, tBasis, dBasis, D, X, Y );
 
     }
     stopTimer();
diff --git a/src/apps/CONVECTION3DPA-OMP.cpp b/src/apps/CONVECTION3DPA-OMP.cpp
index b414122cb..2826defd0 100644
--- a/src/apps/CONVECTION3DPA-OMP.cpp
+++ b/src/apps/CONVECTION3DPA-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/CONVECTION3DPA-OMPTarget.cpp b/src/apps/CONVECTION3DPA-OMPTarget.cpp
index e0317c930..6affba0c6 100644
--- a/src/apps/CONVECTION3DPA-OMPTarget.cpp
+++ b/src/apps/CONVECTION3DPA-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/CONVECTION3DPA-Seq.cpp b/src/apps/CONVECTION3DPA-Seq.cpp
index a62a93409..9f18a2da8 100644
--- a/src/apps/CONVECTION3DPA-Seq.cpp
+++ b/src/apps/CONVECTION3DPA-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
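The hunks above swap raw <<<...>>> and hipLaunchKernelGGL launches for the suite's RPlaunch* helpers. A hypothetical sketch of such a wrapper (not RAJAPerf's actual implementation), to show what it centralizes: launch configuration, stream plumbing, and post-launch error checking:

#include <cuda_runtime.h>
#include <cstdio>

template < typename Kernel, typename... Args >
void launchCudaKernel(Kernel kernel, dim3 grid, dim3 block,
                      size_t shmem, cudaStream_t stream, Args... args)
{
  kernel<<<grid, block, shmem, stream>>>(args...);
  cudaError_t err = cudaGetLastError();  // catches launch-configuration errors
  if (err != cudaSuccess) {
    std::fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
  }
}

__global__ void axpy(double a, const double* x, double* y, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { y[i] += a * x[i]; }
}

// usage: launchCudaKernel(axpy, dim3(grid), dim3(256), 0, stream, a, x, y, n);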
diff --git a/src/apps/CONVECTION3DPA-Sycl.cpp b/src/apps/CONVECTION3DPA-Sycl.cpp
new file mode 100644
index 000000000..c01087818
--- /dev/null
+++ b/src/apps/CONVECTION3DPA-Sycl.cpp
@@ -0,0 +1,421 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "CONVECTION3DPA.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf {
+namespace apps {
+
+template < size_t work_group_size >
+void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) {
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  CONVECTION3DPA_DATA_SETUP;
+
+  const ::sycl::range<3> workGroupSize(CPA_Q1D, CPA_Q1D, CPA_Q1D);
+  const ::sycl::range<3> gridSize(CPA_Q1D,CPA_Q1D,CPA_Q1D*NE);
+
+  constexpr size_t shmem = 0;
+
+  switch (vid) {
+
+  case Base_SYCL: {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      qu->submit([&](cl::sycl::handler& h) {
+
+        constexpr int max_D1D = CPA_D1D;
+        constexpr int max_Q1D = CPA_Q1D;
+        constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
+
+        auto sm0_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm1_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm2_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm3_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm4_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+        auto sm5_vec = ::sycl::local_accessor<double, 1>(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h);
+
+        h.parallel_for
+          (cl::sycl::nd_range<3>(gridSize, workGroupSize),
+           [=] (cl::sycl::nd_item<3> itm) {
+
+             const Index_type e = itm.get_group(2);
+
+             double *sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm1 = sm1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm2 = sm2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm3 = sm3_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm4 = sm4_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+             double *sm5 = sm5_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+
+             double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0;
+             double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1;
+             double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2;
+             double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3;
+             double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4;
+             double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5;
+             double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0;
+             double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1;
+             double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2;
+             double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3;
+             double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4;
+             double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5;
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+               {
+                 SYCL_FOREACH_THREAD(dx,2,CPA_D1D)
+                 {
+                   CONVECTION3DPA_1;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+               {
+                 SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+                 {
+                   CONVECTION3DPA_2;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+                 {
+                   CONVECTION3DPA_3;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+             {
+               SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(qz,0,CPA_Q1D)
+                 {
+                   CONVECTION3DPA_4;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(qz,0,CPA_Q1D)
+             {
+               SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+                 {
+                   CONVECTION3DPA_5;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+             {
+               SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+                 {
+                   CONVECTION3DPA_6;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(qx,2,CPA_Q1D)
+               {
+                 SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+                 {
+                   CONVECTION3DPA_7;
+                 }
+               }
+             }
+             itm.barrier(::sycl::access::fence_space::local_space);
+
+             SYCL_FOREACH_THREAD(dz,0,CPA_D1D)
+             {
+               SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+               {
+                 SYCL_FOREACH_THREAD(dx,2,CPA_D1D)
+                 {
+                   CONVECTION3DPA_8;
+                 }
+               }
+             }
+           });
+
+      });
+
+
+    }
+    stopTimer();
+
+    break;
+  }
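The Base_SYCL case above stages element data through sycl::local_accessor work-group memory, with barriers separating the tensor-contraction phases. A minimal standalone sketch of that pattern, a trivial in-group reversal; sizes and names are illustrative:

#include <sycl/sycl.hpp>
#include <iostream>

int main()
{
  sycl::queue q;
  const size_t n = 256, wg = 64;
  int* data = sycl::malloc_shared<int>(n, q);
  for (size_t i = 0; i < n; ++i) { data[i] = static_cast<int>(i); }

  q.submit([&](sycl::handler& h) {
    sycl::local_accessor<int, 1> tile(sycl::range<1>(wg), h);  // work-group scratch
    h.parallel_for(sycl::nd_range<1>(n, wg), [=](sycl::nd_item<1> itm) {
      size_t l = itm.get_local_id(0);
      size_t g = itm.get_global_id(0);
      tile[l] = data[g];
      itm.barrier(sycl::access::fence_space::local_space);
      data[g] = tile[wg - 1 - l];  // safe to read a neighbor's slot after the barrier
    });
  }).wait();

  std::cout << data[0] << " " << data[63] << std::endl;  // 63 0
  sycl::free(data, q);
  return 0;
}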
+
+  case RAJA_SYCL: {
+
+    constexpr bool async = true;
+
+    using launch_policy =
+      RAJA::LaunchPolicy<RAJA::sycl_launch_t<async>>;
+
+    using outer_x =
+      RAJA::LoopPolicy<RAJA::sycl_group_2_direct>;
+
+    using inner_x =
+      RAJA::LoopPolicy<RAJA::sycl_local_2_direct>;
+
+    using inner_y =
+      RAJA::LoopPolicy<RAJA::sycl_local_1_direct>;
+
+    using inner_z =
+      RAJA::LoopPolicy<RAJA::sycl_local_0_direct>;
+
+    //Calculate amount of shared memory needed
+    size_t shmem = 0;
+    {
+      constexpr int max_D1D = CPA_D1D;
+      constexpr int max_Q1D = CPA_Q1D;
+      constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
+
+      constexpr int no_mats = 6;
+      shmem += max_DQ*max_DQ*max_DQ * no_mats * sizeof(double);
+    }
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::launch<launch_policy>( res,
+        RAJA::LaunchParams(RAJA::Teams(NE),
+                           RAJA::Threads(CPA_Q1D, CPA_Q1D, CPA_Q1D), shmem),
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+          RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE),
+            [&](int e) {
+
+              //Redefine inside the lambda to keep consistent with base version
+              constexpr int max_D1D = CPA_D1D;
+              constexpr int max_Q1D = CPA_Q1D;
+              constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
+
+              double * sm0 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm1 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm2 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm3 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm4 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+              double * sm5 = ctx.getSharedMemory<double>(max_DQ*max_DQ*max_DQ);
+
+              double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0;
+              double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1;
+              double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2;
+              double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3;
+              double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4;
+              double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5;
+              double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0;
+              double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1;
+              double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2;
+              double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3;
+              double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4;
+              double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5;
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                    [&](int dy) {
+                      RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                        [&](int dx) {
+
+                          CONVECTION3DPA_1;
+
+                        } // lambda (dx)
+                      ); // RAJA::loop<inner_x>
+                    } // lambda (dy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                    [&](int dy) {
+                      RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                        [&](int qx) {
+
+                          CONVECTION3DPA_2;
+
+                        } // lambda (dx)
+                      ); // RAJA::loop<inner_x>
+                    } // lambda (dy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qx) {
+                      RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                        [&](int qy) {
+
+                          CONVECTION3DPA_3;
+
+                        } // lambda (dy)
+                      ); // RAJA::loop<inner_y>
+                    } // lambda (dx)
+                  ); //RAJA::loop<inner_x>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                [&](int qx) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qy) {
+                      RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                        [&](int qz) {
+
+                          CONVECTION3DPA_4;
+
+                        } // lambda (qz)
+                      ); // RAJA::loop<inner_z>
+                    } // lambda (qy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (qx)
+              ); //RAJA::loop<inner_x>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                [&](int qz) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qy) {
+                      RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                        [&](int qx) {
+
+                          CONVECTION3DPA_5;
+
+                        } // lambda (qx)
+                      ); // RAJA::loop<inner_x>
+                    } // lambda (qy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (qz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                [&](int qx) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qy) {
+                      RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                        [&](int dz) {
+
+                          CONVECTION3DPA_6;
+
+                        } // lambda (dz)
+                      ); // RAJA::loop<inner_z>
+                    } // lambda (qy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (qx)
+              ); //RAJA::loop<inner_x>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
+                    [&](int qx) {
+                      RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                        [&](int dy) {
+
+                          CONVECTION3DPA_7;
+
+                        } // lambda (dy)
+                      ); // RAJA::loop<inner_y>
+                    } // lambda (qx)
+                  ); //RAJA::loop<inner_x>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+              ctx.teamSync();
+
+              RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                [&](int dz) {
+                  RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                    [&](int dy) {
+                      RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_D1D),
+                        [&](int dx) {
+
+                          CONVECTION3DPA_8;
+
+                        } // lambda (dx)
+                      ); // RAJA::loop<inner_x>
+                    } // lambda (dy)
+                  ); //RAJA::loop<inner_y>
+                } // lambda (dz)
+              ); //RAJA::loop<inner_z>
+
+            } // lambda (e)
+          ); // RAJA::loop<outer_x>
+
+        } // outer lambda (ctx)
+      ); // RAJA::launch
+
+    } // loop over kernel reps
+    stopTimer();
+
+    break;
+  }
+
+  default: {
+
+    getCout() << "\n CONVECTION3DPA : Unknown Sycl variant id = " << vid
+              << std::endl;
+    break;
+  }
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(CONVECTION3DPA, Sycl)
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp
index 43ed5d539..8213c2c90 100644
--- a/src/apps/CONVECTION3DPA.cpp
+++ b/src/apps/CONVECTION3DPA.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,17 +28,18 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params)
   setDefaultProblemSize(m_NE_default*CPA_Q1D*CPA_Q1D*CPA_Q1D);
   setDefaultReps(50);
 
-  m_NE = std::max(getTargetProblemSize()/(CPA_Q1D*CPA_Q1D*CPA_Q1D), Index_type(1));
+  m_NE = std::max((getTargetProblemSize() + (CPA_Q1D*CPA_Q1D*CPA_Q1D)/2) / (CPA_Q1D*CPA_Q1D*CPA_Q1D), Index_type(1));
 
   setActualProblemSize( m_NE*CPA_Q1D*CPA_Q1D*CPA_Q1D );
 
   setItsPerRep(getActualProblemSize());
   setKernelsPerRep(1);
 
-  setBytesPerRep( 3*CPA_Q1D*CPA_D1D*sizeof(Real_type) +
-                  CPA_VDIM*CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE*sizeof(Real_type) +
-                  CPA_D1D*CPA_D1D*CPA_D1D*m_NE*sizeof(Real_type) +
-                  CPA_D1D*CPA_D1D*CPA_D1D*m_NE*sizeof(Real_type) );
+  setBytesReadPerRep( 3*sizeof(Real_type) * CPA_Q1D*CPA_D1D + // b, bt, g
+                      2*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x, y
+                      CPA_VDIM*sizeof(Real_type) * CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE ); // d
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y
+  setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (
     4 * CPA_D1D * CPA_Q1D * CPA_D1D * CPA_D1D + //2
@@ -64,6 +65,9 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
 }
 
 CONVECTION3DPA::~CONVECTION3DPA()
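For the RAJA_SYCL launch above, shared memory is sized by hand: six scratch arrays of max_DQ^3 doubles. A standalone sketch of the arithmetic; CPA_D1D and CPA_Q1D are set elsewhere in the suite, so the values here are assumptions for illustration:

#include <cstdio>

int main()
{
  const int max_D1D = 4, max_Q1D = 5;  // illustrative only, not the suite's values
  const int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
  const int no_mats = 6;               // six aliased scratch matrices
  size_t shmem = (size_t)max_DQ*max_DQ*max_DQ * no_mats * sizeof(double);
  std::printf("shmem = %zu bytes\n", shmem);  // 125 * 6 * 8 = 6000
  return 0;
}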
diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp
index 784b2d4cd..38629b28c 100644
--- a/src/apps/CONVECTION3DPA.hpp
+++ b/src/apps/CONVECTION3DPA.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -378,17 +378,22 @@ class CONVECTION3DPA : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = CPA_Q1D * CPA_Q1D * CPA_Q1D;
-  using gpu_block_sizes_type = gpu_block_size::list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::list_type<default_gpu_block_size>;
 
   Real_ptr m_B;
   Real_ptr m_Bt;
diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
index 64094c4ab..3c7edcd40 100644
--- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -52,6 +52,7 @@ template < size_t block_size >
 void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
   const Index_type iend = m_domain->n_real_zones;
 
   auto res{getCudaResource()};
@@ -64,17 +65,19 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-
       constexpr size_t shmem = 0;
 
-      deldotvec2d<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(div,
-                  x1, x2, x3, x4,
-                  y1, y2, y3, y4,
-                  fx1, fx2, fx3, fx4,
-                  fy1, fy2, fy3, fy4,
-                  real_zones,
-                  half, ptiny,
-                  iend);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (deldotvec2d<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          div,
+                          x1, x2, x3, x4,
+                          y1, y2, y3, y4,
+                          fx1, fx2, fx3, fx4,
+                          fy1, fy2, fy3, fy4,
+                          real_zones,
+                          half, ptiny,
+                          iend );
 
     }
     stopTimer();
@@ -84,17 +87,20 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      auto deldotvec2d_lambda = [=] __device__ (Index_type ii) {
+        DEL_DOT_VEC_2D_BODY_INDEX;
+        DEL_DOT_VEC_2D_BODY;
+      };
 
+      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      lambda_cuda_forall<<<grid_size, block_size, shmem, res.get_stream()>>>(
-        0, iend,
-        [=] __device__ (Index_type ii) {
-          DEL_DOT_VEC_2D_BODY_INDEX;
-          DEL_DOT_VEC_2D_BODY;
-        });
-      cudaErrchk( cudaGetLastError() );
+      RPlaunchCudaKernel( (lambda_cuda_forall<block_size, decltype(deldotvec2d_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend,
+                          deldotvec2d_lambda );
 
     }
     stopTimer();
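The lambda launch above spells out the closure type (decltype(deldotvec2d_lambda)) because a __global__ function template cannot deduce it through a launch wrapper. A hypothetical sketch of such a forall kernel (not RAJAPerf's actual helper):

#include <cuda_runtime.h>

template < size_t block_size, typename Lambda >
__launch_bounds__(block_size)
__global__ void forall_kernel(int begin, int end, Lambda body)
{
  int i = begin + static_cast<int>(blockIdx.x * block_size + threadIdx.x);
  if (i < end) { body(i); }
}

// usage (host code compiled with nvcc extended lambdas, --extended-lambda):
//   auto body = [=] __device__ (int i) { y[i] += a * x[i]; };
//   forall_kernel<256, decltype(body)><<<grid, 256>>>(0, n, body);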
diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp
index 590ea31b2..79cef6b09 100644
--- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -52,6 +52,7 @@ template < size_t block_size >
 void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
   const Index_type iend = m_domain->n_real_zones;
 
   auto res{getHipResource()};
@@ -64,17 +65,19 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-
       constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((deldotvec2d<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), div,
-                         x1, x2, x3, x4,
-                         y1, y2, y3, y4,
-                         fx1, fx2, fx3, fx4,
-                         fy1, fy2, fy3, fy4,
-                         real_zones,
-                         half, ptiny,
-                         iend);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (deldotvec2d<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         div,
+                         x1, x2, x3, x4,
+                         y1, y2, y3, y4,
+                         fx1, fx2, fx3, fx4,
+                         fy1, fy2, fy3, fy4,
+                         real_zones,
+                         half, ptiny,
+                         iend );
 
     }
     stopTimer();
@@ -85,18 +88,19 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       auto deldotvec2d_lambda = [=] __device__ (Index_type ii) {
-        DEL_DOT_VEC_2D_BODY_INDEX;
         DEL_DOT_VEC_2D_BODY;
       };
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-
       constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((lambda_hip_forall<block_size, decltype(deldotvec2d_lambda)>),
-                         grid_size, block_size, shmem, res.get_stream(),
-                         0, iend, deldotvec2d_lambda);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (lambda_hip_forall<block_size, decltype(deldotvec2d_lambda)>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend,
+                         deldotvec2d_lambda );
 
     }
     stopTimer();
diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp
index 1fc9b5775..730b49887 100644
--- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp
index 8dfa12e6c..b3527802a 100644
--- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp
index ffb533e3a..76b04a96f 100644
--- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
// diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp new file mode 100644 index 000000000..13f59e29d --- /dev/null +++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp @@ -0,0 +1,87 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DEL_DOT_VEC_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "AppsData.hpp" + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template +void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DEL_DOT_VEC_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type ii = item.get_global_id(0); + if (ii < iend) { + DEL_DOT_VEC_2D_BODY_INDEX + DEL_DOT_VEC_2D_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + zones, [=] (Index_type i) { + DEL_DOT_VEC_2D_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n DEL_DOT_VEC_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DEL_DOT_VEC_2D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index ffe5edeb2..8c72474bc 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
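Note: the Base_SYCL variant in the new file above rounds the global range up to a multiple of the work-group size and guards against the padded tail with `if (ii < iend)`. A self-contained sketch of that launch pattern (sizes illustrative):

```cpp
#include <sycl/sycl.hpp>

int main()
{
  constexpr size_t wg_size = 256;
  const size_t n = 1000;                             // loop extent ("iend")
  const size_t global = wg_size * ((n + wg_size - 1) / wg_size);

  sycl::queue q;
  double* a = sycl::malloc_shared<double>(n, q);

  q.parallel_for(sycl::nd_range<1>(global, wg_size),
                 [=](sycl::nd_item<1> item) {
    const size_t i = item.get_global_id(0);
    if (i < n) {            // padded work-items past n do nothing
      a[i] = 2.0 * i;
    }
  }).wait();

  sycl::free(a, q);
  return 0;
}
```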
// @@ -28,7 +28,7 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setDefaultProblemSize(1000*1000); // See rzmax in ADomain struct setDefaultReps(100); - Index_type rzmax = std::sqrt(getTargetProblemSize())+1; + Index_type rzmax = std::sqrt(getTargetProblemSize()) + 1 + std::sqrt(2)-1; m_domain = new ADomain(rzmax, /* ndims = */ 2); m_array_length = m_domain->nnalls; @@ -37,9 +37,10 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + - (1*sizeof(Real_type) + 0*sizeof(Real_type) ) * getItsPerRep() + - (0*sizeof(Real_type) + 4*sizeof(Real_type) ) * m_domain->n_real_nodes ) ; // touched data size, not actual number of stores and loads + setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + + 4*sizeof(Real_type) * m_domain->n_real_nodes ); // 4 variables with 2d nodal stencil pattern: 4 touches per iterate + setBytesWrittenPerRep( 1*sizeof(Index_type) * getItsPerRep() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(54 * m_domain->n_real_zones); setUsesFeature(Forall); @@ -62,6 +63,9 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index d82efc12f..d7c0d20f6 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -118,17 +118,22 @@ class DEL_DOT_VEC_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 863f83854..90a55905b 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
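Note: the DEL_DOT_VEC_2D.cpp hunk above splits the old combined `setBytesPerRep` into separate read/write/atomic counts. A worked example of the new accounting with illustrative sizes (actual counts depend on the domain and on the widths of `Index_type` and `Real_type`):

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
  // Illustrative mesh: ~1000^2 zones, nodal arrays one larger per dimension.
  const std::size_t n_zones = 1000 * 1000;   // stands in for getItsPerRep()
  const std::size_t n_nodes = 1001 * 1001;   // stands in for n_real_nodes

  // Reads: one zone index per iterate, plus 4 nodal variables each touched
  // once per node (the "2d nodal stencil" comment in the hunk above).
  const std::size_t bytes_read = 1 * sizeof(long)   * n_zones
                               + 4 * sizeof(double) * n_nodes;

  // Writes: one result value (div) per zone.
  const std::size_t bytes_written = 1 * sizeof(double) * n_zones;

  std::printf("read %zu B, wrote %zu B per rep\n", bytes_read, bytes_written);
  return 0;
}
```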
// @@ -117,16 +117,16 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); constexpr size_t shmem = 0; - Diffusion3DPA<<>>( - Basis, dBasis, D, X, Y, symmetric); - cudaErrchk(cudaGetLastError()); + RPlaunchCudaKernel( (Diffusion3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + Basis, dBasis, D, X, Y, symmetric ); } stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 7f0dd77b1..15e27ed78 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -117,18 +117,16 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); constexpr size_t shmem = 0; - hipLaunchKernelGGL((Diffusion3DPA), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - Basis, dBasis, D, X, Y, symmetric); - hipErrchk(hipGetLastError()); + RPlaunchHipKernel( (Diffusion3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + Basis, dBasis, D, X, Y, symmetric ); } stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index a1dcdbe04..04f27ec63 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 03a1811a3..be5bf5ecf 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 9e2818de1..c384b0695 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
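Note: the Base_CUDA/Base_HIP hunks above pass the scalar element count `NE` as the grid and a three-dimensional `dim3` as the block. That works because `dim3` converts implicitly from an integer, so one wrapper signature covers both forms; a minimal check (DPA_Q1D = 5 is an assumed value for illustration):

```cpp
#include <cuda_runtime.h>
#include <cassert>

void describe_launch(dim3 grid, dim3 block)
{
  assert(grid.x == 4096 && grid.y == 1 && grid.z == 1);  // NE x 1 x 1
  assert(block.x == 5 && block.y == 5 && block.z == 5);
}

int main()
{
  const int NE = 4096;                      // one block per element
  dim3 nthreads_per_block(5, 5, 5);         // e.g. DPA_Q1D = 5 (assumed)
  describe_launch(NE, nthreads_per_block);  // int converts to dim3(NE, 1, 1)
  return 0;
}
```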
// diff --git a/src/apps/DIFFUSION3DPA-Sycl.cpp b/src/apps/DIFFUSION3DPA-Sycl.cpp new file mode 100644 index 000000000..fccc14260 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Sycl.cpp @@ -0,0 +1,440 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t work_group_size > +void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_SYCL: { + + const ::sycl::range<3> workGroupSize(DPA_Q1D, DPA_Q1D, DPA_Q1D); + const ::sycl::range<3> gridSize(DPA_Q1D,DPA_Q1D,DPA_Q1D*NE); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&](cl::sycl::handler& h) { + + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + auto sBG_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1*MD1), h); + + auto sm0_0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm0_1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm0_2_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_2_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (cl::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(2); + + double *sBG = sBG_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double *sm0_0 = sm0_0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm0_1 = sm0_1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm0_2 = sm0_2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_0 = sm1_0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_1 = sm1_1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_2 = sm1_2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double (*B)[MD1] = (double (*)[MD1]) sBG; + double (*G)[MD1] = (double (*)[MD1]) sBG; + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; + + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_0); + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_1); + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_0); + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_1); + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_2); + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_0); + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_1); + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_2); + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_0); + 
double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_1); + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_2); + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_0); + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_1); + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_1; + } + } + } + + if (itm.get_local_id(0) == 0) + { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + if (itm.get_local_id(0) == 0) + { + SYCL_FOREACH_THREAD(d, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(q, 2, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_9; + } + } + } + + }); + }); + + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = + RAJA::LaunchPolicy>; + + using outer_x = + RAJA::LoopPolicy; + + using inner_x = + RAJA::LoopPolicy; + + using inner_y = + RAJA::LoopPolicy; + + using inner_z = + RAJA::LoopPolicy; + + size_t shmem = 0; + { + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + const size_t local_mats = 6; + shmem += MQ1*MD1*sizeof(double) + local_mats*MDQ*MDQ*MDQ*sizeof(double); + } + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + const bool symmetric = true; + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + //Redefine inside the lambda to keep consistent with base version + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; + + double *sBG = ctx.getSharedMemory(MQ1*MD1); + double *sm0_0 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm0_1 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm0_2 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_0 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_1 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_2 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + + double (*B)[MD1] = (double (*)[MD1]) sBG; + double (*G)[MD1] = (double (*)[MD1]) sBG; + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; + + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_0); + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_1); + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_0); + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_1); + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_2); + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_0); + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_1); + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_2); + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_0); + double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_1); + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_2); + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_0); + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_1); + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_1; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); //RAJA::loop + } // lambda 
(dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n DIFFUSION3DPA : Unknown Sycl variant id = " << vid + << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DIFFUSION3DPA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index e0cd0f6d0..16cf307b5 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
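Note on the Base_SYCL variant of DIFFUSION3DPA above: `sycl::local_accessor` provides the work-group-local scratch storage that plays the role of CUDA `__shared__` arrays, and `get_multi_ptr().get()` yields a raw pointer the kernel reinterprets as fixed-shape arrays. A self-contained sketch of that pattern (sizes and names illustrative, not the kernel above):

```cpp
#include <sycl/sycl.hpp>

int main()
{
  constexpr int D = 5;
  sycl::queue q;
  double* out = sycl::malloc_shared<double>(D * D, q);

  q.submit([&](sycl::handler& h) {
    sycl::local_accessor<double, 1> tile_mem(sycl::range<1>(D * D), h);

    h.parallel_for(sycl::nd_range<2>(sycl::range<2>(D, D),
                                     sycl::range<2>(D, D)),
                   [=](sycl::nd_item<2> itm) {
      // Raw pointer into work-group-local memory, then a 2D view of it.
      double* raw =
        tile_mem.get_multi_ptr<sycl::access::decorated::no>().get();
      double (*tile)[D] = (double (*)[D]) raw;

      const int r = itm.get_local_id(0);
      const int c = itm.get_local_id(1);
      tile[r][c] = r * D + c;

      itm.barrier(sycl::access::fence_space::local_space);

      out[r * D + c] = tile[c][r];   // safe transposed read after barrier
    });
  }).wait();

  sycl::free(out, q);
  return 0;
}
```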
// @@ -28,17 +28,18 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (DPA_Q1D*DPA_Q1D*DPA_Q1D)/2) / (DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); setActualProblemSize( m_NE*DPA_Q1D*DPA_Q1D*DPA_Q1D ); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( 2*DPA_Q1D*DPA_D1D*sizeof(Real_type) + - DPA_Q1D*DPA_Q1D*DPA_Q1D*SYM*m_NE*sizeof(Real_type) + - DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) + - DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * DPA_Q1D*DPA_D1D + // b, g + 2*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x, y + SYM*sizeof(Real_type) * DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE ); // d + setBytesWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D + 5 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_D1D + @@ -65,6 +66,9 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } DIFFUSION3DPA::~DIFFUSION3DPA() diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 62967d5c0..5b587279c 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -481,17 +481,22 @@ class DIFFUSION3DPA : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/EDGE3D-Cuda.cpp b/src/apps/EDGE3D-Cuda.cpp index 9136dc961..5f212fb9b 100644 --- a/src/apps/EDGE3D-Cuda.cpp +++ b/src/apps/EDGE3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
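Note: the `m_NE` hunk above changes truncating division into round-to-nearest by adding half the divisor before dividing. With DPA_Q1D = 5 (an assumed value for illustration) the divisor is 5*5*5 = 125:

```cpp
#include <cassert>

// Round-to-nearest integer division: add half the divisor, then truncate.
constexpr long round_nearest_div(long x, long d) { return (x + d / 2) / d; }

int main()
{
  const long q3 = 5 * 5 * 5;                 // DPA_Q1D^3 with DPA_Q1D = 5
  assert(190 / q3 == 1);                     // old code: floor, 1.52 -> 1
  assert(round_nearest_div(190, q3) == 2);   // new code: 1.52 -> 2
  assert(round_nearest_div(180, q3) == 1);   // 1.44 still rounds down
  return 0;
}
```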
// @@ -66,12 +66,14 @@ void EDGE3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - edge3d<<>>(sum, - x0, x1, x2, x3, x4, x5, x6, x7, - y0, y1, y2, y3, y4, y5, y6, y7, - z0, z1, z2, z3, z4, z5, z6, z7, - ibegin, iend); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (edge3d), + grid_size, block_size, + shmem, res.get_stream(), + sum, + x0, x1, x2, x3, x4, x5, x6, x7, + y0, y1, y2, y3, y4, y5, y6, y7, + z0, z1, z2, z3, z4, z5, z6, z7, + ibegin, iend ); } stopTimer(); @@ -81,14 +83,17 @@ void EDGE3D::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto edge3d_lambda = [=] __device__ (Index_type i) { EDGE3D_BODY; }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - auto edge3d_lam = [=] __device__ (Index_type i) { EDGE3D_BODY; }; - - lambda_cuda_forall<<>>( - ibegin, iend, edge3d_lam); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, + edge3d_lambda ); } stopTimer(); diff --git a/src/apps/EDGE3D-Hip.cpp b/src/apps/EDGE3D-Hip.cpp index 5da3606c4..56ff054d8 100644 --- a/src/apps/EDGE3D-Hip.cpp +++ b/src/apps/EDGE3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -65,13 +65,15 @@ void EDGE3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - - hipLaunchKernelGGL((edge3d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), sum, - x0, x1, x2, x3, x4, x5, x6, x7, - y0, y1, y2, y3, y4, y5, y6, y7, - z0, z1, z2, z3, z4, z5, z6, z7, - ibegin, iend); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (edge3d), + grid_size, block_size, + shmem, res.get_stream(), + sum, + x0, x1, x2, x3, x4, x5, x6, x7, + y0, y1, y2, y3, y4, y5, y6, y7, + z0, z1, z2, z3, z4, z5, z6, z7, + ibegin, iend ); } stopTimer(); @@ -81,16 +83,17 @@ void EDGE3D::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto edge3d_lambda = [=] __device__ (Index_type i) { EDGE3D_BODY; }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - auto edge3d_lam = [=] __device__ (Index_type i) { EDGE3D_BODY; }; - - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, edge3d_lam); - - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, + edge3d_lambda ); } stopTimer(); diff --git a/src/apps/EDGE3D-OMP.cpp b/src/apps/EDGE3D-OMP.cpp index bb79de639..1671872b4 100644 --- a/src/apps/EDGE3D-OMP.cpp +++ b/src/apps/EDGE3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/EDGE3D-OMPTarget.cpp b/src/apps/EDGE3D-OMPTarget.cpp index bf86d856c..2a348ec28 100644 --- a/src/apps/EDGE3D-OMPTarget.cpp +++ b/src/apps/EDGE3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -37,11 +37,6 @@ void EDGE3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu EDGE3D_DATA_SETUP; - auto edge3d_lam = - [=](Index_type i) { - EDGE3D_BODY; - }; - if ( vid == Base_OpenMPTarget ) { startTimer(); @@ -61,8 +56,6 @@ void EDGE3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } else if ( vid == RAJA_OpenMPTarget ) { - EDGE3D_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/apps/EDGE3D-Seq.cpp b/src/apps/EDGE3D-Seq.cpp index 6658650b1..5f7114127 100644 --- a/src/apps/EDGE3D-Seq.cpp +++ b/src/apps/EDGE3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,9 +28,11 @@ void EDGE3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) EDGE3D_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto edge3d_lam = [=](Index_type i) { EDGE3D_BODY; }; +#endif switch ( vid ) { @@ -70,7 +72,7 @@ void EDGE3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( + RAJA::forall( RAJA::RangeSegment(ibegin, iend), edge3d_lam); } diff --git a/src/apps/EDGE3D-Sycl.cpp b/src/apps/EDGE3D-Sycl.cpp new file mode 100644 index 000000000..6b60bbc3c --- /dev/null +++ b/src/apps/EDGE3D-Sycl.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
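Note: guarding `edge3d_lam` with `#if defined(RUN_RAJA_SEQ)` in EDGE3D-Seq.cpp presumably silences set-but-unused warnings when the Lambda_Seq/RAJA_Seq variants are compiled out and only Base_Seq remains. A minimal illustration of the pattern (names hypothetical):

```cpp
#include <cstdio>

void run(int vid)
{
#if defined(RUN_RAJA_SEQ)
  auto body = [=](int i) { std::printf("%d\n", i); };
#endif

  switch ( vid ) {
    case 0: {                          // Base variant: lambda not used
      for (int i = 0; i < 4; ++i) { std::printf("%d\n", i); }
      break;
    }
#if defined(RUN_RAJA_SEQ)
    case 1: {                          // Lambda variant, guarded with it
      for (int i = 0; i < 4; ++i) { body(i); }
      break;
    }
#endif
    default: break;
  }
}

int main() { run(0); return 0; }
```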
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EDGE3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +template < size_t work_group_size > +void EDGE3D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + EDGE3D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0) + ibegin; + if (i < iend) { + EDGE3D_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + EDGE3D_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n EDGE3D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(EDGE3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index 3b93d281b..d917bb321 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
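Note: unlike DEL_DOT_VEC_2D, the EDGE3D Base_SYCL kernel above iterates a subrange `[fpz, lpz+1)` and folds the offset into the global id with `i = item.get_global_id(0) + ibegin`. A sketch of that offset-range pattern (bounds are stand-ins):

```cpp
#include <sycl/sycl.hpp>

int main()
{
  constexpr size_t wg = 256;
  const size_t ibegin = 37, iend = 5000;   // stand-ins for fpz and lpz+1

  // Round iend up to whole work-groups, as the hunk above does; the
  // in-kernel guard discards the padded tail.
  const size_t global = wg * ((iend + wg - 1) / wg);

  sycl::queue q;
  double* sum = sycl::malloc_shared<double>(iend, q);
  for (size_t i = 0; i < iend; ++i) { sum[i] = 0.0; }

  q.parallel_for(sycl::nd_range<1>(global, wg), [=](sycl::nd_item<1> item) {
    const size_t i = item.get_global_id(0) + ibegin;  // fold in the offset
    if (i < iend) { sum[i] += 1.0; }
  }).wait();

  sycl::free(sum, q);
  return 0;
}
```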
// @@ -27,23 +27,22 @@ EDGE3D::EDGE3D(const RunParams& params) { setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(10); - Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_array_length = m_domain->nnalls; size_t number_of_elements = m_domain->lpz+1 - m_domain->fpz; - setActualProblemSize( number_of_elements ); + setActualProblemSize( m_domain->n_real_zones ); setItsPerRep( number_of_elements ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads // see VOL3D.cpp - size_t reads_per_node = 3*sizeof(Real_type); - size_t writes_per_zone = 1*sizeof(Real_type); - setBytesPerRep( writes_per_zone * getItsPerRep() + - reads_per_node * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); + setBytesAtomicModifyWrittenPerRep( 0 ); constexpr size_t flops_k_loop = 15 + 6*flops_Jxx() @@ -83,6 +82,9 @@ EDGE3D::EDGE3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } EDGE3D::~EDGE3D() diff --git a/src/apps/EDGE3D.hpp b/src/apps/EDGE3D.hpp index 82e07c6a5..a5ae54cfa 100644 --- a/src/apps/EDGE3D.hpp +++ b/src/apps/EDGE3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -417,17 +417,22 @@ class EDGE3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index a62974a15..c33321ea7 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -123,51 +123,63 @@ void ENERGY::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - energycalc1<<>>( e_new, e_old, delvc, - p_old, q_old, work, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc2<<>>( delvc, q_new, - compHalfStep, pHalfStep, - e_new, bvc, pbvc, - ql_old, qq_old, - rho0, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc3<<>>( e_new, delvc, - p_old, q_old, - pHalfStep, q_new, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc4<<>>( e_new, work, - e_cut, emin, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc5<<>>( delvc, - pbvc, e_new, vnewc, - bvc, p_new, - ql_old, qq_old, - p_old, q_old, - pHalfStep, q_new, - rho0, e_cut, emin, - iend ); - cudaErrchk( cudaGetLastError() ); - - energycalc6<<>>( delvc, - pbvc, e_new, vnewc, - bvc, p_new, - q_new, - ql_old, qq_old, - rho0, q_cut, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (energycalc1), + grid_size, block_size, + shmem, res.get_stream(), + e_new, e_old, delvc, + p_old, q_old, work, + iend ); + + RPlaunchCudaKernel( (energycalc2), + grid_size, block_size, + shmem, res.get_stream(), + delvc, q_new, + compHalfStep, pHalfStep, + e_new, bvc, pbvc, + ql_old, qq_old, + rho0, + iend ); + + RPlaunchCudaKernel( (energycalc3), + grid_size, block_size, + shmem, res.get_stream(), + e_new, delvc, + p_old, q_old, + pHalfStep, q_new, + iend ); + + RPlaunchCudaKernel( (energycalc4), + grid_size, block_size, + shmem, res.get_stream(), + e_new, work, + e_cut, emin, + iend ); + + RPlaunchCudaKernel( (energycalc5), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + ql_old, qq_old, + p_old, q_old, + pHalfStep, q_new, + rho0, e_cut, emin, + iend ); + + RPlaunchCudaKernel( (energycalc6), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + q_new, + ql_old, qq_old, + rho0, q_cut, + iend ); } stopTimer(); diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index c7064e591..e0424d55a 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -123,51 +123,63 @@ void ENERGY::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - hipLaunchKernelGGL((energycalc1), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, e_old, delvc, - p_old, q_old, work, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc2), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, q_new, - compHalfStep, pHalfStep, - e_new, bvc, pbvc, - ql_old, qq_old, - rho0, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc3), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, delvc, - p_old, q_old, - pHalfStep, q_new, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc4), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, work, - e_cut, emin, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc5), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, - pbvc, e_new, vnewc, - bvc, p_new, - ql_old, qq_old, - p_old, q_old, - pHalfStep, q_new, - rho0, e_cut, emin, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc6), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, - pbvc, e_new, vnewc, - bvc, p_new, - q_new, - ql_old, qq_old, - rho0, q_cut, - iend ); - hipErrchk( hipGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (energycalc1), + grid_size, block_size, + shmem, res.get_stream(), + e_new, e_old, delvc, + p_old, q_old, work, + iend ); + + RPlaunchHipKernel( (energycalc2), + grid_size, block_size, + shmem, res.get_stream(), + delvc, q_new, + compHalfStep, pHalfStep, + e_new, bvc, pbvc, + ql_old, qq_old, + rho0, + iend ); + + RPlaunchHipKernel( (energycalc3), + grid_size, block_size, + shmem, res.get_stream(), + e_new, delvc, + p_old, q_old, + pHalfStep, q_new, + iend ); + + RPlaunchHipKernel( (energycalc4), + grid_size, block_size, + shmem, res.get_stream(), + e_new, work, + e_cut, emin, + iend ); + + RPlaunchHipKernel( (energycalc5), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + ql_old, qq_old, + p_old, q_old, + pHalfStep, q_new, + rho0, e_cut, emin, + iend ); + + RPlaunchHipKernel( (energycalc6), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + q_new, + ql_old, qq_old, + rho0, q_cut, + iend ); } stopTimer(); diff --git a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index 235386ff0..687f69d25 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index 83ce48357..786623a8f 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index c7e3ffdf2..bbf9d73c0 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp new file mode 100644 index 000000000..7ebc7f3c7 --- /dev/null +++ b/src/apps/ENERGY-Sycl.cpp @@ -0,0 +1,174 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ENERGY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template +void ENERGY::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + ENERGY_DATA_SETUP; + + using sycl::sqrt; + using sycl::fabs; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY1 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY2 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY3 + } + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY4 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY5 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) 
{ + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY6 + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + const bool async = true; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::region( [=]() { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY1; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY2; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY3; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY4; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY5; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY6; + }); + + }); // end sequential region (for single-source code) + + } + stopTimer(); + + } else { + std::cout << "\n ENERGY : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ENERGY, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index fd1988300..7f480d00f 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
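Note: the RAJA_SYCL path above wraps six dependent `forall` calls in `RAJA::region<RAJA::seq_region>` so the single-source structure matches the other backends, while `sycl_exec<work_group_size, true /*async*/>` avoids a host wait between kernels. The kernels still execute in order provided the underlying queue is in-order, which is assumed here for the Suite's resource; a minimal illustration of why that suffices:

```cpp
#include <sycl/sycl.hpp>
#include <cassert>

int main()
{
  sycl::queue q{sycl::property::queue::in_order()};
  const size_t n = 1024;
  double* e_new = sycl::malloc_shared<double>(n, q);

  // Two dependent kernels, no host wait in between: an in-order queue
  // runs them in submission order, like the six ENERGY kernels per rep.
  q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) { e_new[i] = 1.0; });
  q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) { e_new[i] += 2.0; });

  q.wait();  // one synchronization at the end, as after the rep loop
  assert(e_new[0] == 3.0);

  sycl::free(e_new, q);
  return 0;
}
```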
// @@ -29,13 +29,22 @@ ENERGY::ENERGY(const RunParams& params) setItsPerRep( 6 * getActualProblemSize() ); setKernelsPerRep(6); // some branches are never taken due to the nature of the initialization of delvc - // the additional reads and writes that would be done if those branches were taken are noted in the comments - setBytesPerRep( (1*sizeof(Real_type) + 5*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + /* 1 + 8 */ - (1*sizeof(Real_type) + 6*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 7*sizeof(Real_type)) * getActualProblemSize() + /* 1 + 12 */ - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); /* 1 + 8 */ + // the additional reads that would be done if those branches were taken are noted in the comments + setBytesReadPerRep((5*sizeof(Real_type) + + 1*sizeof(Real_type) + // 8 + 6*sizeof(Real_type) + + 2*sizeof(Real_type) + + 7*sizeof(Real_type) + // 12 + 1*sizeof(Real_type) // 8 + ) * getActualProblemSize() ); + setBytesWrittenPerRep((1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 0*sizeof(Real_type) + ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((6 + 11 + // 1 sqrt 8 + @@ -62,6 +71,9 @@ ENERGY::ENERGY(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } ENERGY::~ENERGY() diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 22af34867..ba5b69949 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -203,17 +203,22 @@ class ENERGY : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_e_new; Real_ptr m_e_old; diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index f88e7b55d..fd4386324 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -24,6 +24,11 @@ for (int i = threadIdx.k; i < N; i += blockDim.k) #endif +#if defined(RAJA_ENABLE_SYCL) +#define SYCL_FOREACH_THREAD(i, k, N) \ + for (int i = itm.get_local_id(k); i < N; i += itm.get_local_range(k)) +#endif + #define CPU_FOREACH(i, k, N) for (int i = 0; i < N; i++) #endif // closing endif for header file include guard diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 01266c7d6..9605d85b1 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -83,7 +83,7 @@ void FIR::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -98,20 +98,24 @@ void FIR::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; #if defined(USE_CUDA_CONSTANT_MEMORY) - fir<<>>( out, in, - coefflen, - iend ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (fir), + grid_size, block_size, + shmem, res.get_stream(), + out, in, + coefflen, + iend ); #else - fir<<>>( out, in, - coeff, - coefflen, - iend ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (fir), + grid_size, block_size, + shmem, res.get_stream(), + out, in, + coeff, + coefflen, + iend ); #endif } diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 47dc40efb..a3272cb23 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
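Note: the new `SYCL_FOREACH_THREAD` macro in FEM_MACROS.hpp strides a loop index over `[0, N)` along work-group dimension `k`, the SYCL analog of the CUDA/HIP thread-loop macros beside it. It expects an `nd_item` named `itm` in scope at the expansion site. A runnable usage sketch:

```cpp
#include <sycl/sycl.hpp>

// Same definition as the hunk above; requires an nd_item named `itm`.
#define SYCL_FOREACH_THREAD(i, k, N) \
  for (int i = itm.get_local_id(k); i < N; i += itm.get_local_range(k))

int main()
{
  constexpr int N = 7;                      // extent larger than local range
  sycl::queue q;
  int* hits = sycl::malloc_shared<int>(N * N, q);
  for (int i = 0; i < N * N; ++i) { hits[i] = 0; }

  q.parallel_for(sycl::nd_range<2>(sycl::range<2>(4, 4),
                                   sycl::range<2>(4, 4)),
                 [=](sycl::nd_item<2> itm) {
    SYCL_FOREACH_THREAD(y, 0, N) {
      SYCL_FOREACH_THREAD(x, 1, N) {
        hits[y * N + x] = 1;                // stride-4 loops cover all of [0,7)
      }
    }
  }).wait();

  sycl::free(hits, q);
  return 0;
}
```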
// @@ -81,7 +81,7 @@ void FIR::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -96,20 +96,24 @@ void FIR::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; #if defined(USE_HIP_CONSTANT_MEMORY) - hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out, in, - coefflen, - iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (fir), + grid_size, block_size, + shmem, res.get_stream(), + out, in, + coefflen, + iend ); #else - hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out, in, - coeff, - coefflen, - iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (fir), + grid_size, block_size, + shmem, res.get_stream(), + out, in, + coeff, + coefflen, + iend ); #endif } diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 5b3cc2a35..5fcad1616 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,7 +25,7 @@ void FIR::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); FIR_COEFF; diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 5715f884a..3ba913846 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -43,7 +43,7 @@ void FIR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); FIR_DATA_SETUP; diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index b13d30818..001ffd194 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -23,7 +23,7 @@ void FIR::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); FIR_COEFF; diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp new file mode 100644 index 000000000..eee240a5f --- /dev/null +++ b/src/apps/FIR-Sycl.cpp @@ -0,0 +1,109 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIR.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +#define FIR_DATA_SETUP_SYCL \ + Real_ptr coeff; \ +\ + allocAndInitSyclDeviceData(in, m_in, getActualProblemSize(), qu); \ + allocAndInitSyclDeviceData(out, m_out, getActualProblemSize(), qu); \ + Real_ptr tcoeff = &coeff_array[0]; \ + allocAndInitSyclDeviceData(coeff, tcoeff, FIR_COEFFLEN, qu); + +#define FIR_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_out, out, getActualProblemSize(), qu); \ + deallocSyclDeviceData(in, qu); \ + deallocSyclDeviceData(out, qu); \ + deallocSyclDeviceData(coeff, qu); + + +template +void FIR::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + FIR_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + FIR_COEFF; + + FIR_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + FIR_BODY + } + + }); + }); + + } + stopTimer(); + + FIR_DATA_TEARDOWN_SYCL; + + } else if ( vid == RAJA_SYCL ) { + + FIR_COEFF; + + FIR_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + FIR_BODY; + }); + + } + stopTimer(); + + FIR_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n FIR : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIR, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 7b51aaebc..f4a2de7e8 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
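Note: FIR-Sycl.cpp above defines its own `FIR_DATA_SETUP_SYCL` / `FIR_DATA_TEARDOWN_SYCL` macros on top of the Suite's SYCL data helpers. In plain SYCL USM terms that pattern amounts to the following sketch; the helpers' actual signatures live in common/SyclDataUtils.hpp and are not shown in this diff:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main()
{
  sycl::queue q;
  const size_t len = 1024;
  std::vector<double> host(len, 1.0);

  // allocAndInitSyclDeviceData: device allocation + host-to-device copy.
  double* in = sycl::malloc_device<double>(len, q);
  q.memcpy(in, host.data(), len * sizeof(double)).wait();

  // ... kernels operate on `in` ...

  // getSyclDeviceData then deallocSyclDeviceData: copy back + free.
  q.memcpy(host.data(), in, len * sizeof(double)).wait();
  sycl::free(in, q);
  return 0;
}
```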
diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp
index 7b51aaebc..f4a2de7e8 100644
--- a/src/apps/FIR.cpp
+++ b/src/apps/FIR.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,11 +28,13 @@ FIR::FIR(const RunParams& params)
 
   setActualProblemSize( getTargetProblemSize() );
 
-  setItsPerRep( getActualProblemSize() - m_coefflen );
+  setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() +
-                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() );
-  setFLOPsPerRep((2 * m_coefflen) * (getActualProblemSize() - m_coefflen));
+  setBytesReadPerRep( m_coefflen*sizeof(Real_type) +
+                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
+  setFLOPsPerRep((2 * m_coefflen) * getActualProblemSize());
 
   checksum_scale_factor = 0.0001 *
     ( static_cast<Checksum_type>(getDefaultProblemSize()) /
@@ -56,6 +58,9 @@ FIR::FIR(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 FIR::~FIR()
@@ -64,7 +69,7 @@ FIR::~FIR()
 
 void FIR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
-  allocAndInitData(m_in, getActualProblemSize(), vid);
+  allocAndInitData(m_in, getActualProblemSize() + m_coefflen-1, vid);
   allocAndInitDataConst(m_out, getActualProblemSize(), 0.0, vid);
 }
 
diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp
index 3ca8a1cef..72968045f 100644
--- a/src/apps/FIR.hpp
+++ b/src/apps/FIR.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -78,17 +78,22 @@ class FIR : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_in;
   Real_ptr m_out;
diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp
deleted file mode 100644
index 58534da21..000000000
--- a/src/apps/HALOEXCHANGE.cpp
+++ /dev/null
@@ -1,449 +0,0 @@
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
-// and RAJA Performance Suite project contributors.
-// See the RAJAPerf/LICENSE file for details.
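The FIR.cpp changes above are linked: with iend now equal to the full problem
size N, output element N-1 reads input element (N-1) + (coefflen-1), so
setUp() allocates the input with m_coefflen-1 extra elements and the
bytes-read count becomes coefflen coefficient values plus N + coefflen-1
input values. A plain reference loop showing the access pattern (a sketch,
not the suite's FIR_BODY macro):

    // All N outputs are computed; the highest input index touched is
    // (N-1) + (coefflen-1), hence the padded allocation above.
    void fir_ref(double* out, const double* in, const double* coeff,
                 int coefflen, int N)
    {
      for (int i = 0; i < N; ++i) {
        double sum = 0.0;
        for (int j = 0; j < coefflen; ++j) {
          sum += coeff[j] * in[i + j];
        }
        out[i] = sum;
      }
    }

Each output performs coefflen multiplies and coefflen adds, which matches the
(2 * m_coefflen) * getActualProblemSize() FLOP count above.
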
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "HALOEXCHANGE.hpp" - -#include "RAJA/RAJA.hpp" - -#include "common/DataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace apps -{ - -HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) - : KernelBase(rajaperf::Apps_HALOEXCHANGE, params) -{ - m_grid_dims_default[0] = 100; - m_grid_dims_default[1] = 100; - m_grid_dims_default[2] = 100; - m_halo_width_default = 1; - m_num_vars_default = 3; - - setDefaultProblemSize( m_grid_dims_default[0] * - m_grid_dims_default[1] * - m_grid_dims_default[2] ); - setDefaultReps(50); - - double cbrt_run_size = std::cbrt(getTargetProblemSize()); - - m_grid_dims[0] = cbrt_run_size; - m_grid_dims[1] = cbrt_run_size; - m_grid_dims[2] = cbrt_run_size; - m_halo_width = m_halo_width_default; - m_num_vars = m_num_vars_default; - - m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; - m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; - m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; - m_var_size = m_grid_plus_halo_dims[0] * - m_grid_plus_halo_dims[1] * - m_grid_plus_halo_dims[2] ; - - setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); - - setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); - setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); - setFLOPsPerRep(0); - - setUsesFeature(Forall); - - setVariantDefined( Base_Seq ); - setVariantDefined( Lambda_Seq ); - setVariantDefined( RAJA_Seq ); - - setVariantDefined( Base_OpenMP ); - setVariantDefined( Lambda_OpenMP ); - setVariantDefined( RAJA_OpenMP ); - - setVariantDefined( Base_OpenMPTarget ); - setVariantDefined( RAJA_OpenMPTarget ); - - setVariantDefined( Base_CUDA ); - setVariantDefined( RAJA_CUDA ); - - setVariantDefined( Base_HIP ); - setVariantDefined( RAJA_HIP ); -} - -HALOEXCHANGE::~HALOEXCHANGE() -{ -} - -void HALOEXCHANGE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ - m_vars.resize(m_num_vars, nullptr); - for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); - auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); - - Real_ptr var = m_vars[v]; - - for (Index_type i = 0; i < m_var_size; i++) { - var[i] = i + v; - } - } - - m_pack_index_lists.resize(s_num_neighbors, nullptr); - m_pack_index_list_lengths.resize(s_num_neighbors, 0); - create_pack_lists(m_pack_index_lists, m_pack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_unpack_index_lists.resize(s_num_neighbors, nullptr); - m_unpack_index_list_lengths.resize(s_num_neighbors, 0); - create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_buffers.resize(s_num_neighbors, nullptr); - for (Index_type l = 0; l < s_num_neighbors; ++l) { - Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); - } -} - -void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) -{ - for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); - } -} - -void HALOEXCHANGE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ 
- for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); - } - m_buffers.clear(); - - destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid); - m_unpack_index_list_lengths.clear(); - m_unpack_index_lists.clear(); - - destroy_pack_lists(m_pack_index_lists, s_num_neighbors, vid); - m_pack_index_list_lengths.clear(); - m_pack_index_lists.clear(); - - for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v], vid); - } - m_vars.clear(); -} - -namespace { - -struct Extent -{ - Index_type i_min; - Index_type i_max; - Index_type j_min; - Index_type j_max; - Index_type k_min; - Index_type k_max; -}; - -} - -// -// Function to generate index lists for packing. -// -void HALOEXCHANGE::create_pack_lists( - std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector pack_index_list_extents(num_neighbors); - - // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + 
halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = pack_index_list_extents[l]; - - pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); - - Int_ptr pack_list = pack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type pack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - pack_list[list_idx] = pack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy packing index lists. -// -void HALOEXCHANGE::destroy_pack_lists( - std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(pack_index_lists[l], vid); - } -} - -// -// Function to generate index lists for unpacking. 
-// -void HALOEXCHANGE::create_unpack_lists( - std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector unpack_index_list_extents(num_neighbors); - - // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - 
unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = unpack_index_list_extents[l]; - - unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - - Int_ptr unpack_list = unpack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type unpack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - unpack_list[list_idx] = unpack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy unpacking index lists. -// -void HALOEXCHANGE::destroy_unpack_lists( - std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(unpack_index_lists[l], vid); - } -} - -} // end namespace apps -} // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp deleted file mode 100644 index 1f21d9616..000000000 --- a/src/apps/HALOEXCHANGE.hpp +++ /dev/null @@ -1,151 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/LICENSE file for details. 
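All of the removed list builders above flatten an (ii, jj, kk) triple into a
1D offset using unit stride in i and strides padded by 2*halo_width in j and
k. Isolated, the computation is:

    // 3D -> 1D offset on a grid padded by halo_width on every side; this is
    // the pack_idx/unpack_idx expression from the removed loops (standalone,
    // illustrative types).
    inline long flatten_index(long ii, long jj, long kk,
                              const long* grid_dims, long halo_width)
    {
      const long j_stride = grid_dims[0] + 2*halo_width;
      const long k_stride = j_stride * (grid_dims[1] + 2*halo_width);
      return ii + jj * j_stride + kk * k_stride;
    }
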
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -/// -/// HALOEXCHANGE kernel reference implementation: -/// -/// // pack message for each neighbor -/// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; -/// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; -/// // pack part of each variable -/// for (Index_type v = 0; v < num_vars; ++v) { -/// Real_ptr var = vars[v]; -/// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_PACK_BODY; -/// } -/// buffer += len; -/// } -/// // send message to neighbor -/// } -/// -/// // unpack messages for each neighbor -/// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor -/// Real_ptr buffer = buffers[l]; -/// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; -/// // unpack part of each variable -/// for (Index_type v = 0; v < num_vars; ++v) { -/// Real_ptr var = vars[v]; -/// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_UNPACK_BODY; -/// } -/// buffer += len; -/// } -/// } -/// - -#ifndef RAJAPerf_Apps_HALOEXCHANGE_HPP -#define RAJAPerf_Apps_HALOEXCHANGE_HPP - -#define HALOEXCHANGE_DATA_SETUP \ - std::vector vars = m_vars; \ - std::vector buffers = m_buffers; \ -\ - Index_type num_neighbors = s_num_neighbors; \ - Index_type num_vars = m_num_vars; \ - std::vector pack_index_lists = m_pack_index_lists; \ - std::vector pack_index_list_lengths = m_pack_index_list_lengths; \ - std::vector unpack_index_lists = m_unpack_index_lists; \ - std::vector unpack_index_list_lengths = m_unpack_index_list_lengths; - -#define HALOEXCHANGE_PACK_BODY \ - buffer[i] = var[list[i]]; - -#define HALOEXCHANGE_UNPACK_BODY \ - var[list[i]] = buffer[i]; - - -#include "common/KernelBase.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -class RunParams; - -namespace apps -{ - -class HALOEXCHANGE : public KernelBase -{ -public: - - HALOEXCHANGE(const RunParams& params); - - ~HALOEXCHANGE(); - - void setUp(VariantID vid, size_t tune_idx); - void updateChecksum(VariantID vid, size_t tune_idx); - void tearDown(VariantID vid, size_t tune_idx); - - void runSeqVariant(VariantID vid, size_t tune_idx); - void runOpenMPVariant(VariantID vid, size_t tune_idx); - void runCudaVariant(VariantID vid, size_t tune_idx); - void runHipVariant(VariantID vid, size_t tune_idx); - void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - - void setCudaTuningDefinitions(VariantID vid); - void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); - -private: - static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; - - static const int s_num_neighbors = 26; - - Index_type m_grid_dims[3]; - Index_type m_halo_width; - Index_type m_num_vars; - - Index_type m_grid_dims_default[3]; - Index_type m_halo_width_default; - Index_type m_num_vars_default; - - Index_type m_grid_plus_halo_dims[3]; - Index_type m_var_size; - Index_type m_var_halo_size; - - std::vector m_vars; - std::vector m_buffers; - - std::vector m_pack_index_lists; - std::vector m_pack_index_list_lengths; - std::vector m_unpack_index_lists; - std::vector m_unpack_index_list_lengths; - - void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const 
Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_pack_lists(std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid); - void create_unpack_lists(std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_unpack_lists(std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid); -}; - -} // end namespace apps -} // end namespace rajaperf - -#endif // closing endif for header file include guard diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp deleted file mode 100644 index 6be241d43..000000000 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ /dev/null @@ -1,256 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "HALOEXCHANGE_FUSED.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace apps -{ - -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ - Real_ptr* pack_buffer_ptrs; \ - Int_ptr* pack_list_ptrs; \ - Real_ptr* pack_var_ptrs; \ - Index_type* pack_len_ptrs; \ - allocData(DataSpace::HipPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_len_ptrs, num_neighbors * num_vars); \ - Real_ptr* unpack_buffer_ptrs; \ - Int_ptr* unpack_list_ptrs; \ - Real_ptr* unpack_var_ptrs; \ - Index_type* unpack_len_ptrs; \ - allocData(DataSpace::HipPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_len_ptrs, num_neighbors * num_vars); - -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocData(DataSpace::HipPinned, pack_buffer_ptrs); \ - deallocData(DataSpace::HipPinned, pack_list_ptrs); \ - deallocData(DataSpace::HipPinned, pack_var_ptrs); \ - deallocData(DataSpace::HipPinned, pack_len_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_buffer_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_list_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_var_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_len_ptrs); - -template < size_t block_size > -__launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, - Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) -{ - Index_type j = blockIdx.y; - - Real_ptr buffer = pack_buffer_ptrs[j]; - Int_ptr list = pack_list_ptrs[j]; - Real_ptr var = pack_var_ptrs[j]; - Index_type len = pack_len_ptrs[j]; - - for (Index_type i = threadIdx.x + blockIdx.x * block_size; - i < len; - i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_PACK_BODY; - } -} - -template < size_t block_size > -__launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* 
unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, - Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) -{ - Index_type j = blockIdx.y; - - Real_ptr buffer = unpack_buffer_ptrs[j]; - Int_ptr list = unpack_list_ptrs[j]; - Real_ptr var = unpack_var_ptrs[j]; - Index_type len = unpack_len_ptrs[j]; - - for (Index_type i = threadIdx.x + blockIdx.x * block_size; - i < len; - i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - } -} - - -template < size_t block_size > -void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - - auto res{getHipResource()}; - - HALOEXCHANGE_FUSED_DATA_SETUP; - - if ( vid == Base_HIP ) { - - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - constexpr size_t shmem = 0; - - Index_type pack_index = 0; - Index_type pack_len_sum = 0; - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pack_buffer_ptrs[pack_index] = buffer; - pack_list_ptrs[pack_index] = list; - pack_var_ptrs[pack_index] = var; - pack_len_ptrs[pack_index] = len; - pack_len_sum += len; - pack_index += 1; - buffer += len; - } - } - Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; - dim3 pack_nthreads_per_block(block_size); - dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), - pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); - hipErrchk( hipGetLastError() ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - - Index_type unpack_index = 0; - Index_type unpack_len_sum = 0; - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - unpack_buffer_ptrs[unpack_index] = buffer; - unpack_list_ptrs[unpack_index] = list; - unpack_var_ptrs[unpack_index] = var; - unpack_len_ptrs[unpack_index] = len; - unpack_len_sum += len; - unpack_index += 1; - buffer += len; - } - } - Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; - dim3 unpack_nthreads_per_block(block_size); - dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), - unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); - hipErrchk( hipGetLastError() ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - - } - stopTimer(); - - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - using AllocatorHolder = RAJAPoolAllocatorHolder; - using Allocator = AllocatorHolder::Allocator; - - AllocatorHolder allocatorHolder; - - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::hip_work_async, -#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) - RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, -#else - RAJA::ordered, -#endif - RAJA::constant_stride_array_of_objects >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - 
RAJA::xargs<>, - Allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(res); - res.wait(); - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(res); - res.wait(); - - } - stopTimer(); - - } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; - } -} - -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Hip) - -} // end namespace apps -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp deleted file mode 100644 index 74dd5b0d5..000000000 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ /dev/null @@ -1,449 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/LICENSE file for details. 
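The removed RAJA_HIP variant above shows the fused-kernel flow: one range per
(neighbor, variable) pair is enqueued into a work pool, the pool is
instantiated into a work group, and the group runs as a single device launch.
Condensed sketch, where allocator, len, and body_lambda stand in for the
deleted code's specifics:

    // Many small pack/unpack loops, one launch.
    workpool pool(allocator);
    pool.reserve(num_neighbors * num_vars, 1024ull*1024ull);
    for (Index_type l = 0; l < num_neighbors; ++l) {
      for (Index_type v = 0; v < num_vars; ++v) {
        pool.enqueue(RAJA::TypedRangeSegment<Index_type>(0, len), body_lambda);
      }
    }
    workgroup group = pool.instantiate();  // fuse the enqueued ranges
    worksite site = group.run(res);        // single launch for the whole batch
    res.wait();
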
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "HALOEXCHANGE_FUSED.hpp" - -#include "RAJA/RAJA.hpp" - -#include "common/DataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace apps -{ - -HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) - : KernelBase(rajaperf::Apps_HALOEXCHANGE_FUSED, params) -{ - m_grid_dims_default[0] = 100; - m_grid_dims_default[1] = 100; - m_grid_dims_default[2] = 100; - m_halo_width_default = 1; - m_num_vars_default = 3; - - setDefaultProblemSize( m_grid_dims_default[0] * - m_grid_dims_default[1] * - m_grid_dims_default[2] ); - setDefaultReps(50); - - double cbrt_run_size = std::cbrt(getTargetProblemSize()); - - m_grid_dims[0] = cbrt_run_size; - m_grid_dims[1] = cbrt_run_size; - m_grid_dims[2] = cbrt_run_size; - m_halo_width = m_halo_width_default; - m_num_vars = m_num_vars_default; - - m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; - m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; - m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; - m_var_size = m_grid_plus_halo_dims[0] * - m_grid_plus_halo_dims[1] * - m_grid_plus_halo_dims[2] ; - - setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); - - setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); - setKernelsPerRep( 2 ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); - setFLOPsPerRep(0); - - setUsesFeature(Workgroup); - - setVariantDefined( Base_Seq ); - setVariantDefined( Lambda_Seq ); - setVariantDefined( RAJA_Seq ); - - setVariantDefined( Base_OpenMP ); - setVariantDefined( Lambda_OpenMP ); - setVariantDefined( RAJA_OpenMP ); - - setVariantDefined( Base_OpenMPTarget ); - setVariantDefined( RAJA_OpenMPTarget ); - - setVariantDefined( Base_CUDA ); - setVariantDefined( RAJA_CUDA ); - - setVariantDefined( Base_HIP ); - setVariantDefined( RAJA_HIP ); -} - -HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() -{ -} - -void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ - m_vars.resize(m_num_vars, nullptr); - for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); - auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); - - Real_ptr var = m_vars[v]; - - for (Index_type i = 0; i < m_var_size; i++) { - var[i] = i + v; - } - } - - m_pack_index_lists.resize(s_num_neighbors, nullptr); - m_pack_index_list_lengths.resize(s_num_neighbors, 0); - create_pack_lists(m_pack_index_lists, m_pack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_unpack_index_lists.resize(s_num_neighbors, nullptr); - m_unpack_index_list_lengths.resize(s_num_neighbors, 0); - create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_buffers.resize(s_num_neighbors, nullptr); - for (Index_type l = 0; l < s_num_neighbors; ++l) { - Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); - } -} - -void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) -{ - for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); - } -} - -void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx)) -{ - for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); - } - m_buffers.clear(); - - destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid); - m_unpack_index_list_lengths.clear(); - m_unpack_index_lists.clear(); - - destroy_pack_lists(m_pack_index_lists, s_num_neighbors, vid); - m_pack_index_list_lengths.clear(); - m_pack_index_lists.clear(); - - for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v], vid); - } - m_vars.clear(); -} - -namespace { - -struct Extent -{ - Index_type i_min; - Index_type i_max; - Index_type j_min; - Index_type j_max; - Index_type k_min; - Index_type k_max; -}; - -} - -// -// Function to generate index lists for packing. -// -void HALOEXCHANGE_FUSED::create_pack_lists( - std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector pack_index_list_extents(num_neighbors); - - // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + 
halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = pack_index_list_extents[l]; - - pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); - - Int_ptr pack_list = pack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type pack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - pack_list[list_idx] = pack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy packing index lists. -// -void HALOEXCHANGE_FUSED::destroy_pack_lists( - std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(pack_index_lists[l], vid); - } -} - -// -// Function to generate index lists for unpacking. 
-// -void HALOEXCHANGE_FUSED::create_unpack_lists( - std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector unpack_index_list_extents(num_neighbors); - - // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , 
halo_width}; - unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = unpack_index_list_extents[l]; - - unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - - Int_ptr unpack_list = unpack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type unpack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - unpack_list[list_idx] = unpack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy unpacking index lists. -// -void HALOEXCHANGE_FUSED::destroy_unpack_lists( - std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(unpack_index_lists[l], vid); - } -} - -} // end namespace apps -} // end namespace rajaperf diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 8fe91fbf7..a0142d1aa 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
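Both removed HALOEXCHANGE kernels ultimately reduce to a gather into a
contiguous message buffer and a scatter back out through an index list; the
PACK/UNPACK bodies, written out as standalone loops (illustrative types):

    void pack(double* buffer, const double* var, const int* list, int len)
    {
      for (int i = 0; i < len; ++i) { buffer[i] = var[list[i]]; }  // gather
    }

    void unpack(double* var, const double* buffer, const int* list, int len)
    {
      for (int i = 0; i < len; ++i) { var[list[i]] = buffer[i]; }  // scatter
    }
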
// @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for CUDA execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz @@ -91,11 +91,12 @@ void LTIMES::runCudaVariantImpl(VariantID vid) LTIMES_NBLOCKS_CUDA; constexpr size_t shmem = 0; - ltimes - <<>>(phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (ltimes), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); } stopTimer(); @@ -105,19 +106,24 @@ void LTIMES::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g, + Index_type m) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + }; + LTIMES_THREADS_PER_BLOCK_CUDA; LTIMES_NBLOCKS_CUDA; constexpr size_t shmem = 0; - ltimes_lam - <<>>(num_m, num_g, num_z, - [=] __device__ (Index_type z, Index_type g, Index_type m) { - for (Index_type d = 0; d < num_d; ++d ) { - LTIMES_BODY; - } - } - ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (ltimes_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_lambda ); } stopTimer(); @@ -144,14 +150,16 @@ void LTIMES::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(IDRange(0, num_d), - IZRange(0, num_z), - IGRange(0, num_g), - IMRange(0, num_m)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + res, [=] __device__ (ID d, IZ z, IG g, IM m) { - LTIMES_BODY_RAJA; - }); + LTIMES_BODY_RAJA; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 035bbc12d..949694d10 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
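The g_block_sz/z_block_sz macros above split the 1D tuning size into a 3D
thread block: m is pinned to 32 threads and the remaining block_size/32
threads are factored into the "squarest" pair, so block_size = 256 yields
256/32 = 8 -> (4, 2), i.e. a 32 x 4 x 2 block. A constexpr check with an
illustrative re-implementation of the factor-pair helper (the suite's
integer:: utilities are assumed to behave this way):

    constexpr size_t greater_of_squarest_pair(size_t n)
    {
      size_t lesser = 1;
      for (size_t d = 1; d * d <= n; ++d) {
        if (n % d == 0) { lesser = d; }
      }
      return n / lesser;                    // e.g. 8 -> 4 (pair is 2 x 4)
    }
    static_assert(greater_of_squarest_pair(256/32) == 4, "g_block_sz for 256");
    static_assert((256/32) / greater_of_squarest_pair(256/32) == 2, "z_block_sz for 256");
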
// @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for Hip execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz @@ -90,12 +90,12 @@ void LTIMES::runHipVariantImpl(VariantID vid) LTIMES_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (ltimes), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); } stopTimer(); @@ -105,21 +105,24 @@ void LTIMES::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g, + Index_type m) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + }; + LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto ltimes_lambda = - [=] __device__ (Index_type z, Index_type g, Index_type m) { - for (Index_type d = 0; d < num_d; ++d ) { - LTIMES_BODY; - } - }; - - hipLaunchKernelGGL((ltimes_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - num_m, num_g, num_z, ltimes_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (ltimes_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_lambda ); } stopTimer(); @@ -146,14 +149,16 @@ void LTIMES::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(IDRange(0, num_d), - IZRange(0, num_z), - IGRange(0, num_g), - IMRange(0, num_m)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + res, [=] __device__ (ID d, IZ z, IG g, IM m) { - LTIMES_BODY_RAJA; - }); + LTIMES_BODY_RAJA; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 93ce138ef..80c4a4a0e 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index 7ae4ee1e2..da7047d20 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp
index 33fd4b666..66503ed26 100644
--- a/src/apps/LTIMES-Seq.cpp
+++ b/src/apps/LTIMES-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp
new file mode 100644
index 000000000..541a132f7
--- /dev/null
+++ b/src/apps/LTIMES-Sycl.cpp
@@ -0,0 +1,116 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "LTIMES.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace apps
+{
+
+//
+// Define work-group shape for SYCL execution
+//
+#define m_wg_sz (32)
+#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz))
+#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz))
+
+template <size_t work_group_size>
+void LTIMES::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  LTIMES_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz),
+                              g_wg_sz * RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz),
+                              m_wg_sz * RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz));
+    sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type m = item.get_global_id(2);
+          Index_type g = item.get_global_id(1);
+          Index_type z = item.get_global_id(0);
+
+          if (m < num_m && g < num_g && z < num_z) {
+            for (Index_type d = 0; d < num_d; ++d) {
+              LTIMES_BODY;
+            }
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    LTIMES_VIEWS_RANGES_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<1, RAJA::sycl_global_2<z_wg_sz>,      //z
+            RAJA::statement::For<2, RAJA::sycl_global_1<g_wg_sz>,    //g
+              RAJA::statement::For<3, RAJA::sycl_global_0<m_wg_sz>,  //m
+                RAJA::statement::For<0, RAJA::seq_exec,              //d
+                  RAJA::statement::Lambda<0>
+                >
+              >
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_resource<EXEC_POL>(
+        RAJA::make_tuple(IDRange(0, num_d),
+                         IZRange(0, num_z),
+                         IGRange(0, num_g),
+                         IMRange(0, num_m)),
+        res,
+        [=] (ID d, IZ z, IG g, IM m) {
+          LTIMES_BODY_RAJA;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+     std::cout << "\n LTIMES : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(LTIMES, Sycl)
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp
index 0abb82d35..798d44715 100644
--- a/src/apps/LTIMES.cpp
+++ b/src/apps/LTIMES.cpp
@@ -1,5 +1,5 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -23,20 +23,15 @@ namespace apps LTIMES::LTIMES(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES, params) { - m_num_d_default = 64; - m_num_z_default = 488; - m_num_g_default = 32; - m_num_m_default = 25; + m_num_d = params.getLtimesNumD(); + m_num_g = params.getLtimesNumG(); + m_num_m = params.getLtimesNumM(); + Index_type num_z_default = std::max((Index_type{1000000} + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); - setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); + setDefaultProblemSize(m_num_d * m_num_g * num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / - (m_num_d_default * m_num_g_default), - Index_type(1) ); - m_num_g = m_num_g_default; - m_num_m = m_num_m_default; - m_num_d = m_num_d_default; + m_num_z = std::max((getTargetProblemSize() + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); m_philen = m_num_m * m_num_g * m_num_z; m_elllen = m_num_d * m_num_m; @@ -47,9 +42,11 @@ LTIMES::LTIMES(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // using total data size instead of writes and reads - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_philen + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_elllen + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_psilen ); + setBytesReadPerRep( 1*sizeof(Real_type) * m_philen + + 1*sizeof(Real_type) * m_elllen + + 1*sizeof(Real_type) * m_psilen ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d); checksum_scale_factor = 0.001 * @@ -77,6 +74,9 @@ LTIMES::LTIMES(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } LTIMES::~LTIMES() diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 2f3f0ca6d..0e74f187f 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
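The constructor change above swaps truncating division for a round-to-nearest form when sizing m_num_z: adding half the divisor before dividing rounds the quotient to the nearest integer (for non-negative operands) instead of always flooring it, so the actual problem size tracks the target size more closely. A standalone illustration:

    #include <cassert>

    // (a + b/2) / b rounds a/b to the nearest integer for a >= 0, b > 0.
    long long div_round_nearest(long long a, long long b)
    {
      return (a + b / 2) / b;
    }

    int main()
    {
      assert(div_round_nearest(10, 4) == 3); // 2.5  rounds up to 3
      assert(div_round_nearest( 9, 4) == 2); // 2.25 rounds down to 2
      assert(10 / 4 == 2);                   // plain division always floors
      return 0;
    }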
// @@ -116,28 +116,28 @@ class LTIMES : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_phidat; Real_ptr m_elldat; Real_ptr m_psidat; - Index_type m_num_d_default; - Index_type m_num_z_default; - Index_type m_num_g_default; - Index_type m_num_m_default; - Index_type m_num_d; Index_type m_num_z; Index_type m_num_g; diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index f12e5d131..9486f20e2 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for CUDA execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz @@ -90,11 +90,12 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) LTIMES_NOVIEW_NBLOCKS_CUDA; constexpr size_t shmem = 0; - ltimes_noview - <<>>(phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (ltimes_noview), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); } stopTimer(); @@ -104,19 +105,24 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; - LTIMES_NOVIEW_NBLOCKS_CUDA; - constexpr size_t shmem = 0; - - ltimes_noview_lam - <<>>(num_m, num_g, num_z, + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } - } - ); - cudaErrchk( cudaGetLastError() ); + }; + + LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; + LTIMES_NOVIEW_NBLOCKS_CUDA; + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( + (ltimes_noview_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_noview_lambda ); } stopTimer(); @@ -141,14 +147,17 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( 
RAJA::make_tuple(RAJA::RangeSegment(0, num_d), - RAJA::RangeSegment(0, num_z), - RAJA::RangeSegment(0, num_g), - RAJA::RangeSegment(0, num_m)), - res, - [=] __device__ (Index_type d, Index_type z, Index_type g, Index_type m) { - LTIMES_NOVIEW_BODY; - }); + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, + [=] __device__ (Index_type d, Index_type z, + Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 7252a5402..be6e2d756 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for Hip execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz @@ -90,12 +90,12 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) LTIMES_NOVIEW_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes_noview), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (ltimes_noview), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); } stopTimer(); @@ -105,22 +105,24 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; - LTIMES_NOVIEW_NBLOCKS_HIP; - constexpr size_t shmem = 0; - - auto ltimes_noview_lambda = + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } - }; + }; + + LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; + LTIMES_NOVIEW_NBLOCKS_HIP; + constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes_noview_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - num_m, num_g, num_z, - ltimes_noview_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (ltimes_noview_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_noview_lambda ); } stopTimer(); @@ -145,14 +147,17 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, num_d), - RAJA::RangeSegment(0, num_z), - RAJA::RangeSegment(0, num_g), - RAJA::RangeSegment(0, num_m)), - res, - [=] __device__ (Index_type d, Index_type z, Index_type g, Index_type m) { + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + 
RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, + [=] __device__ (Index_type d, Index_type z, + Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; - }); + } + ); } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index e9df87b83..900606076 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index 9a1f0bf06..1ffddaeaa 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index cd202004d..d4c6e4f41 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp new file mode 100644 index 000000000..d9b5cfaf6 --- /dev/null +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -0,0 +1,114 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
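The Lambda_CUDA and Lambda_HIP rewrites above bundle the loop body into a device lambda and pass it to the ltimes_noview_lam kernel. That kernel's definition sits outside these hunks; a plausible shape, with assumed names and an assumed m/g/z to x/y/z index mapping, is:

    #include <hip/hip_runtime.h>

    // Hypothetical sketch of a lambda-carrying kernel like ltimes_noview_lam:
    // a templated __global__ function applies the captured body at each
    // bounds-checked (z, g, m) index.
    template < typename Body >
    __global__ void ltimes_lam_sketch(int num_m, int num_g, int num_z,
                                      Body body)
    {
      int m = blockIdx.x * blockDim.x + threadIdx.x;
      int g = blockIdx.y * blockDim.y + threadIdx.y;
      int z = blockIdx.z * blockDim.z + threadIdx.z;
      if (m < num_m && g < num_g && z < num_z) {
        body(z, g, m); // body captures phidat/elldat/psidat by value
      }
    }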
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES_NOVIEW.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +// +// Define work-group shape for SYCL execution +// +#define m_wg_sz (32) +#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz)) +#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz)) + +template +void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + LTIMES_NOVIEW_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), + g_wg_sz * RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), + m_wg_sz * RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); + sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type m = item.get_global_id(2); + Index_type g = item.get_global_id(1); + Index_type z = item.get_global_id(0); + + if (m < num_m && g < num_g && z < num_z) { + for (Index_type d = 0; d < num_d; ++d) { + LTIMES_NOVIEW_BODY; + } + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<1, RAJA::sycl_global_2, //z + RAJA::statement::For<2, RAJA::sycl_global_1, //g + RAJA::statement::For<3, RAJA::sycl_global_0, //m + RAJA::statement::For<0, RAJA::seq_exec, //d + RAJA::statement::Lambda<0> + > + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, + [=] (Index_type d, Index_type z, Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n LTIMES_NOVIEW : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(LTIMES_NOVIEW, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index a106d5418..0e675d487 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
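The Base_SYCL setup above pads each global extent up to a multiple of its work-group extent, because a SYCL nd_range requires the global range to divide evenly by the local range; the in-kernel bounds check then masks the padding work-items. The padding arithmetic, stated standalone:

    #include <cstddef>

    constexpr size_t divide_ceiling(size_t n, size_t d)
    {
      return (n + d - 1) / d; // ceiling division for positive integers
    }

    constexpr size_t padded_extent(size_t n, size_t wg)
    {
      return wg * divide_ceiling(n, wg); // smallest multiple of wg >= n
    }

    static_assert(padded_extent(25, 32) == 32,  "num_m = 25, m_wg_sz = 32");
    static_assert(padded_extent(100, 8) == 104, "13 groups of 8");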
// @@ -23,20 +23,15 @@ namespace apps LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES_NOVIEW, params) { - m_num_d_default = 64; - m_num_z_default = 488; - m_num_g_default = 32; - m_num_m_default = 25; + m_num_d = params.getLtimesNumD(); + m_num_g = params.getLtimesNumG(); + m_num_m = params.getLtimesNumM(); + Index_type num_z_default = std::max((Index_type{1000000} + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); - setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); + setDefaultProblemSize(m_num_d * m_num_g * num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / - (m_num_d_default * m_num_g_default), - Index_type(1) ); - m_num_g = m_num_g_default; - m_num_m = m_num_m_default; - m_num_d = m_num_d_default; + m_num_z = std::max((getTargetProblemSize() + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); m_philen = m_num_m * m_num_g * m_num_z; m_elllen = m_num_d * m_num_m; @@ -47,9 +42,11 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // using total data size instead of writes and reads - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_philen + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_elllen + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_psilen ); + setBytesReadPerRep( 1*sizeof(Real_type) * m_philen + + 1*sizeof(Real_type) * m_elllen + + 1*sizeof(Real_type) * m_psilen ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d); checksum_scale_factor = 0.001 * @@ -76,6 +73,9 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } LTIMES_NOVIEW::~LTIMES_NOVIEW() diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 96a296366..4829b8171 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
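The accounting change above splits the old combined setBytesPerRep into explicit read and write tallies: phi accumulates into itself, so it counts as both read and written, while ell and psi are read-only and nothing is written atomically. Restated as a small sketch, with double standing in for Real_type:

    #include <cstddef>

    struct BytesPerRep { size_t read; size_t written; };

    // Traffic model for one LTIMES / LTIMES_NOVIEW rep: phi is
    // read-modify-write, ell and psi are read-only.
    BytesPerRep ltimes_bytes(size_t philen, size_t elllen, size_t psilen)
    {
      const size_t real = sizeof(double);
      return { (philen + elllen + psilen) * real,  // bytes read
               philen * real };                    // bytes written
    }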
// @@ -66,28 +66,28 @@ class LTIMES_NOVIEW : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_phidat; Real_ptr m_elldat; Real_ptr m_psidat; - Index_type m_num_d_default; - Index_type m_num_z_default; - Index_type m_num_g_default; - Index_type m_num_m_default; - Index_type m_num_d; Index_type m_num_z; Index_type m_num_g; diff --git a/src/apps/MASS3DEA-Cuda.cpp b/src/apps/MASS3DEA-Cuda.cpp index 87d918b11..649fd5b01 100644 --- a/src/apps/MASS3DEA-Cuda.cpp +++ b/src/apps/MASS3DEA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -69,15 +69,16 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Mass3DEA<<>>(B, D, M); + dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + constexpr size_t shmem = 0; - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (Mass3DEA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, M ); } stopTimer(); diff --git a/src/apps/MASS3DEA-Hip.cpp b/src/apps/MASS3DEA-Hip.cpp index 7184694e3..2eeabadeb 100644 --- a/src/apps/MASS3DEA-Hip.cpp +++ b/src/apps/MASS3DEA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -69,17 +69,16 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DEA), dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - B, D, M); + dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + constexpr size_t shmem = 0; - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (Mass3DEA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, M ); } stopTimer(); diff --git a/src/apps/MASS3DEA-OMP.cpp b/src/apps/MASS3DEA-OMP.cpp index 7dd1ab122..2b77eeb6e 100644 --- a/src/apps/MASS3DEA-OMP.cpp +++ b/src/apps/MASS3DEA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-OMPTarget.cpp b/src/apps/MASS3DEA-OMPTarget.cpp index b3d8aa75f..6f41914ab 100644 --- a/src/apps/MASS3DEA-OMPTarget.cpp +++ b/src/apps/MASS3DEA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-Seq.cpp b/src/apps/MASS3DEA-Seq.cpp index bc906f0f6..f3b0cfc99 100644 --- a/src/apps/MASS3DEA-Seq.cpp +++ b/src/apps/MASS3DEA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-Sycl.cpp b/src/apps/MASS3DEA-Sycl.cpp new file mode 100644 index 000000000..a2dfd87f3 --- /dev/null +++ b/src/apps/MASS3DEA-Sycl.cpp @@ -0,0 +1,196 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
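In both MASS3DEA launch rewrites above, the explicit dim3 nblocks(NE) disappears and NE is passed directly as the grid argument. That works because dim3 has a non-explicit constructor with defaulted y and z components, so a scalar converts implicitly to a 1D grid:

    #include <hip/hip_runtime.h>
    #include <cassert>

    int main()
    {
      dim3 grid = 512; // scalar -> dim3(512, 1, 1)
      assert(grid.x == 512 && grid.y == 1 && grid.z == 1);
      return 0;
    }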
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MASS3DEA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include <iostream> + +namespace rajaperf { +namespace apps { + +template < size_t work_group_size > +void MASS3DEA::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MASS3DEA_DATA_SETUP; + + switch (vid) { + + case Base_SYCL: { + + const ::sycl::range<3> workGroupSize(MEA_Q1D, MEA_Q1D, MEA_Q1D); + const ::sycl::range<3> gridSize(MEA_Q1D,MEA_Q1D,MEA_Q1D*NE); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + constexpr size_t shmem = 0; + qu->submit([&](cl::sycl::handler& h) { + + ::sycl::local_accessor<double, 2> s_B(::sycl::range<2>(MEA_Q1D,MEA_D1D),h); + ::sycl::local_accessor<double, 3> s_D(::sycl::range<3>(MEA_Q1D,MEA_Q1D,MEA_Q1D),h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (cl::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(2); + + SYCL_FOREACH_THREAD(iz, 0, 1) { + SYCL_FOREACH_THREAD(d, 2, MEA_D1D) { + SYCL_FOREACH_THREAD(q, 1, MEA_Q1D) { + MASS3DEA_1 + } + } + } + + //not needed as we dynamically allocate shared memory in sycl + //MASS3DEA_2 + + SYCL_FOREACH_THREAD(k1, 2, MEA_Q1D) { + SYCL_FOREACH_THREAD(k2, 1, MEA_Q1D) { + SYCL_FOREACH_THREAD(k3, 0, MEA_Q1D) { + MASS3DEA_3 + } + } + } + + itm.barrier(::sycl::access::fence_space::local_space); + + SYCL_FOREACH_THREAD(i1, 2, MEA_D1D) { + SYCL_FOREACH_THREAD(i2, 1, MEA_D1D) { + SYCL_FOREACH_THREAD(i3, 0, MEA_D1D) { + MASS3DEA_4 + } + } + } + + }); + }); + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy<RAJA::sycl_launch_t<async>>; + + using outer_x = RAJA::LoopPolicy<RAJA::sycl_group_2_direct>; + + using inner_x = RAJA::LoopPolicy<RAJA::sycl_local_2_direct>; + + using inner_y = RAJA::LoopPolicy<RAJA::sycl_local_1_direct>; + + using inner_z = RAJA::LoopPolicy<RAJA::sycl_local_0_direct>; + + constexpr size_t shmem = (MEA_Q1D*MEA_D1D + MEA_Q1D*MEA_Q1D*MEA_Q1D)*sizeof(double); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch<launch_policy>( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + double * s_B_ptr = ctx.getSharedMemory<double>(MEA_Q1D*MEA_D1D); + double * s_D_ptr = ctx.getSharedMemory<double>(MEA_Q1D*MEA_Q1D*MEA_Q1D); + + double (*s_B)[MEA_D1D] = (double (*)[MEA_D1D]) s_B_ptr; + double (*s_D)[MEA_Q1D][MEA_Q1D] = (double (*)[MEA_Q1D][MEA_Q1D]) s_D_ptr; + + RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1), + [&](int ) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int d) { + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int q) { + MASS3DEA_1 + } + ); // RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + //not needed as we dynamically allocate shared memory in sycl + //MASS3DEA_2 + + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k1) { + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k2) { + RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k3) { + MASS3DEA_3 + } + ); // RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i1) { + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i2) { + RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i3) { + MASS3DEA_4 + } + ); // 
RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DEA : Unknown Sycl variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DEA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index d6239222f..9689c35ae 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,20 +29,21 @@ MASS3DEA::MASS3DEA(const RunParams& params) setDefaultReps(1); const int ea_mat_entries = MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D; - - m_NE = std::max(getTargetProblemSize()/(ea_mat_entries), Index_type(1)); + + m_NE = std::max((getTargetProblemSize() + (ea_mat_entries)/2) / (ea_mat_entries), Index_type(1)); setActualProblemSize( m_NE*ea_mat_entries); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( MEA_Q1D*MEA_D1D*sizeof(Real_type) + // B - MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE*sizeof(Real_type) + // D - ea_mat_entries*m_NE*sizeof(Real_type) ); // M_e + setBytesReadPerRep( 1*sizeof(Real_type) * MEA_Q1D*MEA_D1D + // B + 1*sizeof(Real_type) * MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE ); // D + setBytesWrittenPerRep( 1*sizeof(Real_type) * ea_mat_entries*m_NE ); // M_e + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * 7 * ea_mat_entries); - + setUsesFeature(Launch); setVariantDefined( Base_Seq ); @@ -57,6 +58,9 @@ MASS3DEA::MASS3DEA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } MASS3DEA::~MASS3DEA() diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index df029299e..7c0ea6e02 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
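The RAJA_SYCL variant above sizes one dynamic shared-memory block at launch (MEA_Q1D*MEA_D1D + MEA_Q1D*MEA_Q1D*MEA_Q1D doubles) and carves it with successive ctx.getSharedMemory calls, then views each flat slice as a fixed-shape array. A host-side sketch of that carving pattern, with a plain static buffer standing in for team shared memory and illustrative sizes:

    #include <cstddef>

    constexpr int Q1D = 4, D1D = 3;

    // Hands out successive slices of one flat allocation, in the spirit
    // of RAJA::LaunchContext::getSharedMemory (an assumption, not RAJA's
    // implementation).
    struct Arena {
      double* base;
      std::size_t used;
      double* get(std::size_t n) { double* p = base + used; used += n; return p; }
    };

    int main()
    {
      static double shmem[Q1D*D1D + Q1D*Q1D*Q1D];
      Arena arena{shmem, 0};

      double* s_B_ptr = arena.get(Q1D * D1D);
      double* s_D_ptr = arena.get(Q1D * Q1D * Q1D);

      // Each fixed-shape view must be built from its own slice:
      // s_B from s_B_ptr and s_D from s_D_ptr, or the arrays alias.
      double (*s_B)[D1D]      = (double (*)[D1D]) s_B_ptr;
      double (*s_D)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) s_D_ptr;

      s_B[0][0] = 1.0;
      s_D[Q1D-1][Q1D-1][Q1D-1] = 2.0;
      return 0;
    }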
// @@ -104,11 +104,9 @@ #define MASS3DEA_1 s_B[q][d] = B_MEA_(q, d); #define MASS3DEA_2 \ - double(*l_B)[MEA_D1D] = (double(*)[MEA_D1D])s_B; \ RAJA_TEAM_SHARED double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; #define MASS3DEA_2_CPU \ - double(*l_B)[MEA_D1D] = (double(*)[MEA_D1D])s_B; \ double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; #define MASS3DEA_3 s_D[k1][k2][k3] = D_MEA_(k1, k2, k3, e); @@ -123,9 +121,9 @@ for (int k2 = 0; k2 < MEA_Q1D; ++k2) { \ for (int k3 = 0; k3 < MEA_Q1D; ++k3) { \ \ - val += l_B[k1][i1] * l_B[k1][j1] * l_B[k2][i2] \ - * l_B[k2][j2] * \ - l_B[k3][i3] * l_B[k3][j3] * s_D[k1][k2][k3]; \ + val += s_B[k1][i1] * s_B[k1][j1] * s_B[k2][i2] \ + * s_B[k2][j2] * \ + s_B[k3][i3] * s_B[k3][j3] * s_D[k1][k2][k3]; \ } \ } \ } \ @@ -154,16 +152,23 @@ class MASS3DEA : public KernelBase { void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template void runCudaVariantImpl(VariantID vid); - template void runHipVariantImpl(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template + void runCudaVariantImpl(VariantID vid); + template + void runHipVariantImpl(VariantID vid); + template + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = MEA_D1D * MEA_D1D * MEA_D1D; using gpu_block_sizes_type = - gpu_block_size::list_type; + integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index a8f769d6a..60092ef7f 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -99,15 +99,16 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Mass3DPA<<>>(B, Bt, D, X, Y); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); + constexpr size_t shmem = 0; - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (Mass3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, Bt, D, X, Y ); } stopTimer(); diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index c9d600136..1fbd0dea9 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -99,17 +99,16 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DPA), dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - B, Bt, D, X, Y); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); + constexpr size_t shmem = 0; - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (Mass3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, Bt, D, X, Y ); } stopTimer(); diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index f2e122fed..4c6b2867c 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index e4cc02a4f..d74c14641 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 39087834d..e22068169 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Sycl.cpp b/src/apps/MASS3DPA-Sycl.cpp new file mode 100644 index 000000000..7d65034a7 --- /dev/null +++ b/src/apps/MASS3DPA-Sycl.cpp @@ -0,0 +1,316 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t work_group_size > +void MASS3DPA::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MASS3DPA_DATA_SETUP; + + const ::sycl::range<3> workGroupSize(1, MPA_Q1D, MPA_Q1D); + const ::sycl::range<3> gridSize(1, MPA_Q1D, MPA_Q1D*NE); + + switch (vid) { + + case Base_SYCL: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&](cl::sycl::handler& h) { + + constexpr int MQ1 = MPA_Q1D; + constexpr int MD1 = MPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + auto sDQ_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1 * MD1), h); + auto sm0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + auto sm1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (cl::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(2); + + double *sDQ = sDQ_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1 = sm1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; + double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; + + double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; + double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; + double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; + double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; + double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; + double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + + SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D){ + MASS3DPA_1 + } + SYCL_FOREACH_THREAD(dx, 2, MPA_Q1D) { + MASS3DPA_2 + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + MASS3DPA_3 + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + MASS3DPA_4 + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + MASS3DPA_5 + } + } + + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(d, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(q, 2, MPA_Q1D) { + MASS3DPA_6 + } + } + + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + MASS3DPA_7 + } + } + itm.barrier(::sycl::access::fence_space::local_space); + + SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + MASS3DPA_8 + } + } + + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + MASS3DPA_9 + } + } + + }); + }); + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy; + + 
using inner_y = RAJA::LoopPolicy<RAJA::sycl_local_1_direct>; + + //Calculate the amount of shared memory needed + size_t shmem = 0; + { + constexpr int MQ1 = MPA_Q1D; + constexpr int MD1 = MPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + constexpr int no_mats = 2; + shmem += (MQ1 * MD1 + no_mats * MDQ * MDQ * MDQ) * sizeof(double); + } + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch<launch_policy>( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MPA_Q1D, MPA_Q1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + //Redefine inside the lambda to keep consistent with the base version + constexpr int MQ1 = MPA_Q1D; + constexpr int MD1 = MPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + double *sDQ = ctx.getSharedMemory<double>(MQ1 * MD1); + double *sm0 = ctx.getSharedMemory<double>(MDQ * MDQ * MDQ); + double *sm1 = ctx.getSharedMemory<double>(MDQ * MDQ * MDQ); + + double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; + double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; + + double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; + double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; + double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; + double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; + double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; + double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dx) { + MASS3DPA_1 + } + ); // RAJA::loop + + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int dx) { + MASS3DPA_2 + } + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qx) { + MASS3DPA_3 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qx) { + MASS3DPA_4 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qx) { + MASS3DPA_5 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int d) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int q) { + MASS3DPA_6 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_Q1D), + [&](int qy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dx) { + MASS3DPA_7 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dx) { + MASS3DPA_8 + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dy) { + RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MPA_D1D), + [&](int dx) { + MASS3DPA_9 + } + ); // RAJA::loop + } + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA : Unknown Sycl variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + 
+#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 1c99e9f73..f60e64c3e 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,18 +28,18 @@ MASS3DPA::MASS3DPA(const RunParams& params) setDefaultProblemSize(m_NE_default*MPA_Q1D*MPA_Q1D*MPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (MPA_Q1D*MPA_Q1D*MPA_Q1D)/2) / (MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); setActualProblemSize( m_NE*MPA_Q1D*MPA_Q1D*MPA_Q1D ); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( MPA_Q1D*MPA_D1D*sizeof(Real_type) + - MPA_Q1D*MPA_D1D*sizeof(Real_type) + - MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE*sizeof(Real_type) + - MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) + - MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * MPA_Q1D*MPA_D1D + // B, Bt + 2*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X, Y + 1*sizeof(Real_type) * MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE ); // D + setBytesWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D + 2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D + @@ -61,6 +61,9 @@ MASS3DPA::MASS3DPA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } MASS3DPA::~MASS3DPA() diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 7365fa011..0e11e234b 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -363,17 +363,22 @@ class MASS3DPA : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/MATVEC_3D_STENCIL-Cuda.cpp b/src/apps/MATVEC_3D_STENCIL-Cuda.cpp new file mode 100644 index 000000000..e5ef75f08 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-Cuda.cpp @@ -0,0 +1,204 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void matvec_3d(Real_ptr b, + Real_ptr xdbl, + Real_ptr xdbc, + Real_ptr xdbr, + Real_ptr xdcl, + Real_ptr xdcc, + Real_ptr xdcr, + Real_ptr xdfl, + Real_ptr xdfc, + Real_ptr xdfr, + Real_ptr xcbl, + Real_ptr xcbc, + Real_ptr xcbr, + Real_ptr xccl, + Real_ptr xccc, + Real_ptr xccr, + Real_ptr xcfl, + Real_ptr xcfc, + Real_ptr xcfr, + Real_ptr xubl, + Real_ptr xubc, + Real_ptr xubr, + Real_ptr xucl, + Real_ptr xucc, + Real_ptr xucr, + Real_ptr xufl, + Real_ptr xufc, + Real_ptr xufr, + Real_ptr dbl, + Real_ptr dbc, + Real_ptr dbr, + Real_ptr dcl, + Real_ptr dcc, + Real_ptr dcr, + Real_ptr dfl, + Real_ptr dfc, + Real_ptr dfr, + Real_ptr cbl, + Real_ptr cbc, + Real_ptr cbr, + Real_ptr ccl, + Real_ptr ccc, + Real_ptr ccr, + Real_ptr cfl, + Real_ptr cfc, + Real_ptr cfr, + Real_ptr ubl, + Real_ptr ubc, + Real_ptr ubr, + Real_ptr ucl, + Real_ptr ucc, + Real_ptr ucr, + Real_ptr ufl, + Real_ptr ufc, + Real_ptr ufr, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + } +} + + +template < size_t block_size > +void MATVEC_3D_STENCIL::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getCudaResource()}; + + MATVEC_3D_STENCIL_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (matvec_3d), + grid_size, block_size, + shmem, res.get_stream(), + b, + xdbl, + xdbc, + xdbr, + xdcl, + xdcc, + xdcr, + xdfl, + xdfc, + xdfr, 
+ xcbl, + xcbc, + xcbr, + xccl, + xccc, + xccr, + xcfl, + xcfc, + xcfr, + xubl, + xubc, + xubr, + xucl, + xucc, + xucr, + xufl, + xufc, + xufr, + dbl, + dbc, + dbr, + dcl, + dcc, + dcr, + dfl, + dfc, + dfr, + cbl, + cbc, + cbr, + ccl, + ccc, + ccr, + cfl, + cfc, + cfr, + ubl, + ubc, + ubr, + ucl, + ucc, + ucr, + ufl, + ufc, + ufr, + real_zones, + ibegin, iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( res, + zones, [=] __device__ (Index_type i) { + MATVEC_3D_STENCIL_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D_STENCIL : Unknown Cuda variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Cuda) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/MATVEC_3D_STENCIL-Hip.cpp b/src/apps/MATVEC_3D_STENCIL-Hip.cpp new file mode 100644 index 000000000..a24757cb8 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-Hip.cpp @@ -0,0 +1,204 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void matvec_3d(Real_ptr b, + Real_ptr xdbl, + Real_ptr xdbc, + Real_ptr xdbr, + Real_ptr xdcl, + Real_ptr xdcc, + Real_ptr xdcr, + Real_ptr xdfl, + Real_ptr xdfc, + Real_ptr xdfr, + Real_ptr xcbl, + Real_ptr xcbc, + Real_ptr xcbr, + Real_ptr xccl, + Real_ptr xccc, + Real_ptr xccr, + Real_ptr xcfl, + Real_ptr xcfc, + Real_ptr xcfr, + Real_ptr xubl, + Real_ptr xubc, + Real_ptr xubr, + Real_ptr xucl, + Real_ptr xucc, + Real_ptr xucr, + Real_ptr xufl, + Real_ptr xufc, + Real_ptr xufr, + Real_ptr dbl, + Real_ptr dbc, + Real_ptr dbr, + Real_ptr dcl, + Real_ptr dcc, + Real_ptr dcr, + Real_ptr dfl, + Real_ptr dfc, + Real_ptr dfr, + Real_ptr cbl, + Real_ptr cbc, + Real_ptr cbr, + Real_ptr ccl, + Real_ptr ccc, + Real_ptr ccr, + Real_ptr cfl, + Real_ptr cfc, + Real_ptr cfr, + Real_ptr ubl, + Real_ptr ubc, + Real_ptr ubr, + Real_ptr ucl, + Real_ptr ucc, + Real_ptr ucr, + Real_ptr ufl, + Real_ptr ufc, + Real_ptr ufr, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + } +} + + +template < size_t block_size > +void MATVEC_3D_STENCIL::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getHipResource()}; + + MATVEC_3D_STENCIL_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (matvec_3d), + 
grid_size, block_size, + shmem, res.get_stream(), + b, + xdbl, + xdbc, + xdbr, + xdcl, + xdcc, + xdcr, + xdfl, + xdfc, + xdfr, + xcbl, + xcbc, + xcbr, + xccl, + xccc, + xccr, + xcfl, + xcfc, + xcfr, + xubl, + xubc, + xubr, + xucl, + xucc, + xucr, + xufl, + xufc, + xufr, + dbl, + dbc, + dbr, + dcl, + dcc, + dcr, + dfl, + dfc, + dfr, + cbl, + cbc, + cbr, + ccl, + ccc, + ccr, + cfl, + cfc, + cfr, + ubl, + ubc, + ubr, + ucl, + ucc, + ucr, + ufl, + ufc, + ufr, + real_zones, + ibegin, iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( res, + zones, [=] __device__ (Index_type i) { + MATVEC_3D_STENCIL_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D_STENCIL : Unknown Hip variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Hip) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/MATVEC_3D_STENCIL-OMP.cpp b/src/apps/MATVEC_3D_STENCIL-OMP.cpp new file mode 100644 index 000000000..6365aed70 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-OMP.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void MATVEC_3D_STENCIL::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_STENCIL_DATA_SETUP; + + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto matvec_3d_lam = [=](Index_type ii) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + matvec_3d_lam(ii); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + camp::resources::Resource working_res{camp::resources::Host::get_default()}; + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); + + auto matvec_3d_lam = [=](Index_type i) { + MATVEC_3D_STENCIL_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + zones, matvec_3d_lam); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MATVEC_3D_STENCIL : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp b/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp new file mode 
100644 index 000000000..09a3093d4 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp @@ -0,0 +1,87 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void MATVEC_3D_STENCIL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_STENCIL_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(b, \ + dbl, dbc, dbr, dcl, dcc, dcr, dfl, dfc, dfr, \ + xdbl, xdbc, xdbr, xdcl, xdcc, xdcr, xdfl, xdfc, xdfr, \ + cbl, cbc, cbr, ccl, ccc, ccr, cfl, cfc, cfr, \ + xcbl, xcbc, xcbr, xccl, xccc, xccr, xcfl, xcfc, xcfr, \ + ubl, ubc, ubr, ucl, ucc, ucr, ufl, ufc, ufr, \ + xubl, xubc, xubr, xucl, xucc, xucr, xufl, xufc, xufr, \ + real_zones) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; + } + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + camp::resources::Resource working_res{camp::resources::Omp::get_default()}; + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + zones, [=](Index_type i) { + MATVEC_3D_STENCIL_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D_STENCIL : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/MATVEC_3D_STENCIL-Seq.cpp b/src/apps/MATVEC_3D_STENCIL-Seq.cpp new file mode 100644 index 000000000..795a01e19 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-Seq.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace apps +{ + + +void MATVEC_3D_STENCIL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = 0; +  const Index_type iend = m_domain->n_real_zones; + +  MATVEC_3D_STENCIL_DATA_SETUP; + +  switch ( vid ) { + +    case Base_Seq : { + +      startTimer(); +      for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +        for (Index_type ii = ibegin ; ii < iend ; ++ii ) { +          MATVEC_3D_STENCIL_BODY_INDEX; +          MATVEC_3D_STENCIL_BODY; +        } + +      } +      stopTimer(); + +      break; +    } + +#if defined(RUN_RAJA_SEQ) +    case Lambda_Seq : { + +      auto matvec_3d_lam = [=](Index_type ii) { +                             MATVEC_3D_STENCIL_BODY_INDEX; +                             MATVEC_3D_STENCIL_BODY; +                           }; + +      startTimer(); +      for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +        for (Index_type ii = ibegin ; ii < iend ; ++ii ) { +          matvec_3d_lam(ii); +        } + +      } +      stopTimer(); + +      break; +    } + +    case RAJA_Seq : { + +      camp::resources::Resource working_res{camp::resources::Host::get_default()}; +      RAJA::TypedListSegment<Index_type> zones(real_zones, iend, +                                               working_res, RAJA::Unowned); + +      auto matvec_3d_lam = [=](Index_type i) { +                             MATVEC_3D_STENCIL_BODY; +                           }; + +      startTimer(); +      for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +        RAJA::forall<RAJA::seq_exec>(zones, matvec_3d_lam); + +      } +      stopTimer(); + +      break; +    } +#endif // RUN_RAJA_SEQ + +    default : { +      getCout() << "\n  MATVEC_3D_STENCIL : Unknown variant id = " << vid << std::endl; +    } + +  } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL-Sycl.cpp b/src/apps/MATVEC_3D_STENCIL-Sycl.cpp new file mode 100644 index 000000000..c6110f2d8 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL-Sycl.cpp @@ -0,0 +1,89 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
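All RAJA variants of this kernel iterate a `TypedListSegment` built over `real_zones`, so the lambda receives the gathered zone index directly and the explicit `MATVEC_3D_STENCIL_BODY_INDEX` lookup disappears. A minimal sketch of that idiom with a host-resident index list (illustrative names, not the suite's setup):

  #include "RAJA/RAJA.hpp"

  // idx holds the zone indices to visit; RAJA::Unowned means the segment
  // aliases idx, so the caller keeps ownership and must keep it alive.
  void forall_over_list(double* b, const int* idx, int n)
  {
    camp::resources::Resource res{camp::resources::Host::get_default()};
    RAJA::TypedListSegment<int> zones(idx, n, res, RAJA::Unowned);
    RAJA::forall<RAJA::seq_exec>(zones, [=](int i) {
      b[i] += 1.0;   // the loop body sees the real zone index i
    });
  }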
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "AppsData.hpp" + +#include <iostream> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template <size_t work_group_size> +void MATVEC_3D_STENCIL::runSyclVariantImpl(VariantID vid) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = 0; +  const Index_type iend = m_domain->n_real_zones; + +  auto res{getSyclResource()}; +  auto qu = res.get_queue(); + +  MATVEC_3D_STENCIL_DATA_SETUP; + +  if ( vid == Base_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), +                       [=] (sycl::nd_item<1> item) { + +          Index_type ii = item.get_global_id(0); +          Index_type i = ii + ibegin; +          if (i < iend) { +            MATVEC_3D_STENCIL_BODY_INDEX; +            MATVEC_3D_STENCIL_BODY; +          } + +        }); +      }); + +    } +    stopTimer(); + +  } else if ( vid == RAJA_SYCL ) { + +    RAJA::TypedListSegment<Index_type> zones(real_zones, iend, +                                             res, RAJA::Unowned); + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(res, +        zones, [=](Index_type i) { +        MATVEC_3D_STENCIL_BODY; +      }); + +    } +    stopTimer(); + +  } else { +     std::cout << "\n  MATVEC_3D_STENCIL : Unknown Sycl variant id = " << vid << std::endl; +  } + +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif  // RAJA_ENABLE_SYCL diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp new file mode 100644 index 000000000..2b8cb6978 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL.cpp @@ -0,0 +1,189 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
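The Base_SYCL variant pads the launch to a multiple of the work-group size and guards the tail, since `sycl::nd_range` requires the global size to divide evenly by the local size. The arithmetic in isolation: with iend = 1000 and work_group_size = 256, global_size = 256 * 4 = 1024, and work-items 1000..1023 fail the `i < iend` test and do nothing. A reduced sketch (illustrative kernel body, not the suite's):

  #include <sycl/sycl.hpp>

  void launch(sycl::queue& q, double* b, size_t iend, size_t wg)
  {
    const size_t global_size = wg * ((iend + wg - 1) / wg);  // round up
    q.submit([&](sycl::handler& h) {
      h.parallel_for(sycl::nd_range<1>(global_size, wg),
                     [=](sycl::nd_item<1> item) {
        const size_t i = item.get_global_id(0);
        if (i < iend) { b[i] += 1.0; }   // mask the padded tail
      });
    });
  }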
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D_STENCIL.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" +#include "common/DataUtils.hpp" + +#include <cmath> + + +namespace rajaperf +{ +namespace apps +{ + + +MATVEC_3D_STENCIL::MATVEC_3D_STENCIL(const RunParams& params) +  : KernelBase(rajaperf::Apps_MATVEC_3D_STENCIL, params) +{ +  setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct +  setDefaultReps(100); + +  Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; +  m_domain = new ADomain(rzmax, /* ndims = */ 3); + +  m_zonal_array_length = m_domain->lpz+1; + +  setActualProblemSize( m_domain->n_real_zones ); + +  setItsPerRep( getActualProblemSize() ); +  setKernelsPerRep(1); + +  // touched data size, not actual number of stores and loads +  const size_t ilen = m_domain->imax - m_domain->imin; +  const size_t jlen = m_domain->jmax - m_domain->jmin; +  const size_t klen = m_domain->kmax - m_domain->kmin; +  auto get_size_extra = [&](size_t iextra, size_t jextra, size_t kextra) { +    return (ilen + iextra) * (jlen + jextra) * (klen + kextra); +  }; +  auto get_size_matrix = [&](size_t ishift, size_t jshift, size_t kshift) { +    // get the used size of matrix coefficient allocations +    return get_size_extra(0,0,0) +            // real zones +           (get_size_extra(0,0,0) - (ilen - ishift) * // plus some extra from the +                                    (jlen - jshift) * // edges based on the shift +                                    (klen - kshift)); +  }; + +  const size_t b_accessed = get_size_extra(0, 0, 0); +  const size_t x_accessed = get_size_extra(2, 2, 2) ; +  const size_t m_accessed = get_size_matrix(0, 0, 0) + +                            get_size_matrix(1, 0, 0) + +                            get_size_matrix(1, 1, 0) + +                            get_size_matrix(0, 1, 0) + +                            get_size_matrix(1, 1, 0) + +                            get_size_matrix(1, 1, 1) + +                            get_size_matrix(0, 1, 1) + +                            get_size_matrix(1, 1, 1) + +                            get_size_matrix(1, 0, 1) + +                            get_size_matrix(0, 0, 1) + +                            get_size_matrix(1, 0, 1) + +                            get_size_matrix(1, 1, 1) + +                            get_size_matrix(0, 1, 1) + +                            get_size_matrix(1, 1, 1) ; +  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + +                      1*sizeof(Real_type) * x_accessed + +                      1*sizeof(Real_type) * m_accessed ); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * b_accessed ); +  setBytesAtomicModifyWrittenPerRep( 0 ); + +  const size_t multiplies = 27; +  const size_t adds = 26; +  setFLOPsPerRep((multiplies + adds) * getItsPerRep()); + +  checksum_scale_factor = 1.0 * +              ( static_cast<Checksum_type>(getDefaultProblemSize()) / +                getActualProblemSize() ); + +  setUsesFeature(Forall); + +  setVariantDefined( Base_Seq ); +  setVariantDefined( Lambda_Seq ); +  setVariantDefined( RAJA_Seq ); + +  setVariantDefined( Base_OpenMP ); +  setVariantDefined( Lambda_OpenMP ); +  setVariantDefined( RAJA_OpenMP ); + +  setVariantDefined( Base_OpenMPTarget ); +  setVariantDefined( RAJA_OpenMPTarget ); + +  setVariantDefined( Base_CUDA ); +  setVariantDefined( RAJA_CUDA ); + +  setVariantDefined( Base_HIP ); +  setVariantDefined( RAJA_HIP ); + +  setVariantDefined( Base_SYCL ); +  setVariantDefined( RAJA_SYCL ); +} + +MATVEC_3D_STENCIL::~MATVEC_3D_STENCIL() +{ +  delete m_domain; +} + +void MATVEC_3D_STENCIL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +  allocAndInitDataConst(m_b, m_zonal_array_length, 0.0, vid); +  allocAndInitData(m_x, m_zonal_array_length, vid); + +  allocAndInitData(m_matrix.dbl, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dbc, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dbr, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dcl,
m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dcc, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dcr, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dfl, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dfc, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.dfr, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.cbl, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.cbc, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.cbr, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.ccl, m_zonal_array_length, vid); +  allocAndInitData(m_matrix.ccc, m_zonal_array_length, vid); +  m_matrix.ccr = m_matrix.ccl + 1                               ; +  m_matrix.cfl = m_matrix.cbr - 1              + m_domain->jp   ; +  m_matrix.cfc = m_matrix.cbc                  + m_domain->jp   ; +  m_matrix.cfr = m_matrix.cbl + 1              + m_domain->jp   ; +  m_matrix.ubl = m_matrix.dfr - 1 - m_domain->jp + m_domain->kp ; +  m_matrix.ubc = m_matrix.dfc    - m_domain->jp + m_domain->kp  ; +  m_matrix.ubr = m_matrix.dfl + 1 - m_domain->jp + m_domain->kp ; +  m_matrix.ucl = m_matrix.dcr - 1              + m_domain->kp   ; +  m_matrix.ucc = m_matrix.dcc                  + m_domain->kp   ; +  m_matrix.ucr = m_matrix.dcl + 1              + m_domain->kp   ; +  m_matrix.ufl = m_matrix.dbr - 1 + m_domain->jp + m_domain->kp ; +  m_matrix.ufc = m_matrix.dbc    + m_domain->jp + m_domain->kp  ; +  m_matrix.ufr = m_matrix.dbl + 1 + m_domain->jp + m_domain->kp ; + +  allocAndInitDataConst(m_real_zones, m_domain->n_real_zones, +                        static_cast<Index_type>(-1), vid); + +  { +    auto reset_rz = scopedMoveData(m_real_zones, m_domain->n_real_zones, vid); + +    setRealZones_3d(m_real_zones, *m_domain); +  } + +} + +void MATVEC_3D_STENCIL::updateChecksum(VariantID vid, size_t tune_idx) +{ +  checksum[vid].at(tune_idx) += calcChecksum(m_b, m_zonal_array_length, checksum_scale_factor , vid); +} + +void MATVEC_3D_STENCIL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +  (void) vid; + +  deallocData(m_b, vid); +  deallocData(m_x, vid); + +  deallocData(m_matrix.dbl, vid); +  deallocData(m_matrix.dbc, vid); +  deallocData(m_matrix.dbr, vid); +  deallocData(m_matrix.dcl, vid); +  deallocData(m_matrix.dcc, vid); +  deallocData(m_matrix.dcr, vid); +  deallocData(m_matrix.dfl, vid); +  deallocData(m_matrix.dfc, vid); +  deallocData(m_matrix.dfr, vid); +  deallocData(m_matrix.cbl, vid); +  deallocData(m_matrix.cbc, vid); +  deallocData(m_matrix.cbr, vid); +  deallocData(m_matrix.ccl, vid); +  deallocData(m_matrix.ccc, vid); + +  deallocData(m_real_zones, vid); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL.hpp b/src/apps/MATVEC_3D_STENCIL.hpp new file mode 100644 index 000000000..a537e7149 --- /dev/null +++ b/src/apps/MATVEC_3D_STENCIL.hpp @@ -0,0 +1,199 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
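setUp allocates only the 14 "lower" coefficient arrays and derives the other 13 by pointer offset (e.g. `ccr = ccl + 1`, `ufr = dbl + 1 + jp + kp`), exploiting the symmetry A(z, z') == A(z', z) of the stencil matrix so each coefficient is stored once; this is also why tearDown frees nothing past `m_matrix.ccc`. A 1-D analogue of the trick (illustrative; in the suite the offsets stay inside the padded allocation, so no out-of-range pointer is formed):

  // One stored coefficient per symmetric zone pair: c_right[i] multiplies
  // x[i+1] as seen from zone i; the mirrored coefficient seen from zone
  // i+1 is the same number, so alias it instead of storing it twice.
  double* c_right = new double[n];
  double* c_left  = c_right - 1;   // c_left[i+1] aliases c_right[i]
  // ... matvec reads both views; only c_right is ever freed.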
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// MATVEC_3D_STENCIL kernel reference implementation: +/// +/// for (Index_type ii = ibegin; ii < iend; ++ii ) { +/// Index_type i = real_zones[ii]; +/// +/// b[i] = dbl[i] * xdbl[i] + dbc[i] * xdbc[i] + dbr[i] * xdbr[i] + +/// dcl[i] * xdcl[i] + dcc[i] * xdcc[i] + dcr[i] * xdcr[i] + +/// dfl[i] * xdfl[i] + dfc[i] * xdfc[i] + dfr[i] * xdfr[i] + +/// +/// cbl[i] * xcbl[i] + cbc[i] * xcbc[i] + cbr[i] * xcbr[i] + +/// ccl[i] * xccl[i] + ccc[i] * xccc[i] + ccr[i] * xccr[i] + +/// cfl[i] * xcfl[i] + cfc[i] * xcfc[i] + cfr[i] * xcfr[i] + +/// +/// ubl[i] * xubl[i] + ubc[i] * xubc[i] + ubr[i] * xubr[i] + +/// ucl[i] * xucl[i] + ucc[i] * xucc[i] + ucr[i] * xucr[i] + +/// ufl[i] * xufl[i] + ufc[i] * xufc[i] + ufr[i] * xufr[i] ; +/// +/// } +/// + +#ifndef RAJAPerf_Apps_MATVEC_3D_STENCIL_HPP +#define RAJAPerf_Apps_MATVEC_3D_STENCIL_HPP + +#define MATVEC_3D_STENCIL_DATA_SETUP \ + Real_ptr b = m_b; \ + \ + Real_ptr xdbl = m_x - m_domain->kp - m_domain->jp - 1 ; \ + Real_ptr xdbc = m_x - m_domain->kp - m_domain->jp ; \ + Real_ptr xdbr = m_x - m_domain->kp - m_domain->jp + 1 ; \ + Real_ptr xdcl = m_x - m_domain->kp - 1 ; \ + Real_ptr xdcc = m_x - m_domain->kp ; \ + Real_ptr xdcr = m_x - m_domain->kp + 1 ; \ + Real_ptr xdfl = m_x - m_domain->kp + m_domain->jp - 1 ; \ + Real_ptr xdfc = m_x - m_domain->kp + m_domain->jp ; \ + Real_ptr xdfr = m_x - m_domain->kp + m_domain->jp + 1 ; \ + Real_ptr xcbl = m_x - m_domain->jp - 1 ; \ + Real_ptr xcbc = m_x - m_domain->jp ; \ + Real_ptr xcbr = m_x - m_domain->jp + 1 ; \ + Real_ptr xccl = m_x - 1 ; \ + Real_ptr xccc = m_x ; \ + Real_ptr xccr = m_x + 1 ; \ + Real_ptr xcfl = m_x + m_domain->jp - 1 ; \ + Real_ptr xcfc = m_x + m_domain->jp ; \ + Real_ptr xcfr = m_x + m_domain->jp + 1 ; \ + Real_ptr xubl = m_x + m_domain->kp - m_domain->jp - 1 ; \ + Real_ptr xubc = m_x + m_domain->kp - m_domain->jp ; \ + Real_ptr xubr = m_x + m_domain->kp - m_domain->jp + 1 ; \ + Real_ptr xucl = m_x + m_domain->kp - 1 ; \ + Real_ptr xucc = m_x + m_domain->kp ; \ + Real_ptr xucr = m_x + m_domain->kp + 1 ; \ + Real_ptr xufl = m_x + m_domain->kp + m_domain->jp - 1 ; \ + Real_ptr xufc = m_x + m_domain->kp + m_domain->jp ; \ + Real_ptr xufr = m_x + m_domain->kp + m_domain->jp + 1 ; \ + \ + Real_ptr dbl = m_matrix.dbl; \ + Real_ptr dbc = m_matrix.dbc; \ + Real_ptr dbr = m_matrix.dbr; \ + Real_ptr dcl = m_matrix.dcl; \ + Real_ptr dcc = m_matrix.dcc; \ + Real_ptr dcr = m_matrix.dcr; \ + Real_ptr dfl = m_matrix.dfl; \ + Real_ptr dfc = m_matrix.dfc; \ + Real_ptr dfr = m_matrix.dfr; \ + Real_ptr cbl = m_matrix.cbl; \ + Real_ptr cbc = m_matrix.cbc; \ + Real_ptr cbr = m_matrix.cbr; \ + Real_ptr ccl = m_matrix.ccl; \ + Real_ptr ccc = m_matrix.ccc; \ + Real_ptr ccr = m_matrix.ccr; \ + Real_ptr cfl = m_matrix.cfl; \ + Real_ptr cfc = m_matrix.cfc; \ + Real_ptr cfr = m_matrix.cfr; \ + Real_ptr ubl = m_matrix.ubl; \ + Real_ptr ubc = m_matrix.ubc; \ + Real_ptr ubr = m_matrix.ubr; \ + Real_ptr ucl = m_matrix.ucl; \ + Real_ptr ucc = m_matrix.ucc; \ + Real_ptr ucr = m_matrix.ucr; \ + Real_ptr ufl = m_matrix.ufl; \ + Real_ptr ufc = m_matrix.ufc; \ + Real_ptr ufr = m_matrix.ufr; \ + \ + Index_ptr real_zones = m_real_zones; + +#define MATVEC_3D_STENCIL_BODY_INDEX \ + Index_type i = real_zones[ii]; + +#define MATVEC_3D_STENCIL_BODY \ + b[i] = dbl[i] * xdbl[i] + dbc[i] * xdbc[i] + dbr[i] * xdbr[i] + \ + dcl[i] * xdcl[i] + dcc[i] * xdcc[i] + dcr[i] * xdcr[i] + \ + dfl[i] * xdfl[i] 
+          dfc[i] * xdfc[i] + dfr[i] * xdfr[i] + \ +         \ +         cbl[i] * xcbl[i] + cbc[i] * xcbc[i] + cbr[i] * xcbr[i] + \ +         ccl[i] * xccl[i] + ccc[i] * xccc[i] + ccr[i] * xccr[i] + \ +         cfl[i] * xcfl[i] + cfc[i] * xcfc[i] + cfr[i] * xcfr[i] + \ +         \ +         ubl[i] * xubl[i] + ubc[i] * xubc[i] + ubr[i] * xubr[i] + \ +         ucl[i] * xucl[i] + ucc[i] * xucc[i] + ucr[i] * xucr[i] + \ +         ufl[i] * xufl[i] + ufc[i] * xufc[i] + ufr[i] * xufr[i] ; \ + + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ +class ADomain; + +class MATVEC_3D_STENCIL : public KernelBase +{ +public: + +  MATVEC_3D_STENCIL(const RunParams& params); + +  ~MATVEC_3D_STENCIL(); + +  void setUp(VariantID vid, size_t tune_idx); +  void updateChecksum(VariantID vid, size_t tune_idx); +  void tearDown(VariantID vid, size_t tune_idx); + +  void runSeqVariant(VariantID vid, size_t tune_idx); +  void runOpenMPVariant(VariantID vid, size_t tune_idx); +  void runCudaVariant(VariantID vid, size_t tune_idx); +  void runHipVariant(VariantID vid, size_t tune_idx); +  void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); +  void runSyclVariant(VariantID vid, size_t tune_idx); + +  void setCudaTuningDefinitions(VariantID vid); +  void setHipTuningDefinitions(VariantID vid); +  void setSyclTuningDefinitions(VariantID vid); + +  template < size_t block_size > +  void runCudaVariantImpl(VariantID vid); +  template < size_t block_size > +  void runHipVariantImpl(VariantID vid); +  template < size_t work_group_size > +  void runSyclVariantImpl(VariantID vid); + +private: +  static const size_t default_gpu_block_size = 256; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; + +  struct Matrix +  { +    Real_ptr dbl; +    Real_ptr dbc; +    Real_ptr dbr; +    Real_ptr dcl; +    Real_ptr dcc; +    Real_ptr dcr; +    Real_ptr dfl; +    Real_ptr dfc; +    Real_ptr dfr; +    Real_ptr cbl; +    Real_ptr cbc; +    Real_ptr cbr; +    Real_ptr ccl; +    Real_ptr ccc; +    Real_ptr ccr; +    Real_ptr cfl; +    Real_ptr cfc; +    Real_ptr cfr; +    Real_ptr ubl; +    Real_ptr ubc; +    Real_ptr ubr; +    Real_ptr ucl; +    Real_ptr ucc; +    Real_ptr ucr; +    Real_ptr ufl; +    Real_ptr ufc; +    Real_ptr ufr; +  }; + +  Real_ptr m_b; +  Real_ptr m_x; +  Matrix m_matrix; + +  ADomain* m_domain; +  Index_type* m_real_zones; +  Index_type m_zonal_array_length; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index 0b5d3b078..494d70bbb 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
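Both the old `gpu_block_size::make_list_type` and the new `integer::make_gpu_block_size_list_type` seen in these headers expand to a compile-time list of block sizes, and `RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE` instantiates `run*VariantImpl<block_size>` once per entry; only the namespace and default handling changed across versions. A stripped-down analogue of that dispatch (hypothetical helper, not the suite's macro):

  #include <cstddef>
  #include <iostream>
  #include <utility>

  template <std::size_t block_size>
  void run_impl() { std::cout << "tuned for block_size " << block_size << "\n"; }

  // Each listed size becomes one tuning; tune_idx selects the instantiation.
  template <std::size_t... Bs>
  void run_tuning(std::size_t tune_idx, std::index_sequence<Bs...>)
  {
    std::size_t t = 0;
    ((t++ == tune_idx ? run_impl<Bs>() : void()), ...);
  }

  int main() { run_tuning(1, std::index_sequence<128, 256, 512>{}); } // -> run_impl<256>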
// @@ -61,11 +61,13 @@ void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      nodal_accumulation_3d<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(vol, -                                     x0, x1, x2, x3, x4, x5, x6, x7, -                                     real_zones, -                                     ibegin, iend); -      cudaErrchk( cudaGetLastError() ); +      RPlaunchCudaKernel( (nodal_accumulation_3d<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          vol, +                          x0, x1, x2, x3, x4, x5, x6, x7, +                          real_zones, +                          ibegin, iend ); } stopTimer(); diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index 476ab5da8..da8f5dd12 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -61,11 +61,13 @@ void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      hipLaunchKernelGGL((nodal_accumulation_3d<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol, -                                     x0, x1, x2, x3, x4, x5, x6, x7, -                                     real_zones, -                                     ibegin, iend); -      hipErrchk( hipGetLastError() ); +      RPlaunchHipKernel( (nodal_accumulation_3d<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         vol, +                         x0, x1, x2, x3, x4, x5, x6, x7, +                         real_zones, +                         ibegin, iend ); } stopTimer(); diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp index d62b5527a..a17576c98 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp index a12a91efd..7d5c59614 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp index 9fbc2effa..cf176d4c1 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
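The mechanical change in these hunks replaces raw chevron launches plus a separate `cudaGetLastError`/`hipGetLastError` check with the `RPlaunchCudaKernel`/`RPlaunchHipKernel` helpers, which take the kernel, launch shape, shared-memory size, and stream, then forward the kernel arguments. A plausible shape for such a wrapper (a sketch under that assumption, not the suite's definition; `cudaErrchk` is the suite's existing error-check macro):

  template <typename Kernel, typename... Args>
  void launch_cuda_kernel(Kernel kernel, size_t grid_size, size_t block_size,
                          size_t shmem, cudaStream_t stream, Args... args)
  {
    kernel<<<grid_size, block_size, shmem, stream>>>(args...);
    cudaErrchk( cudaGetLastError() );   // one centralized error check
  }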
// diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index ed1bd2078..5bbbbb986 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,7 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct setDefaultReps(100); -  Index_type rzmax = std::cbrt(getTargetProblemSize())+1; +  Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_nodal_array_length = m_domain->nnalls; @@ -39,9 +39,11 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads -  setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + -                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + -                  (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_domain->n_real_nodes); +  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + +                      1*sizeof(Real_type) * getItsPerRep() + +                      1*sizeof(Real_type) * m_domain->n_real_nodes); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_domain->n_real_nodes ); +  setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(9 * getItsPerRep()); checksum_scale_factor = 0.001 * diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index a8d194387..085c0099a 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -100,6 +100,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > @@ -107,7 +108,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase private: static const size_t default_gpu_block_size = 256; -  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_vol; diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 16b395259..c98e3a48a 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// @@ -64,19 +64,22 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); -       constexpr size_t shmem = 0; - -       pressurecalc1<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( bvc, compression, -                                              cls, -                                              iend ); -       cudaErrchk( cudaGetLastError() ); - -       pressurecalc2<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( p_new, bvc, e_old, -                                              vnewc, -                                              p_cut, eosvmax, pmin, -                                              iend ); -       cudaErrchk( cudaGetLastError() ); +      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); +      constexpr size_t shmem = 0; + +      RPlaunchCudaKernel( (pressurecalc1<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          bvc, compression, cls, +                          iend ); + +      RPlaunchCudaKernel( (pressurecalc2<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          p_new, bvc, e_old, +                          vnewc, +                          p_cut, eosvmax, pmin, +                          iend ); } stopTimer(); diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 1d590e6f0..18d4a1c88 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -64,19 +64,22 @@ void PRESSURE::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); -       constexpr size_t shmem = 0; - -       hipLaunchKernelGGL((pressurecalc1<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), bvc, compression, -                                          cls, -                                          iend ); -       hipErrchk( hipGetLastError() ); - -       hipLaunchKernelGGL((pressurecalc2<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), p_new, bvc, e_old, -                                          vnewc, -                                          p_cut, eosvmax, pmin, -                                          iend ); -       hipErrchk( hipGetLastError() ); +      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); +      constexpr size_t shmem = 0; + +      RPlaunchHipKernel( (pressurecalc1<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         bvc, compression, cls, +                         iend ); + +      RPlaunchHipKernel( (pressurecalc2<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         p_new, bvc, e_old, +                         vnewc, +                         p_cut, eosvmax, pmin, +                         iend ); } stopTimer(); diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp index 0f0dd2e4e..ceab0286c 100644 --- a/src/apps/PRESSURE-OMP.cpp +++ b/src/apps/PRESSURE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp index 830859f07..4d0b61f58 100644 --- a/src/apps/PRESSURE-OMPTarget.cpp +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 4506eb7a2..18fc0929e 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-Sycl.cpp b/src/apps/PRESSURE-Sycl.cpp new file mode 100644 index 000000000..2b0e3b4dd --- /dev/null +++ b/src/apps/PRESSURE-Sycl.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PRESSURE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include <iostream> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template <size_t work_group_size> +void PRESSURE::runSyclVariantImpl(VariantID vid) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = 0; +  const Index_type iend = getActualProblemSize(); + +  auto res{getSyclResource()}; +  auto qu = res.get_queue(); + +  PRESSURE_DATA_SETUP; + +  using sycl::fabs; + +  if ( vid == Base_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), +                       [=] (sycl::nd_item<1> item) { + +          Index_type i = item.get_global_id(0); +          if (i < iend) { +            PRESSURE_BODY1 +          } + +        }); +      }); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), +                       [=] (sycl::nd_item<1> item) { + +          Index_type i = item.get_global_id(0); +          if (i < iend) { +            PRESSURE_BODY2 +          } + +        }); +      }); + +    } +    stopTimer(); + +  } else if ( vid == RAJA_SYCL ) { + +    const bool async = true; + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      RAJA::region<RAJA::seq_region>( [=]() { + +        RAJA::forall< RAJA::sycl_exec<work_group_size, async> >( res, +          RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { +          PRESSURE_BODY1; +        }); + +        RAJA::forall< RAJA::sycl_exec<work_group_size, async> >( res, +          RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { +          PRESSURE_BODY2; +        }); + +      }); // end sequential region (for single-source code) + +    } +    stopTimer(); + +  } else { +     std::cout << "\n  PRESSURE : Unknown Sycl variant id = " << vid << std::endl; +  } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PRESSURE, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif  // RAJA_ENABLE_SYCL diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index c772a0989..ed1e00306 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
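PRESSURE issues two dependent kernels per rep, so the RAJA_SYCL variant wraps both foralls in `RAJA::region<RAJA::seq_region>` to keep them ordered on one resource while each launch stays asynchronous. A reduced sketch of that shape (the bodies are illustrative stand-ins for PRESSURE_BODY1/2, with `res`, `work_group_size`, and the ranges as in the surrounding file):

  RAJA::region<RAJA::seq_region>([=]() {

    RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res,
      RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
        bvc[i] = cls * (compression[i] + 1.0);   // stand-in for PRESSURE_BODY1
    });

    RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res,
      RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
        p_new[i] = bvc[i] * e_old[i];            // stand-in for PRESSURE_BODY2
    });

  });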
// @@ -28,8 +28,11 @@ PRESSURE::PRESSURE(const RunParams& params) setItsPerRep( 2 * getActualProblemSize() ); setKernelsPerRep(2); -  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + -                  (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); +  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + +                      3*sizeof(Real_type) * getActualProblemSize() ); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() + +                         1*sizeof(Real_type) * getActualProblemSize() ); +  setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((2 + 1 ) * getActualProblemSize()); @@ -52,6 +55,9 @@ PRESSURE::PRESSURE(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + +  setVariantDefined( Base_SYCL ); +  setVariantDefined( RAJA_SYCL ); } PRESSURE::~PRESSURE() diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index c0568a8e0..d1cffe62a 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -72,17 +72,23 @@ class PRESSURE : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); + private: static const size_t default_gpu_block_size = 256; -  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_compression; Real_ptr m_bvc; diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 18bb89c4c..378aa36bc 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// @@ -68,13 +68,15 @@ void VOL3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      vol3d<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(vol, -                  x0, x1, x2, x3, x4, x5, x6, x7, -                  y0, y1, y2, y3, y4, y5, y6, y7, -                  z0, z1, z2, z3, z4, z5, z6, z7, -                  vnormq, -                  ibegin, iend); -      cudaErrchk( cudaGetLastError() ); +      RPlaunchCudaKernel( (vol3d<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          vol, +                          x0, x1, x2, x3, x4, x5, x6, x7, +                          y0, y1, y2, y3, y4, y5, y6, y7, +                          z0, z1, z2, z3, z4, z5, z6, z7, +                          vnormq, +                          ibegin, iend ); } stopTimer(); diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 9a0a2323b..704008006 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -68,13 +68,15 @@ void VOL3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      hipLaunchKernelGGL((vol3d<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol, -                  x0, x1, x2, x3, x4, x5, x6, x7, -                  y0, y1, y2, y3, y4, y5, y6, y7, -                  z0, z1, z2, z3, z4, z5, z6, z7, -                  vnormq, -                  ibegin, iend); -      hipErrchk( hipGetLastError() ); +      RPlaunchHipKernel( (vol3d<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         vol, +                         x0, x1, x2, x3, x4, x5, x6, x7, +                         y0, y1, y2, y3, y4, y5, y6, y7, +                         z0, z1, z2, z3, z4, z5, z6, z7, +                         vnormq, +                         ibegin, iend ); } stopTimer(); diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp index 44e3cdcf4..7c6562c47 100644 --- a/src/apps/VOL3D-OMP.cpp +++ b/src/apps/VOL3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp index 538fb47c7..d97a5889a 100644 --- a/src/apps/VOL3D-OMPTarget.cpp +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index 631c545bc..d174ad43c 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp new file mode 100644 index 000000000..37c7bc90f --- /dev/null +++ b/src/apps/VOL3D-Sycl.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "VOL3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "AppsData.hpp" + +#include <iostream> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +template <size_t work_group_size> +void VOL3D::runSyclVariantImpl(VariantID vid) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = m_domain->fpz; +  const Index_type iend = m_domain->lpz+1; + +  auto res{getSyclResource()}; +  auto qu = res.get_queue(); + +  VOL3D_DATA_SETUP; + +  if ( vid == Base_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), +                       [=] (sycl::nd_item<1> item) { + +          Index_type ii = item.get_global_id(0); +          Index_type i = ii + ibegin; +          if (i < iend) { +            VOL3D_BODY +          } + +        }); +      }); + +    } +    stopTimer(); + +  } else if ( vid == RAJA_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, +        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { +        VOL3D_BODY; +      }); + +    } +    stopTimer(); + +  } else { +     std::cout << "\n  VOL3D : Unknown Sycl variant id = " << vid << std::endl; +  } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(VOL3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif  // RAJA_ENABLE_SYCL diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 11051adc5..16951253d 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
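The VOL3D.cpp hunk below splits the old combined `setBytesPerRep` into reads of the three coordinate arrays over the zone range plus a one-node halo in each direction (`1 + jp + kp` extra entries) and writes of `vol` alone. Worked through with illustrative numbers (a 100^3-zone mesh with hypothetical node pitches jp = 102 and kp = 102*102):

  constexpr size_t zones = 100*100*100;
  constexpr size_t halo  = 1 + 102 + 102*102;                   // +1, +jp, +kp
  constexpr size_t read  = 3 * sizeof(double) * (zones + halo); // x, y, z corners
  constexpr size_t write = 1 * sizeof(double) * zones;          // vol
  // read is roughly 24.25 MB and write 8 MB per rep under these assumptions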
// @@ -28,18 +28,19 @@ VOL3D::VOL3D(const RunParams& params) setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct setDefaultReps(100); -  Index_type rzmax = std::cbrt(getTargetProblemSize())+1; +  Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_array_length = m_domain->nnalls; -  setActualProblemSize( m_domain->lpz+1 - m_domain->fpz ); +  setActualProblemSize( m_domain->n_real_zones ); setItsPerRep( m_domain->lpz+1 - m_domain->fpz ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads -  setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + -                  (0*sizeof(Real_type) + 3*sizeof(Real_type)) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); +  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); +  setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(72 * (m_domain->lpz+1 - m_domain->fpz)); checksum_scale_factor = 0.001 * @@ -64,6 +65,9 @@ VOL3D::VOL3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + +  setVariantDefined( Base_SYCL ); +  setVariantDefined( RAJA_SYCL ); } VOL3D::~VOL3D() diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index f3d296440..6847ce13f 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -173,17 +173,22 @@ class VOL3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; -  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp index 1dc6216d7..f33c8656d 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// @@ -61,11 +61,13 @@ void ZONAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      zonal_accumulation_3d<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(vol, -                                     x0, x1, x2, x3, x4, x5, x6, x7, -                                     real_zones, -                                     ibegin, iend); -      cudaErrchk( cudaGetLastError() ); +      RPlaunchCudaKernel( (zonal_accumulation_3d<block_size>), +                          grid_size, block_size, +                          shmem, res.get_stream(), +                          vol, +                          x0, x1, x2, x3, x4, x5, x6, x7, +                          real_zones, +                          ibegin, iend ); } stopTimer(); diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp index d861128b9..c92d7b6db 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -61,11 +61,13 @@ void ZONAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -      hipLaunchKernelGGL((zonal_accumulation_3d<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol, -                                     x0, x1, x2, x3, x4, x5, x6, x7, -                                     real_zones, -                                     ibegin, iend); -      hipErrchk( hipGetLastError() ); +      RPlaunchHipKernel( (zonal_accumulation_3d<block_size>), +                         grid_size, block_size, +                         shmem, res.get_stream(), +                         vol, +                         x0, x1, x2, x3, x4, x5, x6, x7, +                         real_zones, +                         ibegin, iend ); } stopTimer(); diff --git a/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp b/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp index eea4614cf..9b0890ef4 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp index 573765e30..0a4f96119 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp index 1bd904088..ea9be8c17 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
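ZONAL_ACCUMULATION_3D is the gather dual of NODAL_ACCUMULATION_3D: each zone reads its eight surrounding node values and writes a single zonal value, which is why the accounting in the .cpp hunk below books one Real_type write per iteration, node reads sized by n_real_nodes, and no atomics. The body reduces to something like (an illustrative stand-in consistent with the kernel's 8 FLOPs per zone, not the verbatim macro):

  // x0..x7 are the zone's eight corner-node pointers (base + pitch offsets)
  vol[i] = 0.125 * (x0[i] + x1[i] + x2[i] + x3[i] +
                    x4[i] + x5[i] + x6[i] + x7[i]);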
// diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp new file mode 100644 index 000000000..67b25086e --- /dev/null +++ b/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp @@ -0,0 +1,89 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ZONAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include "AppsData.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace apps +{ + + +template < size_t work_group_size > +void ZONAL_ACCUMULATION_3D::runSyclVariantImpl(VariantID vid) +{ +  const Index_type run_reps = getRunReps(); +  const Index_type ibegin = 0; +  const Index_type iend = m_domain->n_real_zones; + +  auto res{getSyclResource()}; +  auto qu = res.get_queue(); + +  ZONAL_ACCUMULATION_3D_DATA_SETUP; + +  if ( vid == Base_SYCL ) { + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + +      qu->submit([&] (sycl::handler& h) { +        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), +                       [=] (sycl::nd_item<1> item ) { + +          Index_type ii = item.get_global_id(0); +          Index_type i = ii + ibegin; +          if (i < iend) { +            ZONAL_ACCUMULATION_3D_BODY_INDEX; +            ZONAL_ACCUMULATION_3D_BODY; +          } + +        }); +      }); + +    } +    stopTimer(); + +  } else if ( vid == RAJA_SYCL ) { + +    RAJA::TypedListSegment<Index_type> zones(real_zones, iend, +                                             res, RAJA::Unowned); + +    startTimer(); +    for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, +        zones, [=] (Index_type i) { +        ZONAL_ACCUMULATION_3D_BODY; +      }); + +    } +    stopTimer(); + +  } else { +     getCout() << "\n  ZONAL_ACCUMULATION_3D : Unknown Sycl variant id = " << vid << std::endl; +  } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ZONAL_ACCUMULATION_3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif  // RAJA_ENABLE_SYCL diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index 267190132..993f65a07 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// @@ -28,7 +28,7 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setDefaultProblemSize(100*100*100);  // See rzmax in ADomain struct setDefaultReps(100); -  Index_type rzmax = std::cbrt(getTargetProblemSize())+1; +  Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_nodal_array_length = m_domain->nnalls; @@ -39,9 +39,10 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads -  setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + -                  (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + -                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_domain->n_real_nodes); +  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + +                      1*sizeof(Real_type) * m_domain->n_real_nodes ); +  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); +  setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(8 * getItsPerRep()); checksum_scale_factor = 0.001 * @@ -66,6 +67,9 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + +  setVariantDefined( Base_SYCL ); +  setVariantDefined( RAJA_SYCL ); } ZONAL_ACCUMULATION_3D::~ZONAL_ACCUMULATION_3D() diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 1c823ea2a..572490caa 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -81,17 +81,22 @@ class ZONAL_ACCUMULATION_3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; -  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; +  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_vol; diff --git a/src/apps/mixed_fem_helper.hpp b/src/apps/mixed_fem_helper.hpp index 88f7d3b64..6ee3b1a06 100644 --- a/src/apps/mixed_fem_helper.hpp +++ b/src/apps/mixed_fem_helper.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 4b47c7b48..25969c207 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index b8ab91cd1..eb2cc1e83 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp b/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp index 9e74c4e0c..aadcf9401 100644 --- a/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index 19e916dac..a67b041c7 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp index 661180c7b..b07c6f881 100644 --- a/src/basic-kokkos/INIT3-Kokkos.cpp +++ b/src/basic-kokkos/INIT3-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp index 8c775a3b0..efd1ed118 100644 --- a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp index 9df018264..996ec6225 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/MULADDSUB-Kokkos.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp index 49e890315..fb2f59ac2 100644 --- a/src/basic-kokkos/MULADDSUB-Kokkos.cpp +++ b/src/basic-kokkos/MULADDSUB-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp index 36929cead..fb5b5ba98 100644 --- a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp +++ b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp index 233ca71af..6247970b1 100644 --- a/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp +++ b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp index 23c0ab6f4..e461c00a1 100644 --- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp +++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index 5cdb9060f..43b629cad 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/ARRAY_OF_PTRS-Cuda.cpp b/src/basic/ARRAY_OF_PTRS-Cuda.cpp index b6b53e249..e9cdd2349 100644 --- a/src/basic/ARRAY_OF_PTRS-Cuda.cpp +++ b/src/basic/ARRAY_OF_PTRS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -54,9 +54,11 @@ void ARRAY_OF_PTRS::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - array_of_ptrs<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( - y, x_array, array_size, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (array_of_ptrs<block_size>), + grid_size, block_size, + shmem, res.get_stream(), + y, x_array, array_size, iend ); } stopTimer(); @@ -66,13 +68,18 @@ void ARRAY_OF_PTRS::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto array_of_ptrs_lambda = [=] __device__ (Index_type i) { + ARRAY_OF_PTRS_BODY(x); + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( - ibegin, iend, [=] __device__ (Index_type i) { - ARRAY_OF_PTRS_BODY(x); - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall<block_size, decltype(array_of_ptrs_lambda)>), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, array_of_ptrs_lambda ); } stopTimer(); diff --git a/src/basic/ARRAY_OF_PTRS-Hip.cpp b/src/basic/ARRAY_OF_PTRS-Hip.cpp index 26c0f8800..aa5777fb4 100644 --- a/src/basic/ARRAY_OF_PTRS-Hip.cpp +++ b/src/basic/ARRAY_OF_PTRS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -54,9 +54,11 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((array_of_ptrs<block_size>),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - y, x_array, array_size, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (array_of_ptrs<block_size>), + grid_size, block_size, + shmem, res.get_stream(), + y, x_array, array_size, iend ); } stopTimer(); @@ -72,9 +74,12 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall<block_size, decltype(array_of_ptrs_lambda)>), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, array_of_ptrs_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall<block_size, decltype(array_of_ptrs_lambda)>), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, array_of_ptrs_lambda ); } stopTimer(); diff --git a/src/basic/ARRAY_OF_PTRS-OMP.cpp b/src/basic/ARRAY_OF_PTRS-OMP.cpp index 3e05e929a..774fd8f98 100644 --- a/src/basic/ARRAY_OF_PTRS-OMP.cpp +++ b/src/basic/ARRAY_OF_PTRS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors.
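// ---------------------------------------------------------------------------
// Aside: the RPlaunchCudaKernel calls introduced above replace inline
// triple-chevron launches plus an explicit error check at every call site.
// A minimal sketch of what such a wrapper can look like -- an illustration
// of the pattern under that assumption, not the Suite's actual
// implementation in its common headers:

#include <cuda_runtime.h>
#include <cstdio>

template < typename Kernel, typename... Args >
void launch_cuda_kernel_sketch(Kernel kernel,
                               dim3 grid, dim3 block,
                               size_t shmem, cudaStream_t stream,
                               Args... args)
{
  kernel<<<grid, block, shmem, stream>>>( args... );
  cudaError_t err = cudaGetLastError();  // surface launch errors immediately
  if (err != cudaSuccess) {
    std::fprintf(stderr, "kernel launch failed: %s\n",
                 cudaGetErrorString(err));
  }
}

// Centralizing the launch this way keeps the grid/block/shmem/stream
// plumbing and error handling in one place, which is why each per-kernel
// call site in the diff shrinks to a single wrapper call.
// ---------------------------------------------------------------------------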
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp b/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp index 7a7642b4e..02301ca1e 100644 --- a/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp +++ b/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/ARRAY_OF_PTRS-Seq.cpp b/src/basic/ARRAY_OF_PTRS-Seq.cpp index ba728d775..dd22e010d 100644 --- a/src/basic/ARRAY_OF_PTRS-Seq.cpp +++ b/src/basic/ARRAY_OF_PTRS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,9 +26,11 @@ void ARRAY_OF_PTRS::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune ARRAY_OF_PTRS_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto array_of_ptrs_lam = [=](Index_type i) { ARRAY_OF_PTRS_BODY(x); }; +#endif switch ( vid ) { diff --git a/src/basic/ARRAY_OF_PTRS-Sycl.cpp b/src/basic/ARRAY_OF_PTRS-Sycl.cpp new file mode 100644 index 000000000..c3d987987 --- /dev/null +++ b/src/basic/ARRAY_OF_PTRS-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
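// ---------------------------------------------------------------------------
// Aside: the #if defined(RUN_RAJA_SEQ) guard added around array_of_ptrs_lam
// above reflects that the lambda is only consumed by the Lambda_Seq and
// RAJA_Seq variants; in builds where only Base_Seq runs, an unguarded
// lambda would be a dead, warning-generating local. Shape of the pattern
// (this motivation is inferred from the guard's placement, not stated in
// the diff):
//
//   #if defined(RUN_RAJA_SEQ)
//     auto body = [=](Index_type i) { ARRAY_OF_PTRS_BODY(x); };
//   #endif
//   // ... Base_Seq uses the raw loop; Lambda_Seq/RAJA_Seq use `body`
// ---------------------------------------------------------------------------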
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ARRAY_OF_PTRS.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +template < size_t work_group_size > +void ARRAY_OF_PTRS::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + ARRAY_OF_PTRS_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + ARRAY_OF_PTRS_BODY(x); + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ARRAY_OF_PTRS_BODY(x); + }); + + } + stopTimer(); + + } else { + getCout() << "\n ARRAY_OF_PTRS : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ARRAY_OF_PTRS, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index 2a88e5005..9f0995d0d 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,11 +26,13 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); - m_array_size = ARRAY_OF_PTRS_MAX_ARRAY_SIZE; + m_array_size = params.getArrayOfPtrsArraySize(); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + m_array_size*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( m_array_size*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_array_size * getActualProblemSize()); setUsesFeature(Forall); @@ -54,6 +56,9 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index ee8a44862..029353f45 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
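// ---------------------------------------------------------------------------
// Aside: the Base_SYCL variant above follows the pattern shared by all of
// the new *-Sycl.cpp files: pad the iteration count up to a multiple of the
// work-group size, launch an nd_range kernel, and bounds-check in the body.
// A self-contained sketch of that pattern (generic names, not Suite code):

#include <sycl/sycl.hpp>

template < size_t work_group_size, typename Body >
void forall_sycl_sketch(sycl::queue& q, size_t len, Body body)
{
  // nd_range requires the global size to be a multiple of the local size
  const size_t global_size =
      work_group_size * ((len + work_group_size - 1) / work_group_size);

  q.submit([&](sycl::handler& h) {
    h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
                   [=](sycl::nd_item<1> item) {
      const size_t i = item.get_global_id(0);
      if (i < len) {   // padded work-items fall through
        body(i);
      }
    });
  });
  // left asynchronous here, as in the benchmark's timed repetition loop
}
// ---------------------------------------------------------------------------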
// @@ -24,8 +24,6 @@ #ifndef RAJAPerf_Basic_ARRAY_OF_PTRS_HPP #define RAJAPerf_Basic_ARRAY_OF_PTRS_HPP -#define ARRAY_OF_PTRS_MAX_ARRAY_SIZE 26 - #define ARRAY_OF_PTRS_DATA_SETUP_X_ARRAY \ for (Index_type a = 0; a < array_size; ++a) { \ x[a] = x_data + a * iend ; \ @@ -72,18 +70,24 @@ class ARRAY_OF_PTRS : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index d21d46e5a..ee71119ce 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -14,18 +14,21 @@ blt_add_library( ARRAY_OF_PTRS-Cuda.cpp ARRAY_OF_PTRS-OMP.cpp ARRAY_OF_PTRS-OMPTarget.cpp + ARRAY_OF_PTRS-Sycl.cpp COPY8.cpp COPY8-Seq.cpp COPY8-Hip.cpp COPY8-Cuda.cpp COPY8-OMP.cpp COPY8-OMPTarget.cpp + COPY8-Sycl.cpp DAXPY.cpp DAXPY-Seq.cpp DAXPY-Hip.cpp DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp + DAXPY-Sycl.cpp DAXPY_ATOMIC.cpp DAXPY_ATOMIC-Seq.cpp DAXPY_ATOMIC-Hip.cpp @@ -38,6 +41,7 @@ blt_add_library( IF_QUAD-Cuda.cpp IF_QUAD-OMP.cpp IF_QUAD-OMPTarget.cpp + IF_QUAD-Sycl.cpp INDEXLIST.cpp INDEXLIST-Seq.cpp INDEXLIST-Hip.cpp @@ -56,36 +60,42 @@ blt_add_library( INIT3-Cuda.cpp INIT3-OMP.cpp INIT3-OMPTarget.cpp + INIT3-Sycl.cpp INIT_VIEW1D.cpp INIT_VIEW1D-Seq.cpp INIT_VIEW1D-Hip.cpp INIT_VIEW1D-Cuda.cpp INIT_VIEW1D-OMP.cpp INIT_VIEW1D-OMPTarget.cpp + INIT_VIEW1D-Sycl.cpp INIT_VIEW1D_OFFSET.cpp INIT_VIEW1D_OFFSET-Seq.cpp INIT_VIEW1D_OFFSET-Hip.cpp INIT_VIEW1D_OFFSET-Cuda.cpp INIT_VIEW1D_OFFSET-OMP.cpp INIT_VIEW1D_OFFSET-OMPTarget.cpp + INIT_VIEW1D_OFFSET-Sycl.cpp MAT_MAT_SHARED.cpp MAT_MAT_SHARED-Seq.cpp MAT_MAT_SHARED-Hip.cpp MAT_MAT_SHARED-Cuda.cpp MAT_MAT_SHARED-OMP.cpp MAT_MAT_SHARED-OMPTarget.cpp + MAT_MAT_SHARED-Sycl.cpp MULADDSUB.cpp MULADDSUB-Seq.cpp MULADDSUB-Hip.cpp MULADDSUB-Cuda.cpp MULADDSUB-OMP.cpp MULADDSUB-OMPTarget.cpp + MULADDSUB-Sycl.cpp NESTED_INIT.cpp NESTED_INIT-Seq.cpp NESTED_INIT-Hip.cpp NESTED_INIT-Cuda.cpp NESTED_INIT-OMP.cpp NESTED_INIT-OMPTarget.cpp + NESTED_INIT-Sycl.cpp PI_ATOMIC.cpp PI_ATOMIC-Seq.cpp PI_ATOMIC-Hip.cpp @@ -98,12 +108,14 @@ blt_add_library( PI_REDUCE-Cuda.cpp PI_REDUCE-OMP.cpp PI_REDUCE-OMPTarget.cpp + PI_REDUCE-Sycl.cpp REDUCE3_INT.cpp REDUCE3_INT-Seq.cpp REDUCE3_INT-Hip.cpp REDUCE3_INT-Cuda.cpp REDUCE3_INT-OMP.cpp REDUCE3_INT-OMPTarget.cpp + REDUCE3_INT-Sycl.cpp REDUCE_STRUCT.cpp REDUCE_STRUCT-Seq.cpp REDUCE_STRUCT-Hip.cpp @@ -116,5 +128,12 @@ blt_add_library( 
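// ---------------------------------------------------------------------------
// Aside: gpu_block_sizes_type (renamed above to the integer:: utilities) is
// a compile-time list of candidate GPU block sizes; the tuning code iterates
// it to instantiate one kernel tuning per size. A self-contained model of
// that idiom -- illustrative names, not the Suite's actual utilities:

#include <cstddef>
#include <type_traits>

template < size_t... Sizes >
struct block_size_list {};

template < typename F, size_t... Sizes >
void for_each_block_size(block_size_list<Sizes...>, F&& f)
{
  // call f once per size, each passed as a distinct compile-time constant
  ( f(std::integral_constant<size_t, Sizes>{}), ... );
}

// usage: for_each_block_size(block_size_list<128, 256, 512>{},
//            [](auto bs) { /* instantiate and launch kernel<bs()> */ });
// ---------------------------------------------------------------------------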
TRAP_INT-Cuda.cpp TRAP_INT-OMPTarget.cpp TRAP_INT-OMP.cpp + TRAP_INT-Sycl.cpp + MULTI_REDUCE.cpp + MULTI_REDUCE-Seq.cpp + MULTI_REDUCE-Hip.cpp + MULTI_REDUCE-Cuda.cpp + MULTI_REDUCE-OMP.cpp + MULTI_REDUCE-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/basic/COPY8-Cuda.cpp b/src/basic/COPY8-Cuda.cpp index b5bfeafbc..f8f1aeb31 100644 --- a/src/basic/COPY8-Cuda.cpp +++ b/src/basic/COPY8-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -23,8 +23,10 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) -__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7, - Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7, +__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, + Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7, + Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, + Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; @@ -52,11 +54,13 @@ void COPY8::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - copy8<<>>( - y0, y1, y2, y3, y4, y5, y6, y7, - x0, x1, x2, x3, x4, x5, x6, x7, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (copy8), + grid_size, block_size, + shmem, res.get_stream(), + y0, y1, y2, y3, y4, y5, y6, y7, + x0, x1, x2, x3, x4, x5, x6, x7, + iend ); } stopTimer(); @@ -66,13 +70,18 @@ void COPY8::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto copy8_lambda = [=] __device__ (Index_type i) { + COPY8_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - COPY8_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy8_lambda ); } stopTimer(); diff --git a/src/basic/COPY8-Hip.cpp b/src/basic/COPY8-Hip.cpp index fe24822f5..714a00a0b 100644 --- a/src/basic/COPY8-Hip.cpp +++ b/src/basic/COPY8-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
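// ---------------------------------------------------------------------------
// Aside: every launch in these hunks sizes its grid with
// RAJA_DIVIDE_CEILING_INT(iend, block_size), i.e. ceiling division, so the
// final partially-filled block is still launched (the kernels bounds-check
// with i < iend). The arithmetic, as a standalone helper:

#include <cstddef>

constexpr size_t divide_ceiling(size_t n, size_t block)
{
  return (n + block - 1) / block;   // smallest g with g * block >= n
}

static_assert(divide_ceiling(1000, 256) == 4, "1000 items need 4 blocks of 256");
// ---------------------------------------------------------------------------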
// @@ -23,8 +23,10 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) -__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7, - Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7, +__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, + Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7, + Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, + Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; @@ -53,11 +55,13 @@ void COPY8::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((copy8),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - y0, y1, y2, y3, y4, y5, y6, y7, - x0, x1, x2, x3, x4, x5, x6, x7, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (copy8), + grid_size, block_size, + shmem, res.get_stream(), + y0, y1, y2, y3, y4, y5, y6, y7, + x0, x1, x2, x3, x4, x5, x6, x7, + iend ); } stopTimer(); @@ -73,9 +77,12 @@ void COPY8::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy8_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy8_lambda ); } stopTimer(); diff --git a/src/basic/COPY8-OMP.cpp b/src/basic/COPY8-OMP.cpp index 8ba6699c6..a8dec3228 100644 --- a/src/basic/COPY8-OMP.cpp +++ b/src/basic/COPY8-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/COPY8-OMPTarget.cpp b/src/basic/COPY8-OMPTarget.cpp index 729449861..63a207ba8 100644 --- a/src/basic/COPY8-OMPTarget.cpp +++ b/src/basic/COPY8-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
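// ---------------------------------------------------------------------------
// Aside: lambda_cuda_forall / lambda_hip_forall, used by the Lambda_CUDA and
// Lambda_HIP variants above, are thin forall-style kernels that apply a
// device lambda over [ibegin, iend). A sketch consistent with how they are
// launched here (one index per thread, block size as a template parameter);
// treat the exact signature as an assumption:

using Index_type_sketch = long;   // stand-in for the Suite's index typedef

template < size_t block_size, typename Lambda >
__launch_bounds__(block_size)
__global__ void lambda_forall_sketch(Index_type_sketch ibegin,
                                     Index_type_sketch iend,
                                     Lambda body)
{
  Index_type_sketch i = ibegin + blockIdx.x * block_size + threadIdx.x;
  if (i < iend) {
    body(i);   // e.g. COPY8_BODY via the captured lambda
  }
}
// ---------------------------------------------------------------------------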
// @@ -40,7 +40,8 @@ void COPY8::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp target is_device_ptr(x, y) device( did ) + #pragma omp target is_device_ptr(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, \ + y3, y4, y5, y6, y7) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { COPY8_BODY; @@ -70,4 +71,4 @@ void COPY8::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } // end namespace basic } // end namespace rajaperf -#endif // RAJA_ENABLE_TARGET_OPENMP +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/COPY8-Seq.cpp b/src/basic/COPY8-Seq.cpp index 1ae6af854..32bf188d6 100644 --- a/src/basic/COPY8-Seq.cpp +++ b/src/basic/COPY8-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/COPY8-Sycl.cpp b/src/basic/COPY8-Sycl.cpp new file mode 100644 index 000000000..8ce2a8e24 --- /dev/null +++ b/src/basic/COPY8-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "COPY8.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +template < size_t work_group_size > +void COPY8::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + COPY8_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + COPY8_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + COPY8_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n COPY8 : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(COPY8, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp index 7a75daa40..ce8847032 100644 --- a/src/basic/COPY8.cpp +++ b/src/basic/COPY8.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence 
Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ COPY8::COPY8(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (8*sizeof(Real_type) + 8*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); @@ -51,6 +53,9 @@ COPY8::COPY8(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } COPY8::~COPY8() diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp index f98784d16..61945eed3 100644 --- a/src/basic/COPY8.hpp +++ b/src/basic/COPY8.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -79,18 +79,24 @@ class COPY8 : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x0; Real_ptr m_x1; diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 79f1fde4a..d58468ba3 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
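// ---------------------------------------------------------------------------
// Aside: the is_device_ptr change in the COPY8 OpenMP-target hunk above is a
// real correction, not cleanup: the old clause named x and y, which COPY8
// does not define (the kernel works on x0..x7 and y0..y7), so the clause now
// lists the sixteen pointers the loop actually touches. Minimal shape of the
// construct, with hypothetical names:
//
//   #pragma omp target is_device_ptr(x0, y0) device( did )
//   #pragma omp teams distribute parallel for
//   for (long i = 0; i < n; ++i) {
//     y0[i] = x0[i];   // pointers already resident on device `did`
//   }
//
// is_device_ptr tells the compiler the listed pointers are device addresses
// to use directly, bypassing the implicit mapping machinery.
// ---------------------------------------------------------------------------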
// @@ -52,9 +52,11 @@ void DAXPY::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - daxpy<<>>( y, x, a, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (daxpy), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); } stopTimer(); @@ -64,13 +66,18 @@ void DAXPY::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto daxpy_lambda = [=] __device__ (Index_type i) { + DAXPY_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - DAXPY_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_lambda ); } stopTimer(); diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 22f86b4d7..f08dba1fc 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,9 +53,11 @@ void DAXPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), y, x, a, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (daxpy), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); } stopTimer(); @@ -71,9 +73,12 @@ void DAXPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_lambda ); } stopTimer(); diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp index afc0e653c..8f1b95641 100644 --- a/src/basic/DAXPY-OMP.cpp +++ b/src/basic/DAXPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index 387a4c40d..fc36ad257 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
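// ---------------------------------------------------------------------------
// Aside: the daxpy kernel launched above is the classic one-thread-per-
// element stream kernel. A self-contained sketch (plain CUDA types; the
// Suite's version is templated on block_size with __launch_bounds__):

__global__ void daxpy_sketch(double* y, const double* x, double a, long n)
{
  long i = static_cast<long>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] += a * x[i];   // DAXPY body: 2 FLOPs, 2 reads, 1 write per element
  }
}

// launched as: daxpy_sketch<<<grid_size, block_size, 0, stream>>>(y, x, a, n);
// ---------------------------------------------------------------------------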
// diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 7b024ca49..e23cc5e6f 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp new file mode 100644 index 000000000..15642a08b --- /dev/null +++ b/src/basic/DAXPY-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template +void DAXPY::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DAXPY_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + DAXPY_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + DAXPY_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n DAXPY : Unknown Sycl variant id = " << vid << std::endl; + } + +} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DAXPY, Sycl) + +} // end namespace basic +} // end namespace rajaperf + + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 8aa05e66a..fafb9bb66 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
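// ---------------------------------------------------------------------------
// Aside: the RAJA_SYCL variants dispatch through RAJA's sycl_exec policy,
// which is parameterized on the work-group size. A usage sketch under that
// assumption -- the exact policy parameters should be checked against the
// RAJA headers rather than taken from this sketch:
//
//   RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res,
//       RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
//     DAXPY_BODY;   // y[i] += a * x[i];
//   });
//
// Passing the resource (res) as the first argument runs the loop on that
// resource's queue, matching the Base_SYCL code path in the same file.
// ---------------------------------------------------------------------------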
// @@ -28,7 +28,9 @@ DAXPY::DAXPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize()); setUsesFeature(Forall); @@ -52,6 +54,9 @@ DAXPY::DAXPY(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index bcaca8054..db9edba60 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,18 +52,24 @@ class DAXPY : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 29a142d01..c4cee2dd2 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
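// ---------------------------------------------------------------------------
// Aside: the setBytesPerRep -> setBytesReadPerRep / setBytesWrittenPerRep /
// setBytesAtomicModifyWrittenPerRep split above separates memory traffic by
// kind instead of lumping it into one number. For DAXPY (y[i] += a * x[i])
// each iteration reads x[i] and the old y[i] and writes the new y[i], so
// per repetition over N elements:
//
//   reads   = 2 * sizeof(Real_type) * N
//   writes  = 1 * sizeof(Real_type) * N
//   atomics = 0
//   FLOPs   = 2 * N   (one multiply, one add)
//
// e.g. double precision, N = 1,000,000: 16 MB read and 8 MB written per rep.
// ---------------------------------------------------------------------------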
// @@ -52,9 +52,11 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - daxpy_atomic<<>>( y, x, a, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (daxpy_atomic), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); } stopTimer(); @@ -64,13 +66,18 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto daxpy_atomic_lambda = [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_atomic_lambda ); } stopTimer(); diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 0688950b0..258c979b6 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,9 +52,11 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((daxpy_atomic),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), y, x, a, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (daxpy_atomic), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); } stopTimer(); @@ -70,9 +72,12 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_atomic_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_atomic_lambda ); } stopTimer(); diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp index 4d2f4db87..a41c6c049 100644 --- a/src/basic/DAXPY_ATOMIC-OMP.cpp +++ b/src/basic/DAXPY_ATOMIC-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp index bc6b08932..ae7319e25 100644 --- a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
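// ---------------------------------------------------------------------------
// Aside: DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic), captured in the lambda
// above, performs the update through RAJA's atomic interface. In spirit
// (a sketch, not the macro's verbatim expansion):
//
//   RAJA::atomicAdd<RAJA::cuda_atomic>( &y[i], a * x[i] );
//
// The policy argument selects the backend's native atomic, so the same body
// macro can be instantiated with cuda_atomic, hip_atomic, omp_atomic, etc.
// ---------------------------------------------------------------------------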
// diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp index 1c33c45f8..9fd78fecf 100644 --- a/src/basic/DAXPY_ATOMIC-Seq.cpp +++ b/src/basic/DAXPY_ATOMIC-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index a9f709276..24ec906c4 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ DAXPY_ATOMIC::DAXPY_ATOMIC(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); setFLOPsPerRep(2 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index 9c2890e48..17bf3979b 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -55,10 +55,12 @@ class DAXPY_ATOMIC : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > @@ -66,7 +68,7 @@ class DAXPY_ATOMIC : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 0702e7d2d..da959e485 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
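// ---------------------------------------------------------------------------
// Aside: note how the DAXPY_ATOMIC traffic accounting above differs from
// plain DAXPY: the update of y[i] is an atomic read-modify-write, so its
// bytes move out of the read/write buckets into the atomic bucket:
//
//   reads                 = 1 * sizeof(Real_type) * N   (x only)
//   writes                = 0
//   atomic modify-writes  = 1 * sizeof(Real_type) * N   (y)
//
// Keeping atomics separate matters because their achievable bandwidth can
// differ substantially from that of streaming reads and writes.
// ---------------------------------------------------------------------------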
// @@ -53,8 +53,13 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - ifquad<<>>( x1, x2, a, b, c, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (ifquad), + grid_size, block_size, + shmem, res.get_stream(), + x1, x2, + a, b, c, + iend ); } stopTimer(); @@ -63,13 +68,18 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto ifquad_lambda = [=] __device__ (Index_type i) { + IF_QUAD_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - IF_QUAD_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, ifquad_lambda ); } stopTimer(); diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 5b47d786b..259306d0f 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,9 +53,13 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x1, x2, a, b, c, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (ifquad), + grid_size, block_size, + shmem, res.get_stream(), + x1, x2, + a, b, c, + iend ); } stopTimer(); @@ -71,9 +75,12 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, ifquad_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, ifquad_lambda ); } stopTimer(); diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp index e952f05fb..297decc78 100644 --- a/src/basic/IF_QUAD-OMP.cpp +++ b/src/basic/IF_QUAD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index d6232ec13..bedec322c 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
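// ---------------------------------------------------------------------------
// Aside: the ifquad kernel and lambdas above all run IF_QUAD_BODY, which
// solves a quadratic per element and branches on the discriminant. In
// spirit (a sketch of the body, not its verbatim definition):
//
//   Real_type s = b[i]*b[i] - 4.0*a[i]*c[i];   // discriminant
//   if ( s >= 0 ) {
//     Real_type s_sqrt = sqrt(s);
//     x1[i] = (-b[i] + s_sqrt) / (2.0*a[i]);
//     x2[i] = (-b[i] - s_sqrt) / (2.0*a[i]);
//   } else {
//     x1[i] = 0.0;
//     x2[i] = 0.0;
//   }
//
// This is consistent with the kernel's stated cost of 11 FLOPs per element
// including one sqrt, and the data-dependent branch is the point of the
// benchmark.
// ---------------------------------------------------------------------------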
// diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index aa2448a1b..14735ecd8 100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp new file mode 100644 index 000000000..17e569c6f --- /dev/null +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + +template +void IF_QUAD::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + IF_QUAD_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + + if (i < iend) { + IF_QUAD_BODY + } + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + IF_QUAD_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n IF_QUAD : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(IF_QUAD, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index c31dc79d4..58ccb9f58 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -28,7 +28,9 @@ IF_QUAD::IF_QUAD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type) + 3*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(11 * getActualProblemSize()); // 1 sqrt checksum_scale_factor = 0.0001 * @@ -56,6 +58,9 @@ IF_QUAD::IF_QUAD(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index f1f3e12a8..a742d22eb 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -69,18 +69,24 @@ class IF_QUAD : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index cb6c88a9e..afcb54176 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -13,11 +13,7 @@ #if defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" - -#include -#include -#include -#include +#include "common/CudaGridScan.hpp" #include @@ -26,177 +22,11 @@ namespace rajaperf namespace basic { - // - // Define magic numbers for CUDA execution - // - const size_t warp_size = 32; - const size_t items_per_thread = 15; - - -// perform a grid scan on val and returns the result at each thread -// in exclusive and inclusive, note that val is used as scratch space -template < size_t block_size, size_t items_per_thread > -__device__ void grid_scan(const int block_id, - Index_type (&val)[items_per_thread], - Index_type (&exclusive)[items_per_thread], - Index_type (&inclusive)[items_per_thread], - Index_type* block_counts, - Index_type* grid_counts, - unsigned* block_readys) -{ - const bool first_block = (block_id == 0); - const bool last_block = (block_id == gridDim.x-1); - const bool last_thread = (threadIdx.x == block_size-1); - const bool last_warp = (threadIdx.x >= block_size - warp_size); - const int warp_index = (threadIdx.x % warp_size); - const unsigned warp_index_mask = (1u << warp_index); - const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u); - - using BlockScan = 
cub::BlockScan; //, cub::BLOCK_SCAN_WARP_SCANS>; - using BlockExchange = cub::BlockExchange; - using WarpReduce = cub::WarpReduce; - - union SharedStorage { - typename BlockScan::TempStorage block_scan_storage; - typename BlockExchange::TempStorage block_exchange_storage; - typename WarpReduce::TempStorage warp_reduce_storage; - volatile Index_type prev_grid_count; - }; - __shared__ SharedStorage s_temp_storage; - - - BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val); - __syncthreads(); - - - BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); - __syncthreads(); - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - inclusive[ti] = exclusive[ti] + val[ti]; - } - - BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive); - __syncthreads(); - BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive); - __syncthreads(); - if (first_block) { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready - } - - } else { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 1u); // write block_counts is ready - } - - // get prev_grid_count using last warp in block - if (last_warp) { - - Index_type prev_grid_count = 0; - - // accumulate previous block counts into registers of warp - - int prev_block_base_id = block_id - warp_size; - - unsigned prev_block_ready = 0u; - unsigned prev_blocks_ready_ballot = 0u; - unsigned prev_grids_ready_ballot = 0u; - - // accumulate full warp worths of block counts - // stop if run out of full warps of a grid count is ready - while (prev_block_base_id >= 0) { - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - do { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - - prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - - } while (prev_blocks_ready_ballot != 0xffffffffu); - - prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - - if (prev_grids_ready_ballot != 0u) { - break; - } - - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - - prev_block_ready = 0u; - - prev_block_base_id -= warp_size; - } - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - // this checks that block counts is ready for all blocks above - // the highest grid count that is ready - while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { - - if (prev_block_id >= 0) { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - } - - prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // read one grid_count from a block with id grid_count_ready_id - // and read the 
block_counts from blocks with higher ids. - if (warp_index_mask > prev_grids_ready_ballot) { - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // accumulate grid_count for grid_count_ready_id - prev_grid_count += grid_counts[prev_block_id]; - } - - - prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); - prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp - - if (last_thread) { - - if (!last_block) { - grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - } - - s_temp_storage.prev_grid_count = prev_grid_count; - } - } - - __syncthreads(); - Index_type prev_grid_count = s_temp_storage.prev_grid_count; - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - exclusive[ti] = prev_grid_count + exclusive[ti]; - inclusive[ti] = prev_grid_count + inclusive[ti]; - } +template < size_t block_size > +using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< + detail::cuda::grid_scan_max_items_per_thread::value+1, + integer::LessEqual::value>>; - if (last_block) { - for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { - while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); - } - } - } -} template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) @@ -208,7 +38,7 @@ __global__ void indexlist(Real_ptr x, Index_type* len, Index_type iend) { - // blocks do start running in order in cuda and hip, so a block with a higher + // blocks do start running in order in cuda, so a block with a higher // index can wait on a block with a lower index without deadlocking // (replace with an atomicInc if this changes) const int block_id = blockIdx.x; @@ -228,7 +58,7 @@ __global__ void indexlist(Real_ptr x, Index_type exclusives[items_per_thread]; Index_type inclusives[items_per_thread]; - grid_scan( + detail::cuda::GridScan::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -246,7 +76,8 @@ __global__ void indexlist(Real_ptr x, } } -template < size_t block_size > + +template < size_t block_size, size_t items_per_thread > void INDEXLIST::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -270,18 +101,18 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) allocData(DataSpace::CudaDevice, grid_counts, grid_size); unsigned* block_readys; allocData(DataSpace::CudaDevice, block_readys, grid_size); - cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - indexlist - <<>>( - x+ibegin, list+ibegin, - block_counts, grid_counts, block_readys, - len, iend-ibegin ); - cudaErrchk( cudaGetLastError() ); + cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, + res.get_stream()) ); + RPlaunchCudaKernel( (indexlist), + grid_size, block_size, + shmem_size, res.get_stream(), + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, iend-ibegin ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); m_len = 
*len; @@ -299,7 +130,98 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INDEXLIST, Cuda) + +void INDEXLIST::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + using cuda_items_per_thread = cuda_items_per_thread_type; + + if (camp::size::value == 0) { + + if (tune_idx == t) { + + runCudaVariantImpl::value + >(vid); + + } + + t += 1; + + } + + seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + if (tune_idx == t) { + + runCudaVariantImpl(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n INDEXLIST : Unknown Cuda variant id = " << vid << std::endl; + + } +} + +void INDEXLIST::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + using cuda_items_per_thread = cuda_items_per_thread_type; + + if (camp::size::value == 0) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_" + "block_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 9b0555057..3be527d35 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -13,11 +13,7 @@ #if defined(RAJA_ENABLE_HIP) #include "common/HipDataUtils.hpp" - -#include -#include -#include -#include +#include "common/HipGridScan.hpp" #include @@ -26,177 +22,11 @@ namespace rajaperf namespace basic { - // - // Define magic numbers for HIP execution - // - const size_t warp_size = 64; - const size_t items_per_thread = 8; - - -// perform a grid scan on val and returns the result at each thread -// in exclusive and inclusive, note that val is used as scratch space -template < size_t block_size, size_t items_per_thread > -__device__ void grid_scan(const int block_id, - Index_type (&val)[items_per_thread], - Index_type (&exclusive)[items_per_thread], - Index_type (&inclusive)[items_per_thread], - Index_type* block_counts, - Index_type* grid_counts, - unsigned* block_readys) -{ - const bool first_block = (block_id == 0); - const bool last_block = (block_id == static_cast(gridDim.x-1)); - const bool last_thread = (threadIdx.x == block_size-1); - const bool last_warp = (threadIdx.x >= block_size - warp_size); - const int warp_index = (threadIdx.x % warp_size); - const unsigned long long warp_index_mask = (1ull << warp_index); - const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull); - - using BlockScan = rocprim::block_scan; //, rocprim::block_scan_algorithm::reduce_then_scan>; - using BlockExchange = rocprim::block_exchange; - using WarpReduce = rocprim::warp_reduce; - - union SharedStorage { - typename BlockScan::storage_type block_scan_storage; - typename BlockExchange::storage_type block_exchange_storage; - typename 
WarpReduce::storage_type warp_reduce_storage; - volatile Index_type prev_grid_count; - }; - __shared__ SharedStorage s_temp_storage; - - - BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage); - __syncthreads(); - - - BlockScan().exclusive_scan(val, exclusive, Index_type{0}, s_temp_storage.block_scan_storage); - __syncthreads(); - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - inclusive[ti] = exclusive[ti] + val[ti]; - } - - BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage); - __syncthreads(); - BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage); - __syncthreads(); - if (first_block) { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready - } - - } else { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 1u); // write block_counts is ready - } - - // get prev_grid_count using last warp in block - if (last_warp) { - - Index_type prev_grid_count = 0; - - // accumulate previous block counts into registers of warp - - int prev_block_base_id = block_id - warp_size; - - unsigned prev_block_ready = 0u; - unsigned long long prev_blocks_ready_ballot = 0ull; - unsigned long long prev_grids_ready_ballot = 0ull; - - // accumulate full warp worths of block counts - // stop if run out of full warps of a grid count is ready - while (prev_block_base_id >= 0) { - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - do { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - - prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); - - } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); - - prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); - - if (prev_grids_ready_ballot != 0ull) { - break; - } - - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - - prev_block_ready = 0u; - - prev_block_base_id -= warp_size; - } - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - // this checks that block counts is ready for all blocks above - // the highest grid count that is ready - while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { - - if (prev_block_id >= 0) { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - } - - prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); - prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); - } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // read one grid_count from a block with id grid_count_ready_id - // and read the block_counts from blocks with higher ids. 
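[Editor's sketch] The ballot-driven wait loop being deleted here (and its CUDA twin earlier in this diff) implements a decoupled look-back handshake, now centralized in common/CudaGridScan.hpp and common/HipGridScan.hpp per the new includes. A minimal single-flag sketch of the release/acquire signaling it relies on, with the per-warp ballot batching stripped out; names are hypothetical and Index_type is a stand-in for RAJAPerf's type:

    using Index_type = long int;  // stand-in for RAJAPerf's Index_type

    // Flag states, as in the code above: 0 = not ready,
    // 1 = block sum ready, 2 = grid prefix ready.
    __device__ void publish_block_sum(Index_type* block_counts,
                                      unsigned* block_readys,
                                      int block_id, Index_type sum)
    {
      block_counts[block_id] = sum;  // write payload first
      __threadfence();               // release: payload visible before flag
      atomicExch(&block_readys[block_id], 1u);
    }

    __device__ Index_type wait_for_block_sum(const Index_type* block_counts,
                                             unsigned* block_readys,
                                             int prev_block_id)
    {
      // atomicCAS with matching compare/swap values is an atomic load
      // (the 11u/11u idiom above); spin until the flag leaves state 0.
      while (atomicCAS(&block_readys[prev_block_id], 11u, 11u) == 0u) { }
      __threadfence();               // acquire: flag seen before payload read
      return block_counts[prev_block_id];
    }

The deleted code amortizes this wait across a full warp with __ballot/__ballot_sync so up to warp_size predecessor counts are consumed per iteration, and stops early once any predecessor publishes a full grid prefix (state 2).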
- if (warp_index_mask > prev_grids_ready_ballot) { - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // accumulate grid_count for grid_count_ready_id - prev_grid_count += grid_counts[prev_block_id]; - } - - - WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); - prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp - - if (last_thread) { - - if (!last_block) { - grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - } - - s_temp_storage.prev_grid_count = prev_grid_count; - } - } - - __syncthreads(); - Index_type prev_grid_count = s_temp_storage.prev_grid_count; - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - exclusive[ti] = prev_grid_count + exclusive[ti]; - inclusive[ti] = prev_grid_count + inclusive[ti]; - } +template < size_t block_size > +using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< + detail::hip::grid_scan_max_items_per_thread::value+1, + integer::LessEqual::value>>; - if (last_block) { - for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { - while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); - } - } - } -} template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) @@ -208,9 +38,9 @@ __global__ void indexlist(Real_ptr x, Index_type* len, Index_type iend) { - // blocks do start running in order in cuda and hip, so a block with a higher - // index can wait on a block with a lower index without deadlocking - // (replace with an atomicInc if this changes) + // It looks like blocks do not start running in order in hip, so a block + // with a higher index can't wait on a block with a lower index without + // deadlocking (have to replace with an atomicInc) const int block_id = blockIdx.x; Index_type vals[items_per_thread]; @@ -228,7 +58,7 @@ __global__ void indexlist(Real_ptr x, Index_type exclusives[items_per_thread]; Index_type inclusives[items_per_thread]; - grid_scan( + detail::hip::GridScan::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -246,7 +76,7 @@ __global__ void indexlist(Real_ptr x, } } -template < size_t block_size > +template < size_t block_size, size_t items_per_thread > void INDEXLIST::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -263,25 +93,26 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) const size_t shmem_size = 0; Index_type* len; - allocData(DataSpace::HipPinned, len, 1); + allocData(DataSpace::HipPinnedCoarse, len, 1); Index_type* block_counts; allocData(DataSpace::HipDevice, block_counts, grid_size); Index_type* grid_counts; allocData(DataSpace::HipDevice, grid_counts, grid_size); unsigned* block_readys; allocData(DataSpace::HipDevice, block_readys, grid_size); - hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - indexlist - <<>>( - x+ibegin, list+ibegin, - block_counts, grid_counts, block_readys, - len, iend-ibegin ); - hipErrchk( hipGetLastError() ); + 
hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, + res.get_stream()) ); + + RPlaunchHipKernel( (indexlist), + grid_size, block_size, + shmem_size, res.get_stream(), + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, iend-ibegin ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); m_len = *len; @@ -289,7 +120,7 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocData(DataSpace::HipPinned, len); + deallocData(DataSpace::HipPinnedCoarse, len); deallocData(DataSpace::HipDevice, block_counts); deallocData(DataSpace::HipDevice, grid_counts); deallocData(DataSpace::HipDevice, block_readys); @@ -299,7 +130,98 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INDEXLIST, Hip) + +void INDEXLIST::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + using hip_items_per_thread = hip_items_per_thread_type; + + if (camp::size::value == 0) { + + if (tune_idx == t) { + + runHipVariantImpl::value + >(vid); + + } + + t += 1; + + } + + seq_for(hip_items_per_thread{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + if (tune_idx == t) { + + runHipVariantImpl(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n INDEXLIST : Unknown Hip variant id = " << vid << std::endl; + + } +} + +void INDEXLIST::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + using hip_items_per_thread = hip_items_per_thread_type; + + if (camp::size::value == 0) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + seq_for(hip_items_per_thread{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_" + "block_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index cb559c8b2..0336d0643 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -28,9 +28,11 @@ INDEXLIST::INDEXLIST(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 + // about 50% output - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Index_type) + + 1*sizeof(Int_type) * getActualProblemSize() / 2 ); // about 50% output + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index 0836d8197..bdf00a446 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -1,7 +1,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-22, Lawrence Livermore National 
Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. +// See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// @@ -63,14 +63,15 @@ class INDEXLIST : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > + + template < size_t block_size, size_t items_per_thread > void runCudaVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t block_size, size_t items_per_thread > void runHipVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Int_ptr m_list; diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 7b6a9ade6..e95524201 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -101,9 +101,11 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - indexlist_conditional<<>>( - x, counts, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (indexlist_conditional), + grid_size, block_size, + shmem, stream, + x, counts, iend ); cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, @@ -114,9 +116,10 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) scan_size, stream)); - indexlist_make_list<<>>( - list, counts, len, iend ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (indexlist_make_list), + grid_size, block_size, + shmem, stream, + list, counts, len, iend ); cudaErrchk( cudaStreamSynchronize(stream) ); m_len = *len; @@ -133,34 +136,42 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_CUDA; + Index_type* len; + allocData(DataSpace::CudaPinned, len, 1); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; }); - RAJA::exclusive_scan_inplace< RAJA::cuda_exec >( res, - RAJA::make_span(counts+ibegin, iend+1-ibegin)); + RAJA::exclusive_scan_inplace< + RAJA::cuda_exec >( + res, + RAJA::make_span(counts+ibegin, iend+1-ibegin) ); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; + } + if (i == iend-1) { + *len = counts[i+1]; } }); - m_len = len.get(); + res.wait(); + m_len = *len; } stopTimer(); + deallocData(DataSpace::CudaPinned, len); + INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA; } else { diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index b4d0d26f8..e1de399a0 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -74,7 +74,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_HIP; Index_type* len; - allocData(DataSpace::HipPinned, len, 1); + allocData(DataSpace::HipPinnedCoarse, len, 1); hipStream_t stream = res.get_stream(); @@ -112,9 +112,11 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, shmem, stream, - x, counts, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (indexlist_conditional), + grid_size, block_size, + shmem, stream, + x, counts, iend ); #if defined(__HIPCC__) hipErrchk(::rocprim::exclusive_scan(d_temp_storage, @@ -136,9 +138,10 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) stream)); #endif - hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, shmem, stream, - list, counts, len, iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (indexlist_make_list), + grid_size, block_size, + shmem, stream, + list, counts, len, iend ); hipErrchk( hipStreamSynchronize(stream) ); m_len = *len; @@ -147,7 +150,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) stopTimer(); deallocData(DataSpace::HipDevice, temp_storage); - deallocData(DataSpace::HipPinned, len); + deallocData(DataSpace::HipPinnedCoarse, len); INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; @@ -155,34 +158,42 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_HIP; + Index_type* len; + allocData(DataSpace::HipPinnedCoarse, len, 1); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; }); - RAJA::exclusive_scan_inplace< RAJA::hip_exec >( res, - RAJA::make_span(counts+ibegin, iend+1-ibegin)); + RAJA::exclusive_scan_inplace< + RAJA::hip_exec >( + res, + RAJA::make_span(counts+ibegin, iend+1-ibegin) ); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; + } + if (i == iend-1) { + *len = counts[i+1]; } }); - m_len = len.get(); + res.wait(); + m_len = *len; } stopTimer(); + deallocData(DataSpace::HipPinnedCoarse, len); + INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; } else { diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index d84736ef7..57cb14c23 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -203,8 +203,6 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -219,11 +217,10 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG [=](Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; } }); - m_len = len.get(); + m_len = counts[iend]; } stopTimer(); diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index 9de3f3393..3828c5652 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -117,8 +117,6 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -133,11 +131,10 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu [=](Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; } }); - m_len = len.get(); + m_len = counts[iend]; } stopTimer(); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 49117dc66..1759f10b0 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -28,14 +28,19 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setItsPerRep( 3 * getActualProblemSize() + 1 ); setKernelsPerRep(3); - setBytesPerRep( (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + - (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (1*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + + 1*sizeof(Index_type) + + 1*sizeof(Index_type) * (getActualProblemSize()+1) + - (0*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + - (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 ); // about 50% output + 1*sizeof(Index_type) * (getActualProblemSize()+1) ); + setBytesWrittenPerRep( 1*sizeof(Index_type) * getActualProblemSize() + + + 1*sizeof(Index_type) + + 1*sizeof(Index_type) * (getActualProblemSize()+1) + + + 1*sizeof(Int_type) * (getActualProblemSize()+1) / 2 ); // about 50% output + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index e19ee5508..5cd2ac8ab 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -1,7 +1,7 @@ 
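[Editor's sketch] The Seq and OMP hunks above drop the RAJA::ReduceSum and read the list length straight off the scan: since counts[] holds 0/1 flags before the exclusive scan, the value scanned into counts[iend] is exactly the number of accepted indices. A small host-only restatement of that identity (names hypothetical, C++17):

    #include <cstddef>
    #include <numeric>   // std::exclusive_scan
    #include <vector>

    // Build an index list of elements satisfying pred; returns list length.
    template <typename Pred>
    std::size_t make_index_list(const std::vector<double>& x,
                                std::vector<std::size_t>& list, Pred pred)
    {
      const std::size_t n = x.size();
      std::vector<std::size_t> counts(n + 1, 0);
      for (std::size_t i = 0; i < n; ++i) counts[i] = pred(x[i]) ? 1 : 0;

      // After the scan, counts[n] holds the total: no separate reduction.
      std::exclusive_scan(counts.begin(), counts.end(), counts.begin(),
                          std::size_t{0});

      list.resize(counts[n]);
      for (std::size_t i = 0; i < n; ++i) {
        if (counts[i] != counts[i + 1]) list[counts[i]] = i;
      }
      return counts[n];  // same role as m_len = counts[iend] above
    }

The GPU RAJA variants above do the same thing, but since counts lives in device memory, the last thread copies counts[i+1] into a pinned len slot that the host reads after res.wait().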
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
-// See the RAJAPerf/COPYRIGHT file for details.
+// See the RAJAPerf/LICENSE file for details.
 //
 // SPDX-License-Identifier: (BSD-3-Clause)
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
@@ -74,6 +74,7 @@ class INDEXLIST_3LOOP : public KernelBase
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
@@ -81,7 +82,7 @@ class INDEXLIST_3LOOP : public KernelBase
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Int_ptr m_list;
diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp
index a6f61d73a..8b3cb9bb7 100644
--- a/src/basic/INIT3-Cuda.cpp
+++ b/src/basic/INIT3-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -53,9 +53,13 @@ void INIT3::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      init3<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( out1, out2, out3, in1, in2,
-                                                                             iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (init3<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          out1, out2, out3,
+                          in1, in2,
+                          iend );
 
     }
     stopTimer();
@@ -65,13 +69,18 @@
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      auto init3_lambda = [=] __device__ (Index_type i) {
+        INIT3_BODY;
+      };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      lambda_cuda_forall<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(
-        ibegin, iend, [=] __device__ (Index_type i) {
-          INIT3_BODY;
-        });
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (lambda_cuda_forall<block_size,
+                                              decltype(init3_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, init3_lambda );
 
     }
     stopTimer();
diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp
index 99f5eec2b..be0f2e74f 100644
--- a/src/basic/INIT3-Hip.cpp
+++ b/src/basic/INIT3-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
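[Editor's sketch] Across these hunks, raw chevron launches followed by cudaErrchk(cudaGetLastError()) are folded into a single RPlaunchCudaKernel call. The suite's own wrapper is not shown in this diff; a rough stand-in with a hypothetical name, illustrating only the launch-plus-immediate-error-check shape the call sites imply:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    inline void check_last_cuda_error(const char* file, int line)
    {
      cudaError_t code = cudaGetLastError();
      if (code != cudaSuccess) {
        std::fprintf(stderr, "CUDA error %s at %s:%d\n",
                     cudaGetErrorString(code), file, line);
        std::exit(1);
      }
    }

    // Hypothetical stand-in for RPlaunchCudaKernel: launch, then check.
    template <typename Kernel, typename... Args>
    void launch_checked(Kernel kernel, dim3 grid, dim3 block,
                        std::size_t shmem, cudaStream_t stream, Args... args)
    {
      kernel<<<grid, block, shmem, stream>>>(args...);
      check_last_cuda_error(__FILE__, __LINE__);
    }

Passing the kernel as a value (e.g. launch_checked(init3<256>, ...)) is what forces the extra parentheses around template-id arguments at the call sites above: without them, the comma inside the template argument list would split the argument.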
// @@ -53,9 +53,13 @@ void INIT3::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((init3), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out1, out2, out3, in1, in2, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (init3), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); } stopTimer(); @@ -71,9 +75,12 @@ void INIT3::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, init3_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, init3_lambda ); } stopTimer(); diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp index 25d31585c..346a92399 100644 --- a/src/basic/INIT3-OMP.cpp +++ b/src/basic/INIT3-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index 825730bdc..0caee8c80 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index 398e986b1..20feb79a4 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp new file mode 100644 index 000000000..ea5277730 --- /dev/null +++ b/src/basic/INIT3-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template +void INIT3::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + INIT3_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + INIT3_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + INIT3_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n INIT3 : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT3, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index bbf90da80..1f0da97f3 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ INIT3::INIT3(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); @@ -52,6 +54,9 @@ INIT3::INIT3(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index aed67bfeb..7e5f6a026 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
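[Editor's sketch] The new Base_SYCL variants in this diff all share one shape: round the global range up to a whole number of work-groups, then guard the tail work-items. A stripped-down, self-contained rendering of that pattern (SYCL 2020; the kernel body here is a trivial stand-in for the INIT3_BODY macro):

    #include <sycl/sycl.hpp>

    int main()
    {
      constexpr size_t work_group_size = 256;
      const size_t iend = 1000;   // deliberately not a group-size multiple

      sycl::queue q;
      double* out = sycl::malloc_shared<double>(iend, q);

      // Round up so every work-group is full; extras are masked below.
      const size_t global_size =
          work_group_size * ((iend + work_group_size - 1) / work_group_size);

      q.submit([&](sycl::handler& h) {
        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
                       [=](sycl::nd_item<1> item) {
          const size_t i = item.get_global_id(0);
          if (i < iend) {     // tail guard, as in the Base_SYCL hunks
            out[i] = 0.0;     // stand-in for INIT3_BODY
          }
        });
      }).wait();

      sycl::free(out, q);
      return 0;
    }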
// @@ -55,18 +55,24 @@ class INIT3 : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index ca6fbdf3c..e535ac041 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,8 +53,11 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - initview1d<<>>( a, v, iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (initview1d), + grid_size, block_size, + shmem, res.get_stream(), + a, v, iend ); } stopTimer(); @@ -64,13 +67,18 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto initview1d_lambda = [=] __device__ (Index_type i) { + INIT_VIEW1D_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - INIT_VIEW1D_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_lambda ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 0951d954f..130a62a42 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
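[Editor's sketch] The Lambda_CUDA hunks now hoist the device lambda into a named variable so its type can appear in the launch wrapper's template arguments via decltype. The lambda_cuda_forall kernel itself is just a bounds-guarded trampoline; a hedged reconstruction of its shape (the suite's real definition lives in the common headers, and Index_type is a stand-in here):

    using Index_type = long int;  // stand-in for RAJAPerf's Index_type

    // Generic forall trampoline: one thread per index, body run in range.
    template <size_t block_size, typename Lambda>
    __launch_bounds__(block_size)
    __global__ void lambda_forall(Index_type ibegin, Index_type iend,
                                  Lambda body)
    {
      Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x;
      if (i < iend) {
        body(i);
      }
    }

Hoisting the lambda also keeps Base and Lambda variants symmetric: both now go through the same RPlaunch* call shape, differing only in what is passed as the kernel.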
// @@ -53,9 +53,11 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - a, v, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (initview1d), + grid_size, block_size, + shmem, res.get_stream(), + a, v, iend ); } stopTimer(); @@ -71,9 +73,12 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_lambda ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp index 742270ff6..52160ab13 100644 --- a/src/basic/INIT_VIEW1D-OMP.cpp +++ b/src/basic/INIT_VIEW1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index d9ad636e1..825fcd569 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp index 59c494c49..b1c761a5c 100644 --- a/src/basic/INIT_VIEW1D-Seq.cpp +++ b/src/basic/INIT_VIEW1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp new file mode 100644 index 000000000..ff06d2203 --- /dev/null +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -0,0 +1,83 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + +template +void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + INIT_VIEW1D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + INIT_VIEW1D_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + INIT_VIEW1D_VIEW_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); + + } + stopTimer(); + + } else { + std::cout << "\n INIT_VIEW1D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT_VIEW1D, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 018811f34..eb24e8e8e 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); @@ -53,6 +55,9 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index f3770f69a..0a3be36c3 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
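[Editor's sketch] The recurring setBytesPerRep replacement in this diff splits one lump sum into bytes read, bytes written, and bytes atomically modified per repetition, making the memory-traffic model explicit per direction. For INIT_VIEW1D above, the accounting is a pure store of one Real_type per element; restated as trivial arithmetic (this is a back-of-envelope illustration, not suite code):

    #include <cstddef>

    using Real_type = double;

    // INIT_VIEW1D per-rep traffic: nothing read, one store per element,
    // no atomic read-modify-writes.
    constexpr std::size_t bytes_read(std::size_t)      { return 0; }
    constexpr std::size_t bytes_written(std::size_t n) { return n * sizeof(Real_type); }
    constexpr std::size_t bytes_atomic(std::size_t)    { return 0; }

    static_assert(bytes_written(100) == 800,
                  "8-byte Real_type, exactly one store per element");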
// @@ -66,18 +66,24 @@ class INIT_VIEW1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_type m_val; diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 7d9bee43b..3a13f5210 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -54,10 +54,12 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - initview1d_offset<<>>( a, v, - ibegin, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (initview1d_offset), + grid_size, block_size, + shmem, res.get_stream(), + a, v, + ibegin, iend ); } stopTimer(); @@ -67,13 +69,18 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto initview1d_offset_lambda = [=] __device__ (Index_type i) { + INIT_VIEW1D_OFFSET_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - INIT_VIEW1D_OFFSET_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_offset_lambda ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 2fb16872f..2940bb59d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -54,9 +54,12 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - a, v, ibegin, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (initview1d_offset), + grid_size, block_size, + shmem, res.get_stream(), + a, v, + ibegin, iend ); } stopTimer(); @@ -72,9 +75,12 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_offset_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_offset_lambda ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp index 8fb7c0129..bb6834c17 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index d045462d7..f87fa2625 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp index c25511aa1..b7588350a 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp new file mode 100644 index 000000000..f586540f6 --- /dev/null +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template +void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize()+1; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + INIT_VIEW1D_OFFSET_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend-ibegin, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = ibegin + item.get_global_id(0); + if (i < iend) { + INIT_VIEW1D_OFFSET_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + INIT_VIEW1D_OFFSET_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT_VIEW1D_OFFSET, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 4daa109a6..1eef8fc3d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); @@ -53,6 +55,9 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index d32f59c7b..92a75935d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
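[Editor's sketch] Both the hand-written runCudaVariant/runHipVariant bodies earlier in this diff and the RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE macro used by the new SYCL files walk a compile-time list of sizes and count matches until the requested tune_idx is reached; set*TuningDefinitions must perform the identical walk so registered tuning names line up with run order. A plain C++17 sketch of that enumeration scheme, with seq_for and the RunParams filtering omitted and hypothetical names:

    #include <cstddef>
    #include <iostream>

    template <std::size_t block_size>
    void run_impl() { std::cout << "running block_" << block_size << "\n"; }

    // Visit each compile-time candidate in order; fire the one whose
    // position matches tune_idx. Mirrors seq_for(gpu_block_sizes_type{}, ...).
    template <std::size_t... Sizes>
    void run_variant(std::size_t tune_idx)
    {
      std::size_t t = 0;
      ((tune_idx == t++ ? run_impl<Sizes>() : void()), ...);
    }

    int main()
    {
      run_variant<128, 256, 512>(1);  // selects block_256
      return 0;
    }

The INDEXLIST variants extend the same counter over a second axis, items_per_thread, which is why a single tune_idx can address a (block_size, items_per_thread) pair.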
// @@ -65,18 +65,24 @@ class INIT_VIEW1D_OFFSET : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_type m_val; diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index f63af21d7..926c5f979 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -50,7 +50,7 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, template < size_t block_size > void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) { - constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + constexpr Index_type tile_size = integer::sqrt(block_size); static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); @@ -73,9 +73,10 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - mat_mat_shared<<>>(N, C, A, B); - - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (mat_mat_shared), + gridDim, blockDim, + shmem, res.get_stream(), + N, C, A, B ); } stopTimer(); @@ -84,7 +85,8 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - lambda_cuda<<>>([=] __device__() { + auto mat_mat_shared_lambda = [=] __device__() { + auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { MAT_MAT_SHARED_BODY_0(tile_size) @@ -171,9 +173,13 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) Index_type by = blockIdx.y; if(by < Ny) outer_y(by); } - }); + }; - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (lambda_cuda), + gridDim, blockDim, + shmem, res.get_stream(), + mat_mat_shared_lambda ); } stopTimer(); diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index d548395e3..9c58d9267 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -50,7 +50,7 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, template < size_t block_size > void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) { - constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + constexpr Index_type tile_size = integer::sqrt(block_size); static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); @@ -73,10 +73,10 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((mat_mat_shared), dim3(gridDim), dim3(blockDim), shmem, res.get_stream(), - N, C, A, B); - - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (mat_mat_shared), + gridDim, blockDim, + shmem, res.get_stream(), + N, C, A, B ); } stopTimer(); @@ -85,7 
+85,7 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - auto mat_mat_shared_lam = [=] __device__() { + auto mat_mat_shared_lambda = [=] __device__() { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { @@ -175,10 +175,11 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((lambda_hip), - gridDim, blockDim, shmem, res.get_stream(), mat_mat_shared_lam); - - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (lambda_hip), + gridDim, blockDim, + shmem, res.get_stream(), + mat_mat_shared_lambda ); } stopTimer(); diff --git a/src/basic/MAT_MAT_SHARED-Sycl.cpp b/src/basic/MAT_MAT_SHARED-Sycl.cpp new file mode 100644 index 000000000..174ac0952 --- /dev/null +++ b/src/basic/MAT_MAT_SHARED-Sycl.cpp @@ -0,0 +1,201 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_MAT_SHARED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +template < size_t work_group_size > +void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) +{ + constexpr Index_type tile_size = integer::sqrt(work_group_size); + static_assert(tile_size*tile_size == work_group_size, "Invalid block_size"); + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + + const Index_type Nx = RAJA_DIVIDE_CEILING_INT(N, tile_size); + const Index_type Ny = RAJA_DIVIDE_CEILING_INT(N, tile_size); + + //Right most is the fastest index + const ::sycl::range<3> workGroupSize(1, tile_size, tile_size); + const ::sycl::range<3> gridSize(1, Ny*tile_size, Nx*tile_size); + + constexpr size_t shmem = tile_size * tile_size; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MAT_MAT_SHARED_DATA_SETUP; + + if (vid == Base_SYCL) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&](cl::sycl::handler& h) { + + ::sycl::local_accessor As(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (cl::sycl::nd_item<3> itm) { + + Index_type tx = itm.get_local_id(2); + Index_type ty = itm.get_local_id(1); + Index_type bx = itm.get_group(2); + Index_type by = itm.get_group(1); + + MAT_MAT_SHARED_BODY_1(tile_size) + + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { + + MAT_MAT_SHARED_BODY_2(tile_size) + + itm.barrier(::sycl::access::fence_space::local_space); + + MAT_MAT_SHARED_BODY_3(tile_size) + + itm.barrier(::sycl::access::fence_space::local_space); + } + + MAT_MAT_SHARED_BODY_4(tile_size) + + }); + + }); + + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + constexpr bool async = true; + + const int local_mats = 3; + constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(double); + + using launch_policy = RAJA::LaunchPolicy>; + + using teams_x = RAJA::LoopPolicy; + + using teams_y = RAJA::LoopPolicy; + + using threads_x = RAJA::LoopPolicy; + + using threads_y = 
RAJA::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(Nx, Ny), + RAJA::Threads(tile_size, tile_size), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, Ny), + [&](Index_type by) { + RAJA::loop(ctx, RAJA::RangeSegment(0, Nx), + [&](Index_type bx) { + + //We only support dynamic shared memory in Sycl + //Thus requiring a different setup than other backends + //which use static shared memory + double * As_ptr = ctx.getSharedMemory(tile_size * tile_size); + double * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); + double * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); + double (*As)[tile_size] = (double (*)[tile_size]) As_ptr; + double (*Bs)[tile_size] = (double (*)[tile_size]) Bs_ptr; + double (*Cs)[tile_size] = (double (*)[tile_size]) Cs_ptr; + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_1(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, + RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_2(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + } // for (k) + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_4(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + } // lambda (bx) + ); // RAJA::loop + } // lambda (by) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + } else { + getCout() << "\n MAT_MAT_SHARED : Unknown Sycl variant id = " << vid + << std::endl; + } + +} + + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MAT_MAT_SHARED, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 2173a1bc6..61a85e898 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -24,15 +24,16 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) setDefaultProblemSize(m_N_default*m_N_default); setDefaultReps(5); - m_N = std::max(Index_type(std::sqrt(getTargetProblemSize())), Index_type(1)); + m_N = std::sqrt(getTargetProblemSize()) + std::sqrt(2)-1; setActualProblemSize(m_N * m_N); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( m_N*m_N*sizeof(Real_type) + - m_N*m_N*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * m_N*m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_N*m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); const Index_type no_tiles = (TL_SZ + m_N - 1) / TL_SZ; const Index_type no_blocks = RAJA_DIVIDE_CEILING_INT(m_N, TL_SZ); @@ -60,6 +61,9 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) setVariantDefined(Base_HIP); setVariantDefined(Lambda_HIP); setVariantDefined(RAJA_HIP); + + setVariantDefined(Base_SYCL); + setVariantDefined(RAJA_SYCL); } 
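// Editorial note (not part of the patch): the constructor hunk above replaces
// the single setBytesPerRep() figure with separate read / write / atomic
// streams. A minimal sketch of the arithmetic for an N x N double-precision
// matmul; N and the element type are illustrative assumptions:

#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t N = 1000;                    // hypothetical m_N
  const std::size_t elem = sizeof(double);       // Real_type assumed double
  const std::size_t read    = 2 * elem * N * N;  // A and B each read once
  const std::size_t written = 1 * elem * N * N;  // C written once
  std::printf("per rep: %zu bytes read, %zu bytes written, 0 atomic\n",
              read, written);
  return 0;
}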
MAT_MAT_SHARED::~MAT_MAT_SHARED() {} diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 095721c27..b543dd4f7 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -94,7 +94,7 @@ constexpr rajaperf::Index_type TL_SZ = 16; RAJA_TEAM_SHARED double Bs[tile_size][tile_size]; \ RAJA_TEAM_SHARED double Cs[tile_size][tile_size]; -#define MAT_MAT_SHARED_BODY_1(tile_size) \ +#define MAT_MAT_SHARED_BODY_1(tile_size) \ Cs[ty][tx] = 0; #define MAT_MAT_SHARED_BODY_2(tile_size) \ @@ -139,17 +139,22 @@ class MAT_MAT_SHARED : public KernelBase { void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = TL_SZ * TL_SZ; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_A; Real_ptr m_B; diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 3f0dec4dd..e39cd5a77 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
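// Editorial sketch (not part of the patch): the runCudaVariantImpl /
// runHipVariantImpl / runSyclVariantImpl templates declared above are
// instantiated once per entry of a compile-time size list. The suite walks
// such lists with camp::list and seq_for; this stand-alone analogue uses
// std::index_sequence and a fold expression instead:

#include <cstddef>
#include <cstdio>
#include <utility>

template < std::size_t block_size >
void run_variant_impl() { std::printf("block_size %zu\n", block_size); }

template < std::size_t... Sizes >
void dispatch(std::size_t requested, std::index_sequence<Sizes...>)
{
  // Invoke the one instantiation whose compile-time size matches the request.
  ((requested == Sizes ? run_variant_impl<Sizes>() : void()), ...);
}

int main()
{
  dispatch(256, std::index_sequence<64, 128, 256, 512>{});  // -> block_size 256
  return 0;
}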
// @@ -53,9 +53,13 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - muladdsub<<>>( out1, out2, out3, in1, in2, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (muladdsub), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); } stopTimer(); @@ -65,13 +69,18 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto muladdsub_lambda = [=] __device__ (Index_type i) { + MULADDSUB_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - MULADDSUB_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, muladdsub_lambda ); } stopTimer(); diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 9d292001f..bb846eef5 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,9 +53,13 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - out1, out2, out3, in1, in2, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (muladdsub), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); } stopTimer(); @@ -71,9 +75,12 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, muladdsub_lambda ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, muladdsub_lambda ); } stopTimer(); diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp index 6c9bb2038..28f5edab1 100644 --- a/src/basic/MULADDSUB-OMP.cpp +++ b/src/basic/MULADDSUB-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index af691d008..f4d0e716a 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
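// Editorial sketch (an assumption about the wrapper's role, not its actual
// definition): the MULADDSUB hunks above replace raw <<<...>>> and
// hipLaunchKernelGGL launches, each followed by an inline cudaErrchk /
// hipErrchk, with single RPlaunchCudaKernel / RPlaunchHipKernel calls.
// A minimal CUDA-side analogue of such a wrapper:

#include <cuda_runtime.h>
#include <cstdio>

template < typename Kernel, typename... Args >
void launch_checked(Kernel kernel, dim3 grid, dim3 block,
                    size_t shmem, cudaStream_t stream, Args... args)
{
  kernel<<<grid, block, shmem, stream>>>(args...);
  cudaError_t err = cudaGetLastError();  // the check the old call sites did inline
  if (err != cudaSuccess) {
    std::fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
  }
}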
// diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index 59ddf1ea1..40ede6d64 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp new file mode 100644 index 000000000..9bca65221 --- /dev/null +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template +void MULADDSUB::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MULADDSUB_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + MULADDSUB_BODY + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + MULADDSUB_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n MULADDSUB : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MULADDSUB, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index a5deb6049..4ab19194c 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
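// Editorial sketch (not part of the patch): the Base_SYCL variant above sizes
// its nd_range by rounding the iteration count up to a work-group multiple and
// guarding the overhang inside the kernel. The idiom in isolation; the kernel
// body and names are purely illustrative:

#include <sycl/sycl.hpp>

void scaled_copy(sycl::queue& q, double* y, const double* x, size_t n)
{
  constexpr size_t wg = 256;                       // assumed work-group size
  const size_t global = wg * ((n + wg - 1) / wg);  // RAJA_DIVIDE_CEILING_INT
  q.submit([&](sycl::handler& h) {
    h.parallel_for(sycl::nd_range<1>(global, wg), [=](sycl::nd_item<1> item) {
      size_t i = item.get_global_id(0);
      if (i < n) {          // work-items past n fall through harmlessly
        y[i] = 2.0 * x[i];
      }
    });
  });
}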
// @@ -28,7 +28,9 @@ MULADDSUB::MULADDSUB(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * getActualProblemSize()); setUsesFeature(Forall); @@ -52,6 +54,9 @@ MULADDSUB::MULADDSUB(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index e604a34c8..1846a49a7 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -58,18 +58,24 @@ class MULADDSUB : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp new file mode 100644 index 000000000..fa52f9e99 --- /dev/null +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -0,0 +1,302 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
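// Editorial note (reference semantics, written out): the MULADDSUB accounting
// above -- 2 reads, 3 writes, 3 FLOPs per element -- follows directly from the
// kernel body, shown here as a plain sequential loop:

void muladdsub_ref(double* out1, double* out2, double* out3,
                   const double* in1, const double* in2, long n)
{
  for (long i = 0; i < n; ++i) {
    out1[i] = in1[i] * in2[i];   // 1 multiply
    out2[i] = in1[i] + in2[i];   // 1 add
    out3[i] = in1[i] - in2[i];   // 1 subtract
  }
}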
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +constexpr Index_type warp_size = 32; + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values, + MULTI_REDUCE::Data_ptr data, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) +{ + if (shared_replication > 0) { + + extern __shared__ MULTI_REDUCE::Data_type shared_values[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_values[t] = MULTI_REDUCE::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); + RAJA::atomicAdd(&shared_values[offset], data[i]); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = MULTI_REDUCE::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_values[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != MULTI_REDUCE::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], block_sum); + } + } + + } else { + + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], data[i]); + } + } +} + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > +void MULTI_REDUCE::runCudaVariantAtomicRuntime(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + MULTI_REDUCE_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + auto* func = &multi_reduce_atomic_runtime; + + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; + + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + + const Index_type max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, global_replication); + + startTimer(); + for (RepIndex_type irep = 
0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, global_replication); + + RPlaunchCudaKernel( func, + grid_size, block_size, + shmem, res.get_stream(), + values, + data, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type value_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + value_final += hvalues[offset]; + } + values_final[bin] = value_final; + } + + } + stopTimer(); + + RAJAPERF_CUDA_REDUCER_TEARDOWN(values, hvalues); + + } else if ( vid == RAJA_CUDA ) { + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< + RAJA::cuda::MultiReduceTuning< + RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES_RAJA(multi_reduce_policy); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + MULTI_REDUCE_BODY; + }); + + MULTI_REDUCE_FINALIZE_VALUES_RAJA(multi_reduce_policy); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } + +} + +void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + } + + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + }); + + } + + }); + + }); + + } + + }); + + } else { + + getCout() << "\n MULTI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void MULTI_REDUCE::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + + seq_for(cuda_atomic_global_replications_type{}, [&](auto 
global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); + + }); + + } + + }); + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp new file mode 100644 index 000000000..e2106a79e --- /dev/null +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -0,0 +1,302 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +constexpr Index_type warp_size = 64; + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values, + MULTI_REDUCE::Data_ptr data, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) +{ + if (shared_replication > 0) { + + extern __shared__ MULTI_REDUCE::Data_type shared_values[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_values[t] = MULTI_REDUCE::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); + RAJA::atomicAdd(&shared_values[offset], data[i]); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = MULTI_REDUCE::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_values[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != MULTI_REDUCE::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], block_sum); + } + } + + } else { + + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + for ( ; i < iend ; i += gridDim.x * block_size ) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], data[i]); + } + } +} + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > +void MULTI_REDUCE::runHipVariantAtomicRuntime(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + MULTI_REDUCE_DATA_SETUP; + + if ( vid == Base_HIP ) { + + auto* func = 
&multi_reduce_atomic_runtime; + + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; + + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + + const Index_type max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, global_replication); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, global_replication); + + RPlaunchHipKernel( func, + grid_size, block_size, + shmem, res.get_stream(), + values, + data, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type value_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + value_final += hvalues[offset]; + } + values_final[bin] = value_final; + } + + } + stopTimer(); + + RAJAPERF_HIP_REDUCER_TEARDOWN(values, hvalues); + + } else if ( vid == RAJA_HIP ) { + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< + RAJA::hip::MultiReduceTuning< + RAJA::hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES_RAJA(multi_reduce_policy); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + MULTI_REDUCE_BODY; + }); + + MULTI_REDUCE_FINALIZE_VALUES_RAJA(multi_reduce_policy); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } + +} + +void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + } + + seq_for(hip_atomic_global_replications_type{}, [&](auto 
global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + }); + + } + + }); + + }); + + } + + }); + + } else { + + getCout() << "\n MULTI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void MULTI_REDUCE::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { + + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); + + }); + + } + + }); + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/MULTI_REDUCE-OMP.cpp b/src/basic/MULTI_REDUCE-OMP.cpp new file mode 100644 index 000000000..2e2ebf5d4 --- /dev/null +++ b/src/basic/MULTI_REDUCE-OMP.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
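// Editorial sketch (not part of the patch): the CUDA and HIP kernels above fan
// atomic updates out over `global_replication` copies of each bin and sum the
// copies afterwards; replication counts are forced to powers of two so that
// RAJA::power_of_2_mod reduces to a bit mask. Plain C++ analogue
// (std::atomic<double>::fetch_add requires C++20), with std::atomic standing
// in for device atomics:

#include <atomic>
#include <vector>

void binned_sum(std::vector<std::atomic<double>>& replicas, // num_bins * replication slots
                const int* bins, const double* data, long n,
                int num_bins, int replication /* power of two */)
{
  for (long i = 0; i < n; ++i) {   // imagine iterations running concurrently
    int r = static_cast<int>(i) & (replication - 1);  // power_of_2_mod
    replicas[bins[i] + r * num_bins].fetch_add(data[i]);
  }
}

double finalize_bin(const std::vector<std::atomic<double>>& replicas,
                    int bin, int num_bins, int replication)
{
  double sum = 0.0;                // mirrors the REDUCER_COPY_BACK loop above
  for (int r = 0; r < replication; ++r) {
    sum += replicas[bin + r * num_bins].load();
  }
  return sum;
}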
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void MULTI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + MULTI_REDUCE_SETUP_VALUES; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + MULTI_REDUCE_BODY; + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + MULTI_REDUCE_TEARDOWN_VALUES; + + break; + } + + case Lambda_OpenMP : { + + MULTI_REDUCE_SETUP_VALUES; + + auto multi_reduce_base_lam = [=](Index_type i) { + #pragma omp atomic + MULTI_REDUCE_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + multi_reduce_base_lam(i); + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + MULTI_REDUCE_TEARDOWN_VALUES; + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::omp_multi_reduce); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + MULTI_REDUCE_BODY; + }); + + MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::omp_multi_reduce); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MULTI_REDUCE : Unknown variant id = " << vid << std::endl; + } + + } + + MULTI_REDUCE_DATA_TEARDOWN; + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE-OMPTarget.cpp b/src/basic/MULTI_REDUCE-OMPTarget.cpp new file mode 100644 index 000000000..8c2e18060 --- /dev/null +++ b/src/basic/MULTI_REDUCE-OMPTarget.cpp @@ -0,0 +1,68 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void MULTI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(values, values_init, num_bins); + + #pragma omp target is_device_ptr(values, bins, data) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + MULTI_REDUCE_BODY; + } + + getOpenMPDeviceData(values_final, values, num_bins); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; + } + + MULTI_REDUCE_DATA_TEARDOWN; + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/MULTI_REDUCE-Seq.cpp b/src/basic/MULTI_REDUCE-Seq.cpp new file mode 100644 index 000000000..a771953aa --- /dev/null +++ b/src/basic/MULTI_REDUCE-Seq.cpp @@ -0,0 +1,114 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void MULTI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + MULTI_REDUCE_SETUP_VALUES; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + for (Index_type i = ibegin; i < iend; ++i ) { + MULTI_REDUCE_BODY; + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + MULTI_REDUCE_TEARDOWN_VALUES; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + MULTI_REDUCE_SETUP_VALUES; + + auto multi_reduce_base_lam = [=](Index_type i) { + MULTI_REDUCE_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + for (Index_type i = ibegin; i < iend; ++i ) { + multi_reduce_base_lam(i); + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + MULTI_REDUCE_TEARDOWN_VALUES; + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::seq_multi_reduce); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + MULTI_REDUCE_BODY; + }); + + MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::seq_multi_reduce); + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n MULTI_REDUCE : Unknown variant id = " << vid << std::endl; + } + + } + + MULTI_REDUCE_DATA_TEARDOWN; + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp new file mode 100644 index 000000000..8fc6ee6c6 --- /dev/null +++ b/src/basic/MULTI_REDUCE.cpp @@ -0,0 +1,154 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
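// Editorial sketch (not part of the patch): every RAJA_* variant above follows
// the same three-step multi-reduce protocol -- construct from per-bin initial
// values, accumulate through operator[], harvest with get_all() -- which the
// INIT/FINALIZE macros in MULTI_REDUCE.hpp spell out. Isolated, with the
// sequential policy:

#include "RAJA/RAJA.hpp"
#include <vector>

void multi_reduce_demo(const int* bins, const double* data, long n,
                       const std::vector<double>& init,
                       std::vector<double>& final_values)
{
  RAJA::MultiReduceSum<RAJA::seq_multi_reduce, double> values(init);
  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
    [=](RAJA::Index_type i) {
      values[bins[i]] += data[i];   // MULTI_REDUCE_BODY
    });
  values.get_all(final_values);     // one reduced sum per bin
}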
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include +#include + +namespace rajaperf +{ +namespace basic +{ + + +MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) + : KernelBase(rajaperf::Basic_MULTI_REDUCE, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + m_num_bins = params.getMultiReduceNumBins(); + m_bin_assignment_algorithm = params.getMultiReduceBinAssignmentAlgorithm(); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins + + 1*sizeof(Data_type) * getActualProblemSize() + + 1*sizeof(Index_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(1 * getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); +} + +MULTI_REDUCE::~MULTI_REDUCE() +{ +} + +void MULTI_REDUCE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocData(m_bins, getActualProblemSize(), vid); + allocAndInitDataRandValue(m_data, getActualProblemSize(), vid); + { + auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); + + const bool init_random_per_iterate = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Random); + const bool init_random_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsRandomSizes); + const bool init_even_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsEvenSizes); + const bool init_all_one = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Single); + + if (init_even_sizes || init_random_sizes || init_all_one) { + Real_ptr data = nullptr; + if (init_even_sizes) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(b+1) / m_num_bins; + } + } else if (init_random_sizes) { + allocAndInitDataRandValue(data, m_num_bins, Base_Seq); + std::sort(data, data+m_num_bins); + } else if (init_all_one) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(0); + } + } + + Index_type actual_prob_size = getActualProblemSize(); + Index_type bin = 0; + for (Index_type i = 0; i < actual_prob_size; ++i) { + Real_type pos = static_cast(i) / actual_prob_size; + while (bin+1 < m_num_bins && pos >= data[bin]) { + bin += 1; + } + m_bins[i] = bin; + } + + deallocData(data, Base_Seq); + + } else if (init_random_per_iterate) { + Real_ptr data; + allocAndInitDataRandValue(data, getActualProblemSize(), Base_Seq); + + for (Index_type i = 0; i < getActualProblemSize(); ++i) { + m_bins[i] = static_cast(data[i] * m_num_bins); + if (m_bins[i] >= m_num_bins) { + m_bins[i] = m_num_bins - 1; + } + if (m_bins[i] < 0) { + m_bins[i] = 0; + } + } + + deallocData(data, Base_Seq); + } else { + throw 1; + } + } + + 
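// Editorial note on the setUp() logic above: m_bins is populated according to
// the requested BinAssignmentAlgorithm --
//   Single          : one contiguous run, every iterate mapped to a single bin;
//   RunsEvenSizes   : contiguous runs, all bins the same size;
//   RunsRandomSizes : contiguous runs with randomly drawn sizes;
//   Random          : each iterate draws its bin independently.
// The bare `throw 1;` is the fallback for an unrecognized algorithm value.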
m_values_init.resize(m_num_bins, 0.0); + m_values_final.resize(m_num_bins, 0.0); +} + +void MULTI_REDUCE::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_values_final.data(), m_num_bins, vid); +} + +void MULTI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_bins, vid); + deallocData(m_data, vid); + m_values_init.clear(); m_values_init.shrink_to_fit(); + m_values_final.clear(); m_values_final.shrink_to_fit(); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp new file mode 100644 index 000000000..cf8d99185 --- /dev/null +++ b/src/basic/MULTI_REDUCE.hpp @@ -0,0 +1,139 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// MULTI_REDUCE kernel reference implementation: +/// +/// double* values = calloc(num_bins, sizeof(double)); +/// for (Index_type i = 0; i < N; ++i ) { +/// values[bins[i]] += data[i]; +/// } +/// + +#ifndef RAJAPerf_Basic_MULTI_REDUCE_HPP +#define RAJAPerf_Basic_MULTI_REDUCE_HPP + +#define MULTI_REDUCE_DATA_SETUP \ + Index_type num_bins = m_num_bins; \ + Index_ptr bins = m_bins; \ + Data_ptr data = m_data; \ + std::vector& values_init = m_values_init; \ + std::vector& values_final = m_values_final; + +#define MULTI_REDUCE_DATA_TEARDOWN + + +#define MULTI_REDUCE_SETUP_VALUES \ + Data_ptr values; \ + allocData(getReductionDataSpace(vid), values, num_bins); + +#define MULTI_REDUCE_TEARDOWN_VALUES \ + deallocData(values, vid); + +#define MULTI_REDUCE_INIT_VALUES \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + values[b] = values_init[b]; \ + } + +#define MULTI_REDUCE_FINALIZE_VALUES \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + values_final[b] = values[b]; \ + } + +#define MULTI_REDUCE_INIT_VALUES_RAJA(policy) \ + RAJA::MultiReduceSum values(values_init); + +#define MULTI_REDUCE_FINALIZE_VALUES_RAJA(policy) \ + values.get_all(values_final); + +#define MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication) \ + for (Index_type b = 0; b < (num_bins); ++b) { \ + Data_type val_final = 0; \ + for (size_t r = 0; r < (replication); ++r) { \ + val_final += (hvalues)[b*(replication) + r]; \ + } \ + values_final[b] = val_final; \ + } + + +#define MULTI_REDUCE_BODY \ + values[bins[i]] += data[i]; + +#define MULTI_REDUCE_RAJA_BODY(policy) \ + RAJA::atomicAdd(&values[bins[i]], data[i]); + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class MULTI_REDUCE : public KernelBase +{ +public: + using Data_type = Real_type; + using Data_ptr = Real_ptr; + + MULTI_REDUCE(const RunParams& params); + + ~MULTI_REDUCE(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void 
setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > + void runCudaVariantAtomicRuntime(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication, + typename MappingHelper > + void runHipVariantAtomicRuntime(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + static const size_t default_cuda_atomic_global_replication = 2; + static const size_t default_cuda_atomic_shared_replication = 16; + using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + + static const size_t default_hip_atomic_global_replication = 32; + static const size_t default_hip_atomic_shared_replication = 4; + using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + + Index_type m_num_bins; + RunParams::BinAssignmentAlgorithm m_bin_assignment_algorithm; + Index_ptr m_bins; + Data_ptr m_data; + std::vector m_values_init; + std::vector m_values_final; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 605778eb7..74e4136d2 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -89,10 +89,11 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) NESTED_INIT_NBLOCKS_CUDA; constexpr size_t shmem = 0; - nested_init - <<>>(array, - ni, nj, nk); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (nested_init), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + array, ni, nj, nk ); } stopTimer(); @@ -102,17 +103,23 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto nested_init_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + NESTED_INIT_BODY; + }; + NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; constexpr size_t shmem = 0; - nested_init_lam - <<>>(ni, nj, nk, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - } - ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (nested_init_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, nk, + nested_init_lambda ); } stopTimer(); @@ -136,10 +143,11 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }); diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index b7d023d7f..f7ea66dd4 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -89,10 +89,11 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) NESTED_INIT_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((nested_init), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - array, ni, nj, nk); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (nested_init), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + array, ni, nj, nk ); } stopTimer(); @@ -102,19 +103,23 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - NESTED_INIT_THREADS_PER_BLOCK_HIP; - NESTED_INIT_NBLOCKS_HIP; - constexpr size_t shmem = 0; - - auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, + auto nested_init_lambda = [=] __device__ (Index_type i, + Index_type j, Index_type k) { NESTED_INIT_BODY; }; - hipLaunchKernelGGL((nested_init_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - ni, nj, nk, nested_init_lambda); - hipErrchk( hipGetLastError() ); + NESTED_INIT_THREADS_PER_BLOCK_HIP; + NESTED_INIT_NBLOCKS_HIP; + constexpr size_t shmem = 0; + + RPlaunchHipKernel( + (nested_init_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, nk, + nested_init_lambda ); } stopTimer(); @@ -138,10 +143,11 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }); diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp index 3b1e07767..3fa73fa5b 100644 --- a/src/basic/NESTED_INIT-OMP.cpp +++ b/src/basic/NESTED_INIT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index 607c8befe..6e6538dfd 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp index d3ce50d65..bc277ce27 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp new file mode 100644 index 000000000..950a6b56b --- /dev/null +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -0,0 +1,109 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define work-group shape for SYCL execution + // +#define i_wg_sz (32) +#define j_wg_sz (work_group_size / i_wg_sz) +#define k_wg_sz (1) + +template +void NESTED_INIT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + NESTED_INIT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(k_wg_sz * RAJA_DIVIDE_CEILING_INT(nk, k_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, j_wg_sz), + i_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, i_wg_sz)); + sycl::range<3> wkgroup_dim(k_wg_sz, j_wg_sz, i_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (cl::sycl::handler& h) { + h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(2); + Index_type j = item.get_global_id(1); + Index_type k = item.get_global_id(0); + + if (i < ni && j < nj && k < nk) { + NESTED_INIT_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<2, RAJA::sycl_global_0, + RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<0, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, + [=] (Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n NESTED_INIT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(NESTED_INIT, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index fc64f5a0d..67d1d017b 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
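// Editorial note (not part of the patch): the i/j/k work-group macros above
// factor the 1-D tuning size into a 3-D shape; with the default
// work_group_size of 256 that is i_wg_sz = 32, j_wg_sz = 256/32 = 8,
// k_wg_sz = 1, i.e. sycl::range<3>(1, 8, 32) with the fastest-varying
// dimension last. A sketch of the same group/global sizing:

#include <sycl/sycl.hpp>
#include <cstddef>

sycl::range<3> global_dims(std::size_t ni, std::size_t nj, std::size_t nk,
                           std::size_t wg_sz /* e.g. 256 */)
{
  const std::size_t i_wg = 32, j_wg = wg_sz / i_wg, k_wg = 1;
  auto round_up = [](std::size_t n, std::size_t d) { return d * ((n + d - 1) / d); };
  // Slowest dimension (k) first, matching the Base_SYCL variant above.
  return sycl::range<3>(round_up(nk, k_wg), round_up(nj, j_wg), round_up(ni, i_wg));
}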
// @@ -29,7 +29,7 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setDefaultProblemSize(m_n_init * m_n_init * m_n_init); setDefaultReps(1000); - auto n_final = std::cbrt( getTargetProblemSize() ); + auto n_final = std::cbrt( getTargetProblemSize() ) + std::cbrt(3)-1; m_ni = n_final; m_nj = n_final; m_nk = n_final; @@ -39,7 +39,9 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * getActualProblemSize()); setUsesFeature(Kernel); @@ -63,6 +65,9 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index ccaf7079e..0c579dd3b 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -58,19 +58,25 @@ class NESTED_INIT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_array_length; diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 644d358dc..f49c53518 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
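// Editorial note on the sizing tweak above: adding cbrt(3)-1 (~0.44) before
// the implicit truncation lets near-cube targets round up instead of always
// down. Worked example (assumed, not from the patch):
//   target = 999,999 : cbrt ~ 99.99997, +0.44 -> 100.4 -> n = 100,
//                      actual size 100^3 = 1,000,000, just above the target;
//   plain truncation : n = 99, actual 99^3 = 970,299, about 3% below it.
// MAT_MAT_SHARED gets the analogous 2-D bias, sqrt(2)-1, earlier in this diff.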
// @@ -45,25 +45,29 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) auto res{getCudaResource()}; - PI_ATOMIC_DATA_SETUP; + PI_ATOMIC_GPU_DATA_SETUP; + + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); if ( vid == Base_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - pi_atomic<<>>( pi, dx, iend ); - cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RPlaunchCudaKernel( (pi_atomic), + grid_size, block_size, + shmem, res.get_stream(), + pi, + dx, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -73,22 +77,24 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); + + auto pi_atomic_lambda = [=] __device__ (Index_type i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); - cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, pi_atomic_lambda ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -98,8 +104,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -107,10 +112,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -118,6 +121,9 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) } else { getCout() << "\n PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi); + } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_ATOMIC, Cuda) diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 1db304a52..637c10156 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -1,5 +1,5 @@ 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -23,7 +23,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) -__global__ void atomic_pi(Real_ptr pi, +__global__ void pi_atomic(Real_ptr pi, Real_type dx, Index_type iend) { @@ -45,25 +45,29 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) auto res{getHipResource()}; - PI_ATOMIC_DATA_SETUP; + PI_ATOMIC_GPU_DATA_SETUP; + + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); if ( vid == Base_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((atomic_pi),grid_size, block_size, shmem, res.get_stream(), pi, dx, iend ); - hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RPlaunchHipKernel( (pi_atomic), + grid_size, block_size, + shmem, res.get_stream(), + pi, + dx, + iend ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -73,24 +77,24 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); - auto atomic_pi_lambda = [=] __device__ (Index_type i) { + auto pi_atomic_lambda = [=] __device__ (Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, atomic_pi_lambda); - hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, pi_atomic_lambda ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -100,8 +104,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -109,10 +112,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + 
RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast<Real_type>(4); } stopTimer(); @@ -120,6 +121,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) } else { getCout() << "\n PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl; } + + RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi); } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_ATOMIC, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index c031dcf32..d1f0eb784 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -99,6 +99,8 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } + PI_ATOMIC_DATA_TEARDOWN; + #else RAJA_UNUSED_VAR(vid); #endif diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 9d4f2649f..5f3fe4c82 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -78,6 +78,9 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } else { getCout() << "\n PI_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; } + + PI_ATOMIC_DATA_TEARDOWN; + } } // end namespace basic diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index 9d3864713..698361107 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -95,6 +95,8 @@ void PI_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx } + PI_ATOMIC_DATA_TEARDOWN; + } } // end namespace basic diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index af33d01fc..5a6a5bc04 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details.
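For reference, every PI_ATOMIC variant in the hunks above computes the same midpoint-rule quadrature of 4 * integral_0^1 dx/(1+x^2) = pi, adding each iterate's contribution atomically; the trailing multiply matches m_pi_final = hpi[0] * static_cast<Real_type>(4). A serial C++ reference of the numerics:

```cpp
#include <cstdio>

int main() {
  const long n = 1000000;                // iterate count; the suite uses problem size
  const double dx = 1.0 / (double)n;     // matches m_dx = 1.0 / problem size
  double pi = 0.0;
  for (long i = 0; i < n; ++i) {
    double x = ((double)i + 0.5) * dx;   // midpoint of subinterval i
    pi += dx / (1.0 + x * x);            // the atomicAdd target on GPU paths
  }
  std::printf("%.12f\n", 4.0 * pi);      // ~3.141592653590
}
```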
// @@ -28,8 +28,9 @@ PI_ATOMIC::PI_ATOMIC(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) ); setFLOPsPerRep(6 * getActualProblemSize() + 1); setUsesFeature(Forall); @@ -64,7 +65,6 @@ PI_ATOMIC::~PI_ATOMIC() void PI_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_dx = 1.0 / double(getActualProblemSize()); - allocAndInitDataConst(m_pi, 1, 0.0, vid); m_pi_init = 0.0; m_pi_final = -static_cast(vid); } @@ -77,7 +77,6 @@ void PI_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_pi, vid); } } // end namespace basic diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index fe26d9beb..26a3a7016 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,7 +25,14 @@ #define PI_ATOMIC_DATA_SETUP \ Real_type dx = m_dx; \ - Real_ptr pi = m_pi; + Real_ptr pi; \ + allocData(getReductionDataSpace(vid), pi, 1); + +#define PI_ATOMIC_DATA_TEARDOWN \ + deallocData(pi, vid); + +#define PI_ATOMIC_GPU_DATA_SETUP \ + Real_type dx = m_dx; #include "common/KernelBase.hpp" @@ -54,10 +61,12 @@ class PI_ATOMIC : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > @@ -65,10 +74,9 @@ class PI_ATOMIC : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_dx; - Real_ptr m_pi; Real_type m_pi_init; Real_type m_pi_final; }; diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index b0577bc58..8529897c3 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
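The single setBytesPerRep call is split here (and in PI_REDUCE below) into three counters: bytes read, bytes written, and bytes written through atomic read-modify-write. A small sketch of the accounting, assuming Real_type defaults to double; the struct below is an illustrative stand-in, not suite API:

```cpp
#include <cstddef>
#include <cstdio>

// Illustrative stand-in for the three per-rep byte counters.
struct BytesPerRep {
  std::size_t read, written, atomic_modify_written;
  std::size_t total() const { return read + written + atomic_modify_written; }
};

int main() {
  const std::size_t real_size = sizeof(double);  // Real_type, assumed double
  // PI_ATOMIC: no data streamed per iterate; one Real_type updated atomically.
  BytesPerRep pi_atomic{0, 0, 1 * real_size};
  // PI_REDUCE: one Real_type read and one written, no atomic traffic counted.
  BytesPerRep pi_reduce{1 * real_size, 1 * real_size, 0};
  std::printf("PI_ATOMIC %zu B/rep, PI_REDUCE %zu B/rep\n",
              pi_atomic.total(), pi_reduce.total());
}
```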
// @@ -15,6 +15,10 @@ #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -24,7 +28,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, - Real_ptr dpi, Real_type pi_init, + Real_ptr pi, Real_type pi_init, Index_type iend) { extern __shared__ Real_type ppi[ ]; @@ -45,24 +49,16 @@ __global__ void pi_reduce(Real_type dx, __syncthreads(); } -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dpi, ppi[ 0 ] ); - } -#else // this doesn't work due to data races if ( threadIdx.x == 0 ) { - *dpi += ppi[ 0 ]; + RAJA::atomicAdd( pi, ppi[ 0 ] ); } -#endif } - -template < size_t block_size > -void PI_REDUCE::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void PI_REDUCE::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -71,46 +67,113 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - Real_ptr dpi; - allocData(DataSpace::CudaDevice, dpi, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (pi_reduce), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - pi_reduce<<>>( dx, - dpi, m_pi_init, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - cudaErrchk( cudaMemcpyAsync( &m_pi, dpi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi *= 4.0; + RPlaunchCudaKernel( (pi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + dx, + pi, m_pi_init, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi = hpi[0] * static_cast(4); } stopTimer(); - deallocData(DataSpace::CudaDevice, dpi); + RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi); - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void PI_REDUCE::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum pi(m_pi_init); + RAJA::ReduceSum pi(m_pi_init); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { PI_REDUCE_BODY; }); - m_pi = 4.0 * static_cast(pi.get()); + m_pi = static_cast(4) * static_cast(pi.get()); + + } + stopTimer(); + + } else { + 
getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } +} + + +template < size_t block_size, typename MappingHelper > +void PI_REDUCE::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tpi = m_pi_init; + + RAJA::forall< exec_policy >( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] __device__ (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + + m_pi = static_cast(tpi) * 4.0; } stopTimer(); @@ -120,7 +183,122 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Cuda) +void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void PI_REDUCE::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index dd56426c2..ed2dfd8dd 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National 
Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -15,6 +15,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -24,7 +28,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, - Real_ptr dpi, Real_type pi_init, + Real_ptr pi, Real_type pi_init, Index_type iend) { HIP_DYNAMIC_SHARED(Real_type, ppi); @@ -45,24 +49,16 @@ __global__ void pi_reduce(Real_type dx, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { - RAJA::atomicAdd(RAJA::hip_atomic{}, dpi, ppi[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) i{ - *dpi += ppi[ 0 ]; + RAJA::atomicAdd( pi, ppi[ 0 ] ); } -#endif } - -template < size_t block_size > -void PI_REDUCE::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void PI_REDUCE::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -71,45 +67,72 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - Real_ptr dpi; - allocData(DataSpace::HipDevice, dpi, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (pi_reduce), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - dx, dpi, m_pi_init, iend ); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (pi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + dx, + pi, m_pi_init, + iend ); - hipErrchk( hipMemcpyAsync( &m_pi, dpi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi *= 4.0; + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi = hpi[0] * static_cast(4); } stopTimer(); - deallocData(DataSpace::HipDevice, dpi); + RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi); + + } else { + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void PI_REDUCE::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - } else if ( vid == RAJA_HIP ) { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum pi(m_pi_init); + RAJA::ReduceSum pi(m_pi_init); - RAJA::forall< 
RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { PI_REDUCE_BODY; }); - m_pi = 4.0 * static_cast(pi.get()); + m_pi = static_cast(4) * static_cast(pi.get()); } stopTimer(); @@ -119,7 +142,162 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Hip) +template < size_t block_size, typename MappingHelper > +void PI_REDUCE::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tpi = m_pi_init; + + RAJA::forall< exec_policy >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] __device__ (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } +} + +void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void PI_REDUCE::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index 44da3e5b5..5c83aba6f 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ 
b/src/basic/PI_REDUCE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,7 +18,7 @@ namespace basic { -void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -77,21 +77,47 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * pi.get(); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum pi(m_pi_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - PI_REDUCE_BODY; - }); + Real_type tpi = m_pi_init; - m_pi = 4.0 * pi.get(); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -104,8 +130,17 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void PI_REDUCE::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index f4c20a665..351580471 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
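The pi_reduce kernels in the CUDA and HIP hunks above keep only the working finish: a per-block tree reduction in dynamic shared memory followed by a single atomicAdd from thread 0 (the dead, race-prone "*dpi += ..." branch is deleted outright). A self-contained CUDA sketch of that shape, assuming nothing from the suite; note atomicAdd on double requires compute capability 6.0+:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Per-block tree reduction in dynamic shared memory, finished by one
// atomicAdd from thread 0 -- the same shape as pi_reduce/reduce3int above.
__global__ void block_sum(const double* in, double* out, long n) {
  extern __shared__ double s[];
  long i = (long)blockIdx.x * blockDim.x + threadIdx.x;
  s[threadIdx.x] = (i < n) ? in[i] : 0.0;
  for (unsigned stride = blockDim.x / 2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) { s[threadIdx.x] += s[threadIdx.x + stride]; }
  }
  if (threadIdx.x == 0) { atomicAdd(out, s[0]); }  // serialized block finish
}

int main() {
  const long n = 1 << 20;
  const int block = 256;
  double *in, *out;
  cudaMallocManaged(&in, n * sizeof(double));
  cudaMallocManaged(&out, sizeof(double));
  for (long i = 0; i < n; ++i) { in[i] = 1.0; }
  *out = 0.0;
  block_sum<<<(n + block - 1) / block, block, block * sizeof(double)>>>(
      in, out, n);
  cudaDeviceSynchronize();
  std::printf("%.1f\n", *out);  // expect 1048576.0
  cudaFree(in); cudaFree(out);
}
```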
// @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -56,21 +56,47 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * pi.get(); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum pi(m_pi_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - PI_REDUCE_BODY; - }); + Real_type tpi = m_pi_init; - m_pi = 4.0 * pi.get(); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown OMP Target tuning index = " << tune_idx << std::endl; } - stopTimer(); } else { getCout() << "\n PI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; @@ -78,6 +104,14 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } +void PI_REDUCE::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 4bd888dd0..4a5b28815 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
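The tune_idx == 1 paths added to the OpenMP and OpenMP target variants above use the value-based RAJA::expt::Reduce interface: the reduction target is an ordinary stack variable and the lambda receives a reference to a per-thread copy. A plain-OpenMP analogue of the same dataflow:

```cpp
#include <cstdio>

int main() {
  const long n = 1000000;
  const double dx = 1.0 / (double)n;
  double tpi = 0.0;                    // plays the role of tpi = m_pi_init
  // Each thread accumulates a private copy; OpenMP combines them into tpi,
  // which is what RAJA::expt::Reduce arranges behind the Real_type& parameter.
  #pragma omp parallel for reduction(+ : tpi)
  for (long i = 0; i < n; ++i) {
    double x = ((double)i + 0.5) * dx;
    tpi += dx / (1.0 + x * x);
  }
  std::printf("%.12f\n", tpi * 4.0);   // matches m_pi = tpi * 4.0
}
```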
// @@ -18,8 +18,11 @@ namespace basic { -void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -74,20 +77,45 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * pi.get(); + + } + stopTimer(); - RAJA::ReduceSum pi(m_pi_init); + } else if (tune_idx == 1) { - RAJA::forall( RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - PI_REDUCE_BODY; - }); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - m_pi = 4.0 * pi.get(); + Real_type tpi = m_pi_init; + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Seq tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -101,5 +129,13 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx } +void PI_REDUCE::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Sycl.cpp b/src/basic/PI_REDUCE-Sycl.cpp new file mode 100644 index 000000000..c95e29583 --- /dev/null +++ b/src/basic/PI_REDUCE-Sycl.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include +#include +#include +#include + + +namespace rajaperf +{ +namespace basic +{ + + +template < size_t work_group_size > +void PI_REDUCE::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + PI_REDUCE_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + Real_ptr pi; + allocAndInitSyclDeviceData(pi, &m_pi_init, 1, qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(pi, &m_pi_init, 1, qu); + + qu->submit([&] (sycl::handler& hdl) { + + auto sum_reduction = sycl::reduction(pi, sycl::plus<>()); + + hdl.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sum_reduction, + [=] (sycl::nd_item<1> item, auto& pi) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + PI_REDUCE_BODY; + } + + }); + }); + + Real_type lpi; + Real_ptr plpi = &lpi; + getSyclDeviceData(plpi, pi, 1, qu); + m_pi = 4.0 * lpi; + + } + stopTimer(); + + deallocSyclDeviceData(pi, qu); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tpi = m_pi_init; + + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); + + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 84c38ce67..a258ae8cd 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
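In the Base_SYCL path above, sycl::reduction packages the device pointer and combiner, and the kernel combines into the reducer argument; by default the pre-existing value at the pointer participates in the result, which is why the loop re-initializes pi every rep. A minimal standalone SYCL 2020 version of the same sum, with illustrative sizes and USM in place of the suite's data utilities:

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;
  const size_t n = 1000000, wg = 256;
  const size_t global = wg * ((n + wg - 1) / wg);  // round up, guard below
  const double dx = 1.0 / (double)n;
  double* pi = sycl::malloc_shared<double>(1, q);
  *pi = 0.0;                                       // identity for the sum
  q.submit([&](sycl::handler& h) {
    auto sum = sycl::reduction(pi, sycl::plus<>());
    h.parallel_for(sycl::nd_range<1>(global, wg), sum,
                   [=](sycl::nd_item<1> item, auto& acc) {
      size_t i = item.get_global_id(0);
      if (i < n) {                                 // mask the overhang
        double x = ((double)i + 0.5) * dx;
        acc += dx / (1.0 + x * x);
      }
    });
  }).wait();
  std::printf("%.12f\n", 4.0 * (*pi));
  sycl::free(pi, q);
}
```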
// @@ -28,8 +28,9 @@ PI_REDUCE::PI_REDUCE(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(6 * getActualProblemSize() + 1); setUsesFeature(Forall); @@ -51,6 +52,9 @@ PI_REDUCE::PI_REDUCE(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } PI_REDUCE::~PI_REDUCE() diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 49fca096d..ca6860350 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -56,17 +56,35 @@ class PI_REDUCE : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_dx; Real_type m_pi; diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 6843dcab3..a8d68b31c 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
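The new header declarations parameterize each GPU variant on MappingHelper/AlgorithmHelper tags, which the .cpp files resolve to execution and reduction policies via std::conditional_t. A host-only sketch of that selection mechanism; the tag and policy names below are illustrative stand-ins for the suite's gpu_mapping and gpu_algorithm helpers, not its actual types:

```cpp
#include <cstdio>
#include <type_traits>

// Hypothetical tags: each carries a compile-time flag, like the suite's
// mapping helpers distinguish direct from occupancy-calculated launches.
struct direct_helper    { static constexpr bool direct = true;  };
struct occupancy_helper { static constexpr bool direct = false; };

struct exec_direct   { static constexpr const char* name = "direct"; };
struct exec_occ_calc { static constexpr const char* name = "occ_calc"; };

// std::conditional_t picks the policy type from the tag's flag.
template <typename MappingHelper>
using exec_policy = std::conditional_t<MappingHelper::direct,
                                       exec_direct, exec_occ_calc>;

int main() {
  std::printf("%s %s\n", exec_policy<direct_helper>::name,
                         exec_policy<occupancy_helper>::name);
}
```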
// @@ -15,6 +15,10 @@ #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -56,28 +60,18 @@ __global__ void reduce3int(Int_ptr vec, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( vsum, psum[ 0 ] ); RAJA::atomicMin( vmin, pmin[ 0 ] ); RAJA::atomicMax( vmax, pmax[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif } - -template < size_t block_size > -void REDUCE3_INT::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE3_INT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -86,55 +80,74 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - Int_ptr vmem_init; - allocData(DataSpace::CudaPinned, vmem_init, 3); + RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3, 1); - Int_ptr vmem; - allocData(DataSpace::CudaDevice, vmem, 3); + constexpr size_t shmem = 3*sizeof(Int_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce3int), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - vmem_init[0] = m_vsum_init; - vmem_init[1] = m_vmin_init; - vmem_init[2] = m_vmax_init; - cudaErrchk( cudaMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - reduce3int<<>>(vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); - cudaErrchk( cudaGetLastError() ); - - Int_type lmem[3]; - cudaErrchk( cudaMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_vsum += lmem[0]; - m_vmin = RAJA_MIN(m_vmin, lmem[1]); - m_vmax = RAJA_MAX(m_vmax, lmem[2]); + Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; + RAJAPERF_CUDA_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + RPlaunchCudaKernel( (reduce3int), + grid_size, block_size, + shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(vmem, hvmem, 3, 1); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); - deallocData(DataSpace::CudaDevice, vmem); - deallocData(DataSpace::CudaPinned, vmem_init); + RAJAPERF_CUDA_REDUCER_TEARDOWN(vmem, hvmem); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE3_INT::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; - } else if ( vid == RAJA_CUDA ) { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + 
REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE3_INT_BODY_RAJA; }); @@ -151,7 +164,168 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Cuda) +template < size_t block_size, typename MappingHelper > +void REDUCE3_INT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=] __device__ (Index_type i, + Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + 
decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index bd524565a..12d172de7 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -15,6 +15,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -56,28 +60,18 @@ __global__ void reduce3int(Int_ptr vec, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( vsum, psum[ 0 ] ); RAJA::atomicMin( vmin, pmin[ 0 ] ); RAJA::atomicMax( vmax, pmax[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif } - -template < size_t block_size > -void REDUCE3_INT::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE3_INT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -86,55 +80,74 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - Int_ptr vmem_init; - allocData(DataSpace::HipPinned, vmem_init, 3); + RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3, 1); - Int_ptr vmem; - allocData(DataSpace::HipDevice, vmem, 3); + constexpr size_t shmem = 3*sizeof(Int_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce3int), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - vmem_init[0] = m_vsum_init; - vmem_init[1] = m_vmin_init; - vmem_init[2] = m_vmax_init; - hipErrchk( hipMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); - hipErrchk( hipGetLastError() ); - - Int_type lmem[3]; - hipErrchk( hipMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_vsum += lmem[0]; - m_vmin = RAJA_MIN(m_vmin, lmem[1]); - m_vmax = RAJA_MAX(m_vmax, lmem[2]); + Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; + RAJAPERF_HIP_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + RPlaunchHipKernel( 
(reduce3int), + grid_size, block_size, + shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(vmem, hvmem, 3, 1); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); - deallocData(DataSpace::HipDevice, vmem); - deallocData(DataSpace::HipPinned, vmem_init); + RAJAPERF_HIP_REDUCER_TEARDOWN(vmem, hvmem); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE3_INT::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - } else if ( vid == RAJA_HIP ) { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall< RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE3_INT_BODY_RAJA; }); @@ -151,7 +164,166 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Hip) +template < size_t block_size, typename MappingHelper > +void REDUCE3_INT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=] __device__ (Index_type i, + Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + 
runHipVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index 5428d6087..c9848ac98 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
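runCudaVariant/runHipVariant and their setCudaTuningDefinitions/setHipTuningDefinitions counterparts above walk the same nested seq_for loops, so the flat counter t enumerates tunings in exactly the order their names are registered. A compact sketch of that counter-based dispatch; the names and candidate lists are illustrative only:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Illustrative stand-in for the seq_for enumeration: candidates are visited
// in a fixed nested order and a flat counter t picks which one runs.
// Registering tuning names in the same traversal order keeps tune_idx and
// tuning names in sync, which is the invariant the diff relies on.
int main() {
  std::vector<int> block_sizes{128, 256, 512};
  std::vector<std::string> mappings{"direct", "occupancy"};  // hypothetical
  const size_t tune_idx = 3;  // requested tuning
  size_t t = 0;
  for (int bs : block_sizes) {
    for (const auto& map : mappings) {
      if (t == tune_idx) {
        std::printf("run: %s_%d\n", map.c_str(), bs);  // -> "occupancy_256"
      }
      t += 1;
    }
  }
}
```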
// @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -91,24 +91,56 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -121,8 +153,17 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void REDUCE3_INT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index 0d261edec..d92d37667 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
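REDUCE3_INT carries three reductions through a single traversal; in the new-interface paths above these are the three RAJA::expt::Reduce arguments, while base OpenMP expresses them as three reduction clauses. A compact standalone analogue:

```cpp
#include <climits>
#include <cstdio>

int main() {
  const long n = 100;
  int vec[100];
  for (long i = 0; i < n; ++i) { vec[i] = (int)(i % 7) - 3; }
  int vsum = 0, vmin = INT_MAX, vmax = INT_MIN;  // m_*_init stand-ins
  // One pass, three combiners -- the same shape as the three
  // RAJA::expt::Reduce arguments in the hunks above.
  #pragma omp parallel for reduction(+ : vsum) \
                           reduction(min : vmin) reduction(max : vmax)
  for (long i = 0; i < n; ++i) {
    vsum += vec[i];
    if (vec[i] < vmin) { vmin = vec[i]; }
    if (vec[i] > vmax) { vmax = vec[i]; }
  }
  std::printf("sum=%d min=%d max=%d\n", vsum, vmin, vmax);  // -5, -3, 3
}
```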
// @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,31 +62,70 @@ void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else if (tune_idx == 1) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown OMP Target tuning index = " << tune_idx << std::endl; } - stopTimer(); } else { getCout() << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; } } +void REDUCE3_INT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index f204bd345..32bcfbef6 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
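The Base_OpenMPTarget path of this kernel (only partially visible in the hunk above) relies on OpenMP's combined target reduction rather than RAJA reducers. A minimal sketch of that offload pattern, assuming x already lives on the default device (the suite passes such pointers with is_device_ptr):

    // Sketch of the base offload-reduction pattern, not the suite's kernel.
    double device_sum(const double* x, long n)
    {
      double sum = 0.0;
      #pragma omp target teams distribute parallel for \
              is_device_ptr(x) map(tofrom: sum) reduction(+:sum)
      for (long i = 0; i < n; ++i) {
        sum += x[i];
      }
      return sum;
    }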
// @@ -19,8 +19,11 @@ namespace basic { -void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -84,24 +87,56 @@ void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + } + stopTimer(); + } else { + getCout() << "\n REDUCE3_INT : Unknown Seq tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -115,5 +150,13 @@ void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i } +void REDUCE3_INT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp new file mode 100644 index 000000000..58ac6f082 --- /dev/null +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -0,0 +1,135 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
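Each backend file now ends with a set*TuningDefinitions method registering "default" and, for RAJA variants, "new". The tune_idx values dispatched on above are positions in that registration list, so the two must stay in lock step. A schematic illustration with hypothetical types, not the suite's KernelBase machinery:

    #include <string>
    #include <vector>

    struct KernelStub {
      std::vector<std::string> tunings;

      void setTuningDefinitions(bool is_raja_variant) {
        tunings.push_back("default");      // dispatched as tune_idx == 0
        if (is_raja_variant) {
          tunings.push_back("new");        // dispatched as tune_idx == 1
        }
      }

      void runVariant(size_t tune_idx) {
        if (tune_idx == 0)      { /* classic reducer path */ }
        else if (tune_idx == 1) { /* RAJA::expt::Reduce path */ }
      }
    };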
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "REDUCE3_INT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+
+template < size_t work_group_size >
+void REDUCE3_INT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  REDUCE3_INT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    Int_ptr hsum;
+    allocAndInitSyclDeviceData(hsum, &m_vsum_init, 1, qu);
+    Int_ptr hmin;
+    allocAndInitSyclDeviceData(hmin, &m_vmin_init, 1, qu);
+    Int_ptr hmax;
+    allocAndInitSyclDeviceData(hmax, &m_vmax_init, 1, qu);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      initSyclDeviceData(hsum, &m_vsum_init, 1, qu);
+      initSyclDeviceData(hmin, &m_vmin_init, 1, qu);
+      initSyclDeviceData(hmax, &m_vmax_init, 1, qu);
+
+      qu->submit([&] (sycl::handler& h) {
+
+        auto sum_reduction = sycl::reduction(hsum, sycl::plus<>());
+        auto min_reduction = sycl::reduction(hmin, sycl::minimum<>());
+        auto max_reduction = sycl::reduction(hmax, sycl::maximum<>());
+
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       sum_reduction, min_reduction, max_reduction,
+                       [=] (sycl::nd_item<1> item, auto& vsum, auto& vmin, auto& vmax) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            // REDUCE3_INT_BODY
+            vsum += vec[i];
+            vmin.combine(vec[i]);
+            vmax.combine(vec[i]);
+          }
+
+        });
+      });
+
+      Int_type lsum;
+      Int_ptr plsum = &lsum;
+      getSyclDeviceData(plsum, hsum, 1, qu);
+      m_vsum += lsum;
+
+      Int_type lmin;
+      Int_ptr plmin = &lmin;
+      getSyclDeviceData(plmin, hmin, 1, qu);
+      m_vmin = RAJA_MIN(m_vmin, lmin);
+
+      Int_type lmax;
+      Int_ptr plmax = &lmax;
+      getSyclDeviceData(plmax, hmax, 1, qu);
+      m_vmax = RAJA_MAX(m_vmax, lmax);
+
+    } // for (RepIndex_type irep = ...
+ stopTimer(); + + deallocSyclDeviceData(hsum, qu); + deallocSyclDeviceData(hmin, qu); + deallocSyclDeviceData(hmax, qu); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=] (Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + std::cout << "\n REDUCE3_INT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 975bf8f24..3be262b77 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -33,8 +33,10 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Int_type) + 3*sizeof(Int_type)) + - (0*sizeof(Int_type) + 1*sizeof(Int_type)) * getActualProblemSize() ); + setBytesReadPerRep( 3*sizeof(Int_type) + + 1*sizeof(Int_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Int_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize() + 1); setUsesFeature(Forall); @@ -57,6 +59,9 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index e82c2cf05..a3719a845 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
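The Base_SYCL variant above builds one kernel that feeds three sycl::reduction objects. A standalone sketch of the same pattern with USM device memory (the names and the plain int payload are illustrative, not the suite's types):

    #include <sycl/sycl.hpp>
    #include <limits>

    void reduce3(sycl::queue& q, const int* vec, size_t n,
                 int& vsum, int& vmin, int& vmax)
    {
      int* dsum = sycl::malloc_device<int>(1, q);
      int* dmin = sycl::malloc_device<int>(1, q);
      int* dmax = sycl::malloc_device<int>(1, q);
      int init_sum = 0;
      int init_min = std::numeric_limits<int>::max();
      int init_max = std::numeric_limits<int>::min();
      q.memcpy(dsum, &init_sum, sizeof(int));
      q.memcpy(dmin, &init_min, sizeof(int));
      q.memcpy(dmax, &init_max, sizeof(int)).wait();

      q.submit([&](sycl::handler& h) {
        auto sum_r = sycl::reduction(dsum, sycl::plus<>());
        auto min_r = sycl::reduction(dmin, sycl::minimum<>());
        auto max_r = sycl::reduction(dmax, sycl::maximum<>());
        h.parallel_for(sycl::range<1>(n), sum_r, min_r, max_r,
          [=](sycl::item<1> it, auto& s, auto& mn, auto& mx) {
            size_t i = it.get_linear_id();
            s += vec[i];          // plus reducer supports +=
            mn.combine(vec[i]);   // min/max reducers use combine()
            mx.combine(vec[i]);
        });
      }).wait();

      q.memcpy(&vsum, dsum, sizeof(int));
      q.memcpy(&vmin, dmin, sizeof(int));
      q.memcpy(&vmax, dmax, sizeof(int)).wait();
      sycl::free(dsum, q);
      sycl::free(dmin, q);
      sycl::free(dmax, q);
    }

Unlike the suite's nd_range version, this uses a plain range, so no manual bounds guard is needed inside the kernel.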
// @@ -70,18 +70,37 @@ class REDUCE3_INT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Int_ptr m_vec; Int_type m_vsum; diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 2961af4cc..898b453f0 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
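REDUCE3_INT.hpp above replaces the single runCudaVariantImpl/runHipVariantImpl templates with one method per (block size, algorithm helper, mapping helper) combination, instantiated by walking a compile-time list with seq_for. A minimal stand-in for that idiom (the suite uses camp type lists, so this index_sequence version is only an approximation):

    #include <cstddef>
    #include <cstdio>
    #include <utility>

    // Walk a compile-time list of block sizes; each call can instantiate a
    // differently-templated kernel variant.
    template <size_t... Sizes, typename F>
    void for_each_block_size(std::index_sequence<Sizes...>, F&& f)
    {
      (f(std::integral_constant<size_t, Sizes>{}), ...);  // C++17 fold
    }

    template <size_t BlockSize>
    void run_variant() { std::printf("instantiated for %zu\n", BlockSize); }

    int main()
    {
      for_each_block_size(std::index_sequence<128, 256, 512>{},
        [](auto bs) { run_variant<decltype(bs)::value>(); });
    }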
// @@ -15,6 +15,10 @@ #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -83,7 +87,6 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, __syncthreads(); } -// serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( xsum, pxsum[ 0 ] ); RAJA::atomicMin( xmin, pxmin[ 0 ] ); @@ -95,11 +98,10 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } -template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -108,64 +110,90 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocData(DataSpace::CudaDevice, mem,6); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6, 1); + + constexpr size_t shmem = 6*sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce_struct), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - - reduce_struct<<>>( - points.x, points.y, - mem, mem+1, mem+2, // xcenter,xmin,xmax - mem+3, mem+4, mem+5, // ycenter,ymin,ymax - m_init_sum, m_init_min, m_init_max, - points.N); - cudaErrchk( cudaGetLastError() ); - - Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - cudaErrchk( cudaMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - - points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); - points.SetXMin(lmem[1]); - points.SetXMax(lmem[2]); - points.SetYMin(lmem[4]); - points.SetYMax(lmem[5]); + Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; + RAJAPERF_CUDA_REDUCER_INITIALIZE(imem, mem, hmem, 6, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + RPlaunchCudaKernel( (reduce_struct), + grid_size, block_size, + shmem, res.get_stream(), + points.x, points.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, + points.N ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(mem, hmem, 6, 1); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); m_points=points; } stopTimer(); - deallocData(DataSpace::CudaDevice, mem); + RAJAPERF_CUDA_REDUCER_TEARDOWN(mem, hmem); - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_STRUCT::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; 
+ + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); points.SetCenter((xsum.get()/(points.N)), (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); + points.SetXMin((xmin.get())); points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); + points.SetYMin((ymin.get())); points.SetYMax((ymax.get())); m_points=points; @@ -178,7 +206,181 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_STRUCT, Cuda) +template < size_t block_size, typename MappingHelper > +void REDUCE_STRUCT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=] __device__ (Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + +void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n REDUCE_STRUCT : Unknown Cuda variant id = " << vid 
<< std::endl; + + } + +} + +void REDUCE_STRUCT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 236e3e7f2..17fe5ad83 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
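runCudaVariantBase above caps the launch at a maximum grid size obtained from RAJAPERF_CUDA_GET_MAX_BLOCKS. That macro is not part of this diff, so the following is only a sketch of the occupancy-style calculation such a cap typically performs with the CUDA runtime API:

    #include <cuda_runtime.h>
    #include <algorithm>

    // Assumed shape of an occupancy-based grid cap; the real macro's details
    // may differ.
    template <typename Kernel>
    size_t occupancy_grid_cap(Kernel k, int block_size, size_t shmem)
    {
      int dev = 0;
      cudaGetDevice(&dev);
      int num_sms = 0;
      cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev);
      int blocks_per_sm = 0;
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, k,
                                                    block_size, shmem);
      return static_cast<size_t>(num_sms) * blocks_per_sm;
    }

    // Usage mirrors the hunk above:
    //   grid_size = std::min(RAJA_DIVIDE_CEILING_INT(iend, block_size),
    //                        occupancy_grid_cap(kernel, block_size, shmem));

Keeping the grid within the occupancy limit is what makes the MappingHelper distinction meaningful: a capped grid implies each block processes multiple chunks of the iteration space rather than mapping one thread per element.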
// @@ -15,6 +15,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -83,7 +87,6 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, __syncthreads(); } -// serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( xsum, pxsum[ 0 ] ); RAJA::atomicMin( xmin, pxmin[ 0 ] ); @@ -95,12 +98,10 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } - -template < size_t block_size > -void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void REDUCE_STRUCT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -109,68 +110,92 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocData(DataSpace::HipDevice, mem,6); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6, 1); + + constexpr size_t shmem = 6*sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce_struct), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); + Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; + RAJAPERF_HIP_REDUCER_INITIALIZE(imem, mem, hmem, 6, 1); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 6*sizeof(Real_type)*block_size; + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL((reduce_struct), - dim3(grid_size), dim3(block_size), + RPlaunchHipKernel( (reduce_struct), + grid_size, block_size, shmem, res.get_stream(), - points.x, points.y, + points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, - points.N); - hipErrchk( hipGetLastError() ); - - Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - hipErrchk( hipMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + points.N ); - points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); - points.SetXMin(lmem[1]); - points.SetXMax(lmem[2]); - points.SetYMin(lmem[4]); - points.SetYMax(lmem[5]); - m_points=points; + RAJAPERF_HIP_REDUCER_COPY_BACK(mem, hmem, 6, 1); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); + m_points = points; } stopTimer(); - deallocData(DataSpace::HipDevice, mem); + RAJAPERF_HIP_REDUCER_TEARDOWN(mem, hmem); - } else if ( vid == RAJA_HIP ) { + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + } + +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_STRUCT::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); - RAJA::forall< RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); points.SetCenter((xsum.get()/(points.N)), (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); + points.SetXMin((xmin.get())); points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); + points.SetYMin((ymin.get())); points.SetYMax((ymax.get())); - m_points=points; + m_points = points; } stopTimer(); @@ -181,7 +206,179 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_STRUCT, Hip) +template < size_t block_size, typename MappingHelper > +void REDUCE_STRUCT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=] __device__ (Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown HIP variant id = " << vid << std::endl; + } + +} + +void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void 
REDUCE_STRUCT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 7ac22faa2..8c44d02c0 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -19,7 +19,7 @@ namespace basic { -void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -55,7 +55,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -100,7 +100,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -110,31 +110,75 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + 
points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points = points; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + } else { + getCout() << "\n REDUCE_STRUCT : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -147,8 +191,17 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void REDUCE_STRUCT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index cfbcba44a..594c62ccb 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
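The tune_idx == 1 path above passes six RAJA::expt::Reduce arguments to one forall; the lambda's reference parameters bind to them positionally, after the loop index. A reduced sketch with two values (plain doubles, not the suite's Real_type setup):

    #include "RAJA/RAJA.hpp"

    void min_and_max(const double* x, int n, double& out_min, double& out_max)
    {
      double tmin = x[0];
      double tmax = x[0];
      RAJA::forall<RAJA::omp_parallel_for_exec>(
        RAJA::RangeSegment(0, n),
        RAJA::expt::Reduce<RAJA::operators::minimum>(&tmin),  // binds to mn
        RAJA::expt::Reduce<RAJA::operators::maximum>(&tmax),  // binds to mx
        [=](RAJA::Index_type i, double& mn, double& mx) {
          mn = RAJA_MIN(mn, x[i]);
          mx = RAJA_MAX(mx, x[i]);
      });
      out_min = tmin;
      out_max = tmax;
    }

Reordering the Reduce arguments without reordering the lambda parameters would silently swap the reductions, which is why the hunks above keep the two lists visually aligned.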
// @@ -27,8 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, - size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -36,80 +35,142 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, REDUCE_STRUCT_DATA_SETUP; - if ( vid == Base_OpenMPTarget ) { + switch ( vid ) { - Real_ptr xa = points.x; - Real_ptr ya = points.y; + case Base_OpenMPTarget : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Real_ptr xa = points.x; + Real_ptr ya = points.y; - Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; - Real_type xmin = m_init_min; Real_type ymin = m_init_min; - Real_type xmax = m_init_max; Real_type ymax = m_init_max; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp target is_device_ptr(xa, ya) device( did ) map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \ + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; + + #pragma omp target is_device_ptr(xa, ya) device( did ) \ + map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax) + #pragma omp teams distribute parallel for \ + thread_limit(threads_per_team) schedule(static,1) \ reduction(+:xsum) \ reduction(min:xmin) \ reduction(max:xmax), \ reduction(+:ysum), \ reduction(min:ymin), \ reduction(max:ymax) - for (Index_type i = ibegin; i < iend; ++i ) { - xsum += xa[i] ; - xmin = RAJA_MIN(xmin, xa[i]) ; - xmax = RAJA_MAX(xmax, xa[i]) ; - ysum += ya[i] ; - ymin = RAJA_MIN(ymin, ya[i]) ; - ymax = RAJA_MAX(ymax, ya[i]) ; - } + for (Index_type i = ibegin; i < iend; ++i ) { + xsum += xa[i] ; + xmin = RAJA_MIN(xmin, xa[i]) ; + xmax = RAJA_MAX(xmax, xa[i]) ; + ysum += ya[i] ; + ymin = RAJA_MIN(ymin, ya[i]) ; + ymax = RAJA_MAX(ymax, ya[i]) ; + } + + points.SetCenter(xsum/points.N, ysum/points.N); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points = points; - points.SetCenter(xsum/points.N, ysum/points.N); - points.SetXMin(xmin); - points.SetXMax(xmax); - points.SetYMin(ymin); - points.SetYMax(ymax); - m_points=points; + } + stopTimer(); + break; } - stopTimer(); - - } else if ( vid == RAJA_OpenMPTarget ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter(xsum.get()/(points.N), - ysum.get()/(points.N)); - points.SetXMin(xmin.get()); - points.SetXMax(xmax.get()); - points.SetYMin(ymin.get()); - points.SetYMax(ymax.get()); - m_points=points; + case RAJA_OpenMPTarget : { + + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + 
[=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); + m_points = points; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown OMP Target tuning index = " << tune_idx << std::endl; + } + + break; } - stopTimer(); - } else { + default: getCout() << "\n REDUCE_STRUCT : Unknown OMP Target variant id = " << vid << std::endl; } } +void REDUCE_STRUCT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 377b19b84..1e2a68d43 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
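The Base_OpenMPTarget hunk above stacks six reduction clauses on one combined construct; OpenMP allows mixing +, min, and max reductions this way on a single loop. A minimal host-side sketch of the clause combination (not the target version):

    #include <algorithm>

    void stats(const double* x, long n, double& sum, double& mn, double& mx)
    {
      sum = 0.0;
      mn = x[0];
      mx = x[0];
      #pragma omp parallel for reduction(+:sum) \
              reduction(min:mn) reduction(max:mx)
      for (long i = 0; i < n; ++i) {
        sum += x[i];
        mn = std::min(mn, x[i]);
        mx = std::max(mx, x[i]);
      }
    }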
// @@ -19,8 +19,11 @@ namespace basic { -void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -47,7 +50,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -87,7 +90,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -97,31 +100,75 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); + m_points = points; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter(xsum.get()/(points.N), - ysum.get()/(points.N)); - points.SetXMin(xmin.get()); - points.SetXMax(xmax.get()); - points.SetYMin(ymin.get()); - points.SetYMax(ymax.get()); - m_points=points; + } + stopTimer(); + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Seq tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -132,7 +179,14 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune } } +} +void REDUCE_STRUCT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + 
addVariantTuningName(vid, "new"); + } } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index f18319eb2..764e82f67 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -33,7 +33,10 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( 6*sizeof(Real_type) + 2*sizeof(Real_type)*getActualProblemSize()); + setBytesReadPerRep( 6*sizeof(Real_type) + + 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 6*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize() + 2); diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 425e7796e..658d9eae4 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -87,15 +87,28 @@ class REDUCE_STRUCT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); struct PointsType { - Int_type N; + Index_type N; Real_ptr x, y; Real_ptr GetCenter(){return ¢er[0];}; @@ -118,7 +131,7 @@ class REDUCE_STRUCT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; Real_type m_init_sum; Real_type m_init_min; diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index f4de3cf6a..e58e86923 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 
2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,30 +12,21 @@ #if defined(RAJA_ENABLE_CUDA) +#include "TRAP_INT-func.hpp" + #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -RAJA_DEVICE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - template < size_t block_size > __launch_bounds__(block_size) @@ -64,25 +55,16 @@ __global__ void trapint(Real_type x0, Real_type xp, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( sumx, psumx[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - } - -template < size_t block_size > -void TRAP_INT::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void TRAP_INT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -91,44 +73,69 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - Real_ptr sumx; - allocData(DataSpace::CudaDevice, sumx, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (trapint), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - trapint<<>>(x0, xp, - y, yp, - h, - sumx, - iend); - cudaErrchk( cudaGetLastError() ); - - Real_type lsumx; - cudaErrchk( cudaMemcpyAsync( &lsumx, sumx, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_sumx += lsumx * h; + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1, 1); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + RPlaunchCudaKernel( (trapint), + grid_size, block_size, + shmem, res.get_stream(), + x0, xp, + y, yp, + h, + sumx, + iend); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(sumx, hsumx, 1, 1); + m_sumx += hsumx[0] * h; } stopTimer(); - deallocData(DataSpace::CudaDevice, sumx); + RAJAPERF_CUDA_REDUCER_TEARDOWN(sumx, hsumx); + + } else { + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void TRAP_INT::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; - } else if ( vid == RAJA_CUDA ) { + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
RAJA::ReduceSum sumx(m_sumx_init); + RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { TRAP_INT_BODY; }); @@ -143,7 +150,162 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Cuda) +template < size_t block_size, typename MappingHelper > +void TRAP_INT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] __device__ (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void TRAP_INT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 1b5f4b2be..e60b3ccff 100644 --- 
a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,30 +12,21 @@ #if defined(RAJA_ENABLE_HIP) +#include "TRAP_INT-func.hpp" + #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -RAJA_DEVICE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - template < size_t block_size > __launch_bounds__(block_size) @@ -64,25 +55,16 @@ __global__ void trapint(Real_type x0, Real_type xp, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( sumx, psumx[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - } - -template < size_t block_size > -void TRAP_INT::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void TRAP_INT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -91,43 +73,69 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - Real_ptr sumx; - allocData(DataSpace::HipDevice, sumx, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (trapint), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1, 1); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x0, xp, - y, yp, - h, - sumx, - iend); - hipErrchk( hipGetLastError() ); + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - Real_type lsumx; - hipErrchk( hipMemcpyAsync( &lsumx, sumx, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_sumx += lsumx * h; + RPlaunchHipKernel( (trapint), + grid_size, block_size, + shmem, res.get_stream(), + x0, xp, + y, yp, + h, + sumx, + iend); + + RAJAPERF_HIP_REDUCER_COPY_BACK(sumx, hsumx, 1, 1); + m_sumx += hsumx[0] * h; } stopTimer(); - deallocData(DataSpace::HipDevice, sumx); + RAJAPERF_HIP_REDUCER_TEARDOWN(sumx, hsumx); + + } else { + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void TRAP_INT::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = std::conditional_t; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - } else if ( vid == RAJA_HIP ) { + 
const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum<RAJA::hip_reduce, Real_type> sumx(m_sumx_init); + RAJA::ReduceSum<reduction_policy, Real_type> sumx(m_sumx_init); - RAJA::forall< RAJA::hip_exec<block_size, true /*async*/> >( res, + RAJA::forall<exec_policy>( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { TRAP_INT_BODY; }); @@ -142,7 +150,161 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Hip) +template < size_t block_size, typename MappingHelper > +void TRAP_INT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t<MappingHelper::direct, + RAJA::hip_exec<block_size, true /*async*/>, + RAJA::hip_exec_occ_calc<block_size, true /*async*/>>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall<exec_policy>( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce<RAJA::operators::plus>(&tsumx), + [=] __device__ (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast<Real_type>(tsumx) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase<block_size, decltype(mapping_helper)>(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA<block_size, decltype(algorithm_helper), decltype(mapping_helper)>(vid); + + } + + t += 1; + + }); + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce<block_size, decltype(mapping_helper)>(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void TRAP_INT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} } // end namespace basic } //
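Reviewer note: runCudaVariant and runHipVariant above map a flat tune_idx onto a compile-time cross product (block sizes x mapping helpers x algorithm helpers) by counting with a runtime t. A dependency-free sketch of that dispatch pattern; seq_for_values, run_tuning, and the value lists are illustrative stand-ins for the suite's seq_for and helper types:

#include <cstddef>
#include <cstdio>
#include <utility>

// Call f once per compile-time value, like RAJAPerf's seq_for.
template <typename F, std::size_t... Vs>
void seq_for_values(F&& f, std::integer_sequence<std::size_t, Vs...>)
{
  (f(std::integral_constant<std::size_t, Vs>{}), ...);
}

template <std::size_t BlockSize, bool Direct>
void run_tuning()
{
  std::printf("block_size=%zu direct=%d\n", BlockSize, int(Direct));
}

// A flat tune_idx walks the cross product in a fixed order, mirroring how
// the run*Variant functions above count with t before each templated call.
void dispatch(std::size_t tune_idx)
{
  std::size_t t = 0;
  seq_for_values([&](auto bs) {
    seq_for_values([&](auto direct) {
      if (t == tune_idx) {
        run_tuning<decltype(bs)::value, decltype(direct)::value != 0>();
      }
      t += 1;
    }, std::integer_sequence<std::size_t, 0, 1>{});        // occupancy, direct
  }, std::integer_sequence<std::size_t, 128, 256, 512>{}); // block sizes
}

int main() { dispatch(3); return 0; }

Here dispatch(3) selects block size 256 with the second mapping option, the same way setBlockSize plus a templated run*Variant call is reached above.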
end namespace rajaperf diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index dadaa5baa..f1961483a 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "TRAP_INT-func.hpp" + #include <iostream> namespace rajaperf @@ -17,22 +19,8 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - -void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -91,20 +79,46 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum<RAJA::omp_reduce, Real_type> sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall<RAJA::omp_parallel_for_exec>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum<RAJA::omp_reduce, Real_type> sumx(m_sumx_init); - m_sumx += static_cast<Real_type>(sumx.get()) * h; + RAJA::forall<RAJA::omp_parallel_for_exec>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast<Real_type>(sumx.get()) * h; + + } + stopTimer(); + + } else if (tune_idx == 1) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall<RAJA::omp_parallel_for_exec>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce<RAJA::operators::plus>(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast<Real_type>(tsumx) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -117,8 +131,17 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void TRAP_INT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index b9bdcd6a6..2c5e4cf56 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,6 +12,8 @@ #if defined(RAJA_ENABLE_TARGET_OPENMP) +#include "TRAP_INT-func.hpp" + #include "common/OpenMPTargetDataUtils.hpp" #include <iostream> @@ -21,27 +23,13 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop.
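Reviewer note: the RAJA_OpenMP case above now times two tunings, tune_idx 0 with the ReduceSum object and tune_idx 1 with RAJA::expt::Reduce. For reference, the Base_OpenMP variant they are compared against is the plain OpenMP reduction idiom; a self-contained sketch of the same shape, with an illustrative integrand (this one converges to pi) rather than the kernel's trap_int_func:

#include <cstdio>

int main()
{
  const long n = 1000000;
  const double h = 1.0 / n;
  double sumx = 0.0;

  // Same structure as the Base_OpenMP loop: private partial sums are
  // combined by the reduction clause, then scaled by h once at the end.
  #pragma omp parallel for reduction(+:sumx)
  for (long i = 0; i < n; ++i) {
    double x = (i + 0.5) * h;
    sumx += 4.0 / (1.0 + x * x);
  }

  std::printf("pi ~= %.12f\n", sumx * h);
  return 0;
}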
-// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - // // Define threads per team for target execution // const size_t threads_per_team = 256; -void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -58,7 +46,8 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( Real_type sumx = m_sumx_init; - #pragma omp target teams distribute parallel for map(tofrom: sumx) reduction(+:sumx) \ + #pragma omp target teams distribute parallel for \ + map(tofrom: sumx) reduction(+:sumx) \ thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { @@ -74,23 +63,57 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; - m_sumx += static_cast(sumx.get()) * h; + } + stopTimer(); + } else { + getCout() << "\n TRAP_INT : Unknown OMP Target tuning index = " << tune_idx << std::endl; } - stopTimer(); } else { - getCout() << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +void TRAP_INT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); } } diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index 9b1264b4d..fa74efdcf 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "TRAP_INT-func.hpp" + #include namespace rajaperf @@ -17,23 +19,12 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. 
-// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - -void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -88,20 +79,46 @@ void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; - m_sumx += static_cast(sumx.get()) * h; + } + stopTimer(); + } else { + getCout() << "\n TRAP_INT : Unknown Seq tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -115,5 +132,13 @@ void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) } +void TRAP_INT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp new file mode 100644 index 000000000..a9795c77e --- /dev/null +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "TRAP_INT-func.hpp" + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +template +void TRAP_INT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + TRAP_INT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + Real_ptr sumx; + allocAndInitSyclDeviceData(sumx, &m_sumx_init, 1, qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(sumx, &m_sumx_init, 1, qu); + + qu->submit([&] (sycl::handler& hdl) { + + auto sum_reduction = sycl::reduction(sumx, sycl::plus<>()); + + hdl.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sum_reduction, + [=] (sycl::nd_item<1> item, auto& sumx) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + TRAP_INT_BODY + } + + }); + }); + + Real_type lsumx; + Real_ptr plsumx = &lsumx; + getSyclDeviceData(plsumx, sumx, 1, qu); + m_sumx += lsumx * h; + + } + stopTimer(); + + deallocSyclDeviceData(sumx, qu); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); + + } else { + std::cout << "\n TRAP_INT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/TRAP_INT-func.hpp b/src/basic/TRAP_INT-func.hpp new file mode 100644 index 000000000..9c4b90c52 --- /dev/null +++ b/src/basic/TRAP_INT-func.hpp @@ -0,0 +1,35 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_Basic_TRAP_INT_FUNC_HPP +#define RAJAPerf_Basic_TRAP_INT_FUNC_HPP + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop in each variant. 
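Reviewer note: the new Base_SYCL variant above relies on sycl::reduction rather than a hand-rolled shared-memory tree. A standalone sketch of that pattern; the queue setup, sizes, and summand are illustrative, not from this patch:

#include <sycl/sycl.hpp>
#include <cstdio>

int main()
{
  constexpr size_t n = 1 << 20;
  constexpr size_t wg = 256;                  // work-group size
  sycl::queue q;

  double* sum = sycl::malloc_shared<double>(1, q);
  *sum = 0.0;

  // Round the global range up to a whole number of work-groups, as the
  // kernel above does, and guard the tail inside the kernel.
  const size_t global = wg * ((n + wg - 1) / wg);

  q.submit([&](sycl::handler& h) {
    auto red = sycl::reduction(sum, sycl::plus<>());
    h.parallel_for(sycl::nd_range<1>(global, wg), red,
                   [=](sycl::nd_item<1> item, auto& s) {
      size_t i = item.get_global_id(0);
      if (i < n) {
        s += 1.0 / n;                         // sums to ~1.0
      }
    });
  }).wait();

  std::printf("sum = %f\n", *sum);
  sycl::free(sum, q);
  return 0;
}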
+// +RAJA_INLINE +RAJA_HOST_DEVICE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index eaac3ffda..09da695ea 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,8 +28,9 @@ TRAP_INT::TRAP_INT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(10 * getActualProblemSize()); // 1 sqrt setUsesFeature(Forall); @@ -52,6 +53,9 @@ TRAP_INT::TRAP_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index e64932dbe..8f8ca9337 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
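Reviewer note: factoring trap_int_func into TRAP_INT-func.hpp with RAJA_HOST_DEVICE lets every backend compile the identical integrand 1/sqrt((x-xp)^2 + (y-yp)^2). A host-only check of the accumulate-then-scale-by-h pattern the variants share; the parameters and the (i + 0.5) abscissa are illustrative, not the exact TRAP_INT_BODY formula:

#include <cmath>
#include <cstdio>

double trap_int_func(double x, double y, double xp, double yp)
{
  double denom = (x - xp)*(x - xp) + (y - yp)*(y - yp);
  return 1.0 / std::sqrt(denom);
}

int main()
{
  const long n = 100000;
  const double x0 = 0.0, xp = -1.0, y = 0.5, yp = -0.5;
  const double h = 1.0 / n;

  double sumx = 0.0;
  for (long i = 0; i < n; ++i) {
    double x = x0 + (i + 0.5) * h;            // illustrative abscissa
    sumx += trap_int_func(x, y, xp, yp);
  }

  std::printf("integral ~= %.10f\n", sumx * h);  // scaled once, like m_sumx += sumx * h
  return 0;
}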
// @@ -67,18 +67,37 @@ class TRAP_INT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_x0; Real_type m_xp; diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt new file mode 100644 index 000000000..9298e7bce --- /dev/null +++ b/src/comm/CMakeLists.txt @@ -0,0 +1,43 @@ +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. 
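Reviewer note: the switch from setBytesPerRep to the read/write/atomic split in TRAP_INT.cpp above makes explicit that this kernel moves almost no memory: each rep reads and writes one Real_type for the reduction result while doing 10 FLOPs per element, since x is computed from the loop index. A quick check of the nominal arithmetic intensity those counters imply; the problem size is illustrative:

#include <cstdio>

int main()
{
  const double n = 1.0e6;                     // illustrative problem size
  const double flops = 10.0 * n;              // setFLOPsPerRep above
  const double bytes = 2.0 * sizeof(double);  // one read + one write per rep
  std::printf("FLOP/byte ~ %.2e\n", flops / bytes);  // strongly compute-bound
  return 0;
}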
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_library( + NAME comm + SOURCES HALO_base.cpp + HALO_PACKING.cpp + HALO_PACKING-Seq.cpp + HALO_PACKING-Hip.cpp + HALO_PACKING-Cuda.cpp + HALO_PACKING-OMP.cpp + HALO_PACKING-OMPTarget.cpp + HALO_PACKING_FUSED.cpp + HALO_PACKING_FUSED-Seq.cpp + HALO_PACKING_FUSED-Hip.cpp + HALO_PACKING_FUSED-Cuda.cpp + HALO_PACKING_FUSED-OMP.cpp + HALO_PACKING_FUSED-OMPTarget.cpp + HALO_SENDRECV.cpp + HALO_SENDRECV-Seq.cpp + HALO_SENDRECV-Hip.cpp + HALO_SENDRECV-Cuda.cpp + HALO_SENDRECV-OMP.cpp + HALO_SENDRECV-OMPTarget.cpp + HALO_EXCHANGE.cpp + HALO_EXCHANGE-Seq.cpp + HALO_EXCHANGE-Hip.cpp + HALO_EXCHANGE-Cuda.cpp + HALO_EXCHANGE-OMP.cpp + HALO_EXCHANGE-OMPTarget.cpp + HALO_EXCHANGE_FUSED.cpp + HALO_EXCHANGE_FUSED-Seq.cpp + HALO_EXCHANGE_FUSED-Hip.cpp + HALO_EXCHANGE_FUSED-Cuda.cpp + HALO_EXCHANGE_FUSED-OMP.cpp + HALO_EXCHANGE_FUSED-OMPTarget.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/comm/HALO_EXCHANGE-Cuda.cpp b/src/comm/HALO_EXCHANGE-Cuda.cpp new file mode 100644 index 000000000..ad5482d18 --- /dev/null +++ b/src/comm/HALO_EXCHANGE-Cuda.cpp @@ -0,0 +1,208 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALO_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALO_UNPACK_BODY; + } +} + + +template < size_t block_size > +void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALO_EXCHANGE_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + constexpr size_t shmem = 0; + RPlaunchCudaKernel( (halo_exchange_pack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + 
+ cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + constexpr size_t shmem = 0; + RPlaunchCudaKernel( (halo_exchange_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len); + buffer += len; + } + } + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + using EXEC_POL = RAJA::cuda_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=] __device__ (Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( res, + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + res.wait(); + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=] __device__ (Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( res, + RAJA::TypedRangeSegment(0, len), + halo_exchange_unpack_base_lam ); + buffer += len; + } + } + res.wait(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_EXCHANGE, Cuda) + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/comm/HALO_EXCHANGE-Hip.cpp b/src/comm/HALO_EXCHANGE-Hip.cpp new file mode 100644 index 000000000..1b7ffb04a --- /dev/null +++ b/src/comm/HALO_EXCHANGE-Hip.cpp @@ -0,0 +1,208 @@ 
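Reviewer note: every HALO_EXCHANGE variant above, GPU ones included, shares one MPI control flow: post all receives, pack and Isend per neighbor, unpack in arrival order via MPI_Waitany, then Waitall on the sends. A reduced skeleton of that flow; the types and the zero tag are simplified stand-ins for the suite's buffers and tag arrays:

#include <mpi.h>
#include <vector>

void exchange_skeleton(int num_neighbors, const std::vector<int>& ranks,
                       std::vector<std::vector<double>>& send,
                       std::vector<std::vector<double>>& recv)
{
  std::vector<MPI_Request> recv_reqs(num_neighbors), send_reqs(num_neighbors);

  // 1. Post all receives first so incoming packed data can land immediately.
  for (int l = 0; l < num_neighbors; ++l) {
    MPI_Irecv(recv[l].data(), static_cast<int>(recv[l].size()), MPI_DOUBLE,
              ranks[l], 0, MPI_COMM_WORLD, &recv_reqs[l]);
  }

  // 2. Per neighbor: the pack kernel plus stream sync go here, then send.
  for (int l = 0; l < num_neighbors; ++l) {
    MPI_Isend(send[l].data(), static_cast<int>(send[l].size()), MPI_DOUBLE,
              ranks[l], 0, MPI_COMM_WORLD, &send_reqs[l]);
  }

  // 3. Unpack messages in arrival order, not neighbor order.
  for (int n = 0; n < num_neighbors; ++n) {
    int l = -1;
    MPI_Waitany(num_neighbors, recv_reqs.data(), &l, MPI_STATUS_IGNORE);
    // ... launch the unpack kernel for neighbor l ...
  }

  // 4. Drain the sends before the pack buffers are reused next rep.
  MPI_Waitall(num_neighbors, send_reqs.data(), MPI_STATUSES_IGNORE);
}

The stream synchronize before MPI_Isend in the base GPU variants is what guarantees the pack kernel has finished writing the buffer MPI is about to read.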
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALO_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALO_UNPACK_BODY; + } +} + + +template < size_t block_size > +void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_EXCHANGE_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + constexpr size_t shmem = 0; + RPlaunchHipKernel( (halo_exchange_pack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + constexpr size_t shmem = 0; + RPlaunchHipKernel( (halo_exchange_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len); + buffer += len; + } + } + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + using EXEC_POL = RAJA::hip_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=] __device__ (Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( res, + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + res.wait(); + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=] __device__ (Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( res, + RAJA::TypedRangeSegment(0, len), + halo_exchange_unpack_base_lam ); + buffer += len; + } + } + res.wait(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE : Unknown Hip variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_EXCHANGE, Hip) + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/comm/HALO_EXCHANGE-OMP.cpp b/src/comm/HALO_EXCHANGE-OMP.cpp new file mode 100644 index 000000000..922151704 --- /dev/null +++ b/src/comm/HALO_EXCHANGE-OMP.cpp @@ -0,0 +1,254 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + halo_exchange_pack_base_lam(i); + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if 
(separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + halo_exchange_unpack_base_lam(i); + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + using EXEC_POL = RAJA::omp_parallel_for_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_unpack_base_lam ); + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALO_EXCHANGE : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE-OMPTarget.cpp b/src/comm/HALO_EXCHANGE-OMPTarget.cpp new file mode 100644 index 000000000..f83eb2826 --- /dev/null +++ b/src/comm/HALO_EXCHANGE-OMPTarget.cpp @@ -0,0 +1,176 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
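Reviewer note: the Base_OpenMPTarget pack/unpack loops above hand already-mapped device pointers to the target region with is_device_ptr. A minimal sketch of that gather; did and the schedule clause mirror the code above, the function name and types are illustrative:

void gather_on_device(double* buffer, const int* list, const double* var,
                      int len, int did)
{
  // buffer, list, and var are device pointers already, so the target
  // region must not remap them; is_device_ptr passes them through as-is.
  #pragma omp target is_device_ptr(buffer, list, var) device(did)
  #pragma omp teams distribute parallel for schedule(static, 1)
  for (int i = 0; i < len; ++i) {
    buffer[i] = var[list[i]];   // HALO_PACK_BODY shape
  }
}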
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void HALO_EXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp target is_device_ptr(buffer, list, var) device( did ) + #pragma omp teams distribute parallel for schedule(static, 1) + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp target is_device_ptr(buffer, list, var) device( did ) + #pragma omp teams distribute parallel for schedule(static, 1) + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + using EXEC_POL = RAJA::omp_target_parallel_for_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + 
mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_unpack_base_lam ); + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/comm/HALO_EXCHANGE-Seq.cpp b/src/comm/HALO_EXCHANGE-Seq.cpp new file mode 100644 index 000000000..b5cbbf6f6 --- /dev/null +++ b/src/comm/HALO_EXCHANGE-Seq.cpp @@ -0,0 +1,247 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_EXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + 
stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + for (Index_type i = 0; i < len; i++) { + halo_exchange_pack_base_lam(i); + } + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + for (Index_type i = 0; i < len; i++) { + halo_exchange_unpack_base_lam(i); + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + using EXEC_POL = RAJA::seq_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + halo_exchange_pack_base_lam ); + buffer += len; + } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto halo_exchange_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + 
halo_exchange_unpack_base_lam ); + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALO_EXCHANGE : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp new file mode 100644 index 000000000..bbca8851f --- /dev/null +++ b/src/comm/HALO_EXCHANGE.cpp @@ -0,0 +1,165 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +namespace rajaperf +{ +namespace comm +{ + +HALO_EXCHANGE::HALO_EXCHANGE(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_EXCHANGE, params) +{ + m_mpi_size = params.getMPISize(); + m_my_mpi_rank = params.getMPIRank(); + m_mpi_dims = params.getMPI3DDivision(); + + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // send + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // recv + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(MPI); + + if (params.validMPI3DDivision()) { + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + } +} + +HALO_EXCHANGE::~HALO_EXCHANGE() +{ +} + +void HALO_EXCHANGE::setUp(VariantID vid, size_t tune_idx) +{ + setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + 
allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void HALO_EXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + +void HALO_EXCHANGE::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE.hpp b/src/comm/HALO_EXCHANGE.hpp new file mode 100644 index 000000000..8f3cf1cda --- /dev/null +++ b/src/comm/HALO_EXCHANGE.hpp @@ -0,0 +1,147 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
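Reviewer note: setUp above either aliases send_buffers[l] to pack_buffers[l] (when the MPI data space can be handed to MPI directly) or allocates a separate host staging buffer that copyData fills before each Isend. A condensed sketch of that choice; malloc stands in for the suite's allocData and the struct is illustrative:

#include <cstddef>
#include <cstdlib>

struct Buffers {
  double* pack = nullptr;   // written by the pack kernel
  double* send = nullptr;   // handed to MPI_Isend
  bool separate = false;
};

Buffers make_buffers(std::size_t len, bool mpi_can_read_pack_memory)
{
  Buffers b;
  b.separate = !mpi_can_read_pack_memory;
  b.pack = static_cast<double*>(std::malloc(len * sizeof(double)));
  // Alias when MPI can consume the packed memory in place; otherwise a
  // host staging buffer plus an explicit copy step is required.
  b.send = b.separate
         ? static_cast<double*>(std::malloc(len * sizeof(double)))
         : b.pack;
  return b;
}

void free_buffers(Buffers& b)
{
  if (b.separate) std::free(b.send);  // only when it is a distinct buffer
  std::free(b.pack);
  b.pack = b.send = nullptr;
}

Mirroring the tearDown order above, the staging buffer is freed only in the separate-buffers case, since otherwise send and pack name the same allocation.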
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// HALO_EXCHANGE kernel reference implementation:
+///
+/// // post a recv for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Index_type len = unpack_index_list_lengths[l];
+///   MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+///       mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+/// }
+///
+/// // pack a buffer for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Real_ptr buffer = pack_buffers[l];
+///   Int_ptr list = pack_index_lists[l];
+///   Index_type len = pack_index_list_lengths[l];
+///   // pack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       buffer[i] = var[list[i]];
+///     }
+///     buffer += len;
+///   }
+///   // send buffer to neighbor
+///   MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+///       mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+/// }
+///
+/// // unpack a buffer for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   // receive buffer from neighbor
+///   MPI_Wait(&unpack_mpi_requests[l], MPI_STATUS_IGNORE);
+///   Real_ptr buffer = unpack_buffers[l];
+///   Int_ptr list = unpack_index_lists[l];
+///   Index_type len = unpack_index_list_lengths[l];
+///   // unpack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       var[list[i]] = buffer[i];
+///     }
+///     buffer += len;
+///   }
+/// }
+///
+/// // wait for all sends to complete
+/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+///
+
+
+#ifndef RAJAPerf_Comm_HALO_EXCHANGE_HPP
+#define RAJAPerf_Comm_HALO_EXCHANGE_HPP
+
+#define HALO_EXCHANGE_DATA_SETUP \
+  HALO_BASE_DATA_SETUP \
+  \
+  Index_type num_vars = m_num_vars; \
+  std::vector<Real_ptr> vars = m_vars; \
+  \
+  std::vector<int> mpi_ranks = m_mpi_ranks; \
+  \
+  std::vector<MPI_Request> pack_mpi_requests(num_neighbors); \
+  std::vector<MPI_Request> unpack_mpi_requests(num_neighbors); \
+  \
+  const DataSpace dataSpace = getDataSpace(vid); \
+  \
+  const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \
+  \
+  std::vector<Real_ptr> pack_buffers = m_pack_buffers; \
+  std::vector<Real_ptr> unpack_buffers = m_unpack_buffers; \
+  \
+  std::vector<Real_ptr> send_buffers = m_send_buffers; \
+  std::vector<Real_ptr> recv_buffers = m_recv_buffers;
+
+
+#include "HALO_base.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+
+#include <mpi.h>
+
+#include <vector>
+
+namespace rajaperf
+{
+namespace comm
+{
+
+class HALO_EXCHANGE : public HALO_base
+{
+public:
+
+  HALO_EXCHANGE(const RunParams& params);
+
+  ~HALO_EXCHANGE();
+
+  void setUp(VariantID vid, size_t tune_idx);
+  void updateChecksum(VariantID vid, size_t tune_idx);
+  void tearDown(VariantID vid, size_t tune_idx);
+
+  void runSeqVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPVariant(VariantID vid, size_t tune_idx);
+  void runCudaVariant(VariantID vid, size_t tune_idx);
+  void runHipVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
+  template < size_t block_size >
+  void runCudaVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runHipVariantImpl(VariantID vid);
+
+private:
+  static const size_t default_gpu_block_size = 256;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
+
+  int m_mpi_size = -1;
+  int m_my_mpi_rank = -1;
+  std::array<int, 3> m_mpi_dims = {-1, -1, -1};
+
+  Index_type m_num_vars;
+  Index_type m_var_size;
+
+  std::vector<Real_ptr> m_vars;
+
+  std::vector<Real_ptr> m_pack_buffers;
+  std::vector<Real_ptr> m_unpack_buffers;
+
+  std::vector<Real_ptr> m_send_buffers;
+  std::vector<Real_ptr> m_recv_buffers;
+};
+
+} // end namespace comm
+} // end namespace rajaperf
+
+#endif
+#endif // closing endif for header file include guard
diff --git a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp
new file mode 100644
index 000000000..a9d161183
--- /dev/null
+++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp
@@ -0,0 +1,416 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "HALO_EXCHANGE_FUSED.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA)
+
+#include "common/CudaDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace comm
+{
+
+#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \
+  Real_ptr*   pack_buffer_ptrs; \
+  Int_ptr*    pack_list_ptrs; \
+  Real_ptr*   pack_var_ptrs; \
+  Index_type* pack_len_ptrs; \
+  allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \
+  Real_ptr*   unpack_buffer_ptrs; \
+  Int_ptr*    unpack_list_ptrs; \
+  Real_ptr*   unpack_var_ptrs; \
+  Index_type* unpack_len_ptrs; \
+  allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \
+  allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars);
+
+#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \
+  deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \
+  deallocData(DataSpace::CudaPinned, pack_list_ptrs); \
+  deallocData(DataSpace::CudaPinned, pack_var_ptrs); \
+  deallocData(DataSpace::CudaPinned, pack_len_ptrs); \
+  deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \
+  deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \
+  deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \
+  deallocData(DataSpace::CudaPinned, unpack_len_ptrs);
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void halo_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs,
+                                         Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs)
+{
+  Index_type j = blockIdx.y;
+
+  Real_ptr buffer = pack_buffer_ptrs[j];
+  Int_ptr list = pack_list_ptrs[j];
+  Real_ptr var = pack_var_ptrs[j];
+  Index_type len = pack_len_ptrs[j];
+
+  for (Index_type i = threadIdx.x + blockIdx.x * block_size;
+       i < len;
+       i += block_size * gridDim.x) {
+    HALO_PACK_BODY;
+  }
+}
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void halo_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs,
+                                           Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs)
+{
+  Index_type j =
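+  // Fused-kernel indexing: blockIdx.y selects one of the
+  // num_neighbors * num_vars (buffer, list, var) segments, and the x
+  // dimension grid-strides over that segment's elements, so a single
+  // launch covers every segment regardless of its index-list length.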
blockIdx.y; + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_UNPACK_BODY; + } +} + + +template < size_t block_size > +void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + constexpr size_t shmem = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_buffer_ptrs[pack_index] = buffer; + pack_list_ptrs[pack_index] = list; + pack_var_ptrs[pack_index] = var; + pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + dim3 pack_nthreads_per_block(block_size); + dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); + RPlaunchCudaKernel( (halo_exchange_fused_pack), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_buffer_ptrs[unpack_index] = buffer; + unpack_list_ptrs[unpack_index] = list; + unpack_var_ptrs[unpack_index] = var; + unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + dim3 unpack_nthreads_per_block(block_size); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); + RPlaunchCudaKernel( (halo_exchange_fused_unpack), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs); + cudaErrchk( 
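+      // The launches above size the grid from a ceiling-divide average:
+      //   pack_len_ave = ceil(pack_len_sum / pack_index)
+      //   pack_nblocks = (ceil(pack_len_ave / block_size), pack_index)
+      // With illustrative numbers only: 26 neighbors * 3 vars = 78
+      // segments; if pack_len_sum = 39000 then pack_len_ave = 500, and
+      // block_size = 256 gives a 2 x 78 grid. Segments longer than the
+      // average are still covered because the kernel grid-strides in x.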
cudaStreamSynchronize( res.get_stream() ) ); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runCudaVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + res.wait(); + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + 
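+      // The WorkGroup constructs above follow a pool -> group -> site
+      // lifecycle. A minimal sketch of that pattern, reusing the typedefs
+      // above (illustrative only; num_loops, pool_storage_bytes, n, and
+      // LoopBody are placeholders, and the block is excluded from the
+      // build):
+#if 0
+      workpool pool(allocatorHolder.template getAllocator<Index_type>());
+      pool.reserve(num_loops, pool_storage_bytes);     // pre-size the pool
+      pool.enqueue(range_segment(0, n), LoopBody{/*...*/}); // stage work
+      workgroup group = pool.instantiate();  // freeze the staged loops
+      worksite site = group.run(res);        // run them as one fused launch
+      res.wait();                            // complete before reusing data
+#endif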
MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void HALO_EXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantDirect(vid); + + } + + t += 1; + + } + + }); + + } + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runCudaVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } + + }); + + } + +} + +void HALO_EXCHANGE_FUSED::setCudaTuningDefinitions(VariantID vid) +{ + if (vid == Base_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + } + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); + + } + + }); + + } + +} +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp new file mode 100644 index 000000000..2ac30479b --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp @@ -0,0 +1,416 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ + Real_ptr* pack_buffer_ptrs; \ + Int_ptr* pack_list_ptrs; \ + Real_ptr* pack_var_ptrs; \ + Index_type* pack_len_ptrs; \ + allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs; \ + Int_ptr* unpack_list_ptrs; \ + Real_ptr* unpack_var_ptrs; \ + Index_type* unpack_len_ptrs; \ + allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ + deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_UNPACK_BODY; + } +} + + +template < size_t block_size > +void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + constexpr size_t shmem = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + 
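+        // (Receives for all neighbors are posted up front, before any
+        // packing or sending, so each MPI_Isend below can match an
+        // already-posted receive and packing can overlap message arrival.)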
mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_buffer_ptrs[pack_index] = buffer; + pack_list_ptrs[pack_index] = list; + pack_var_ptrs[pack_index] = var; + pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + dim3 pack_nthreads_per_block(block_size); + dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); + RPlaunchHipKernel( (halo_exchange_fused_pack), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_buffer_ptrs[unpack_index] = buffer; + unpack_list_ptrs[unpack_index] = list; + unpack_var_ptrs[unpack_index] = var; + unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + dim3 unpack_nthreads_per_block(block_size); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); + RPlaunchHipKernel( (halo_exchange_fused_unpack), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runHipVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + + 
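+    // The dispatch_helper template parameter chooses how the WorkGroup
+    // invokes the enqueued Packer/UnPacker work items (for example,
+    // indirect function-call dispatch vs. a dispatch specialized on the
+    // listed (segment, loop-body) pairs); each helper is exposed as its
+    // own tuning.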
using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + res.wait(); + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + + +void HALO_EXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantDirect(vid); + + } + + t += 1; + + } + + }); + + } + + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runHipVariantWorkGroup(vid); 
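+
+          // tune_idx values are consumed here in the same order that
+          // setHipTuningDefinitions registers names below: one
+          // "direct_<block_size>" tuning per valid block size for
+          // Base_HIP, then one tuning per (dispatch helper, block size)
+          // pair for RAJA_HIP.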
+ + } + + t += 1; + + }); + + } + + }); + + } +} + +void HALO_EXCHANGE_FUSED::setHipTuningDefinitions(VariantID vid) +{ + if (vid == Base_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + } + + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); + + } + + }); + + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp new file mode 100644 index 000000000..1af5d4bb9 --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp @@ -0,0 +1,477 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_EXCHANGE_FUSED::runOpenMPVariantDirect(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < pack_index; j++) { + #pragma omp task firstprivate(j) + { + Real_ptr buffer = pack_ptr_holders[j].buffer; + Int_ptr list = pack_ptr_holders[j].list; + Real_ptr var = pack_ptr_holders[j].var; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < pack_index; j++) { + Real_ptr buffer = pack_ptr_holders[j].buffer; + Int_ptr list = pack_ptr_holders[j].list; + Real_ptr var = pack_ptr_holders[j].var; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_PACK_BODY; + } + } +#endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, 
send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < unpack_index; j++) { + #pragma omp task firstprivate(j) + { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < unpack_index; j++) { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + } +#endif + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + + case Lambda_OpenMP : { + + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < pack_index; j++) { + #pragma omp task firstprivate(j) + { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < pack_index; j++) { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } +#endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = 
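+        // When the MPI data space is DataSpace::Copy (separate_buffers),
+        // packed data was staged into distinct host send buffers by the
+        // copyData loop above; otherwise send_buffers[l] aliases
+        // pack_buffers[l] and MPI_Isend reads the packed buffer directly.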
pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < unpack_index; j++) { + #pragma omp task firstprivate(j) + { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + unpack_lambda(i); + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < unpack_index; j++) { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + unpack_lambda(i); + } + } +#endif + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } + + default : { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +template < typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runOpenMPVariantWorkGroup(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + + case RAJA_OpenMP : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = 
pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +void HALO_EXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { + + if (tune_idx == t) { + + runOpenMPVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_OpenMP) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runOpenMPVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_EXCHANGE_FUSED::setOpenMPTuningDefinitions(VariantID vid) +{ + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { + + addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_OpenMP) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp new file mode 100644 index 000000000..18c32437d --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp @@ -0,0 +1,358 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + // + // Define threads per team for target execution (unused) + // +//const size_t threads_per_team = 256; + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ + void** pack_ptrs; \ + allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ + Real_ptr* pack_buffer_ptrs = reinterpret_cast(pack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* pack_list_ptrs = reinterpret_cast(pack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* pack_var_ptrs = reinterpret_cast(pack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* pack_len_ptrs = reinterpret_cast(pack_ptrs) + 3 * num_neighbors * num_vars; \ + void** h_pack_ptrs = new void*[4 * num_neighbors * num_vars]; \ + Real_ptr* h_pack_buffer_ptrs = reinterpret_cast(h_pack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* h_pack_list_ptrs = reinterpret_cast(h_pack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* h_pack_var_ptrs = reinterpret_cast(h_pack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* h_pack_len_ptrs = reinterpret_cast(h_pack_ptrs) + 3 * num_neighbors * num_vars; \ + void** unpack_ptrs; \ + allocData(DataSpace::OmpTarget, unpack_ptrs, 4 * num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs = reinterpret_cast(unpack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* unpack_list_ptrs = reinterpret_cast(unpack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* unpack_var_ptrs = reinterpret_cast(unpack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* unpack_len_ptrs = reinterpret_cast(unpack_ptrs) + 3 * num_neighbors * num_vars; \ + void** h_unpack_ptrs = new void*[4 * num_neighbors * num_vars]; \ + Real_ptr* h_unpack_buffer_ptrs = reinterpret_cast(h_unpack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* h_unpack_list_ptrs = reinterpret_cast(h_unpack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* h_unpack_var_ptrs = reinterpret_cast(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* h_unpack_len_ptrs = reinterpret_cast(h_unpack_ptrs) + 3 * num_neighbors * num_vars; + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ + initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ + initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); + +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ + deallocData(DataSpace::OmpTarget, pack_ptrs); \ + delete[] h_pack_ptrs; \ + deallocData(DataSpace::OmpTarget, unpack_ptrs); \ + delete[] h_unpack_ptrs; + + +void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantDirect(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + 
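+        // The fuser arguments live in a single DataSpace::OmpTarget
+        // allocation of 4 * num_neighbors * num_vars pointer-sized slots,
+        // partitioned via reinterpret_cast into buffer/list/var/len arrays
+        // (see the setup macro above). With illustrative sizes of 26
+        // neighbors and 3 vars, each array spans 78 slots, at slot offsets
+        // 0, 78, 156, and 234. Host mirrors are filled below and pushed to
+        // the device with one initOpenMPDeviceData call per phase.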
+ for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + h_pack_buffer_ptrs[pack_index] = buffer; + h_pack_list_ptrs[pack_index] = list; + h_pack_var_ptrs[pack_index] = var; + h_pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) + #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) + for (Index_type j = 0; j < pack_index; j++) { + for (Index_type ii = 0; ii < pack_len_ave; ii++) { + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = ii; i < len; i += pack_len_ave) { + HALO_PACK_BODY; + } + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + h_unpack_buffer_ptrs[unpack_index] = buffer; + h_unpack_list_ptrs[unpack_index] = list; + h_unpack_var_ptrs[unpack_index] = var; + h_unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) + #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) + for (Index_type j = 0; j < unpack_index; j++) { + for (Index_type ii = 0; ii < unpack_len_ave; ii++) { + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = ii; i < len; i += unpack_len_ave) { + HALO_UNPACK_BODY; + } + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +template < typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_OpenMPTarget ) { + + 
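+    // In the Base_OpenMPTarget kernels above, collapse(2) flattens the
+    // (segment j, chunk ii) space into pack_index * pack_len_ave (or the
+    // unpack equivalents) iterations; the innermost loop strides by the
+    // average length so longer-than-average segments are still fully
+    // covered. E.g. with len_ave = 500 and len = 1200 (illustrative),
+    // ii = 0 touches i = 0, 500, 1000 and ii = 499 touches i = 499, 999.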
using AllocatorHolder = RAJAPoolAllocatorHolder<
+        RAJA::basic_mempool::MemPool<RAJA::basic_mempool::generic_allocator>>;
+    using Allocator = AllocatorHolder::Allocator;
+
+    AllocatorHolder allocatorHolder;
+
+    using range_segment = RAJA::TypedRangeSegment<Index_type>;
+
+    using dispatch_policy = typename dispatch_helper::template dispatch_policy<
+        camp::list<range_segment, Packer>,
+        camp::list<range_segment, UnPacker>>;
+
+    using workgroup_policy = RAJA::WorkGroupPolicy <
+                                 RAJA::omp_target_work /*<threads_per_team>*/,
+                                 RAJA::ordered,
+                                 RAJA::constant_stride_array_of_objects,
+                                 dispatch_policy >;
+
+    using workpool = RAJA::WorkPool< workgroup_policy,
+                                     Index_type,
+                                     RAJA::xargs<>,
+                                     Allocator >;
+
+    using workgroup = RAJA::WorkGroup< workgroup_policy,
+                                       Index_type,
+                                       RAJA::xargs<>,
+                                       Allocator >;
+
+    using worksite = RAJA::WorkSite< workgroup_policy,
+                                     Index_type,
+                                     RAJA::xargs<>,
+                                     Allocator >;
+
+    workpool pool_pack  (allocatorHolder.template getAllocator<Index_type>());
+    workpool pool_unpack(allocatorHolder.template getAllocator<Index_type>());
+    pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
+    pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Index_type len = unpack_index_list_lengths[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+            mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
+        Int_ptr list = pack_index_lists[l];
+        Index_type len = pack_index_list_lengths[l];
+        for (Index_type v = 0; v < num_vars; ++v) {
+          Real_ptr var = vars[v];
+          pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list});
+          buffer += len;
+        }
+      }
+      workgroup group_pack = pool_pack.instantiate();
+      worksite site_pack = group_pack.run();
+      if (separate_buffers) {
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Index_type len = pack_index_list_lengths[l];
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+      }
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Index_type len = pack_index_list_lengths[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+            mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+      }
+
+      MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = unpack_buffers[l];
+        Int_ptr list = unpack_index_lists[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
+        for (Index_type v = 0; v < num_vars; ++v) {
+          Real_ptr var = vars[v];
+          pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list});
+          buffer += len;
+        }
+      }
+      workgroup group_unpack = pool_unpack.instantiate();
+      worksite site_unpack = group_unpack.run();
+
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+    }
+    stopTimer();
+
+  } else {
+     getCout() << "\n HALO_EXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+}
+
+void HALO_EXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if (vid == Base_OpenMPTarget) {
+
+    if (tune_idx == t) {
+
+      runOpenMPTargetVariantDirect(vid);
+
+    }
+
+    t += 1;
+
+  }
+
+  if (vid == RAJA_OpenMPTarget) {
+
+    seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) {
+
+      if (tune_idx == t) {
+
+        runOpenMPTargetVariantWorkGroup<decltype(dispatch_helper)>(vid);
+
+      }
+
+      t += 1;
+
});
+
+  }
+}
+
+void HALO_EXCHANGE_FUSED::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  if (vid == Base_OpenMPTarget) {
+
+    addVariantTuningName(vid, "direct");
+
+  }
+
+  if (vid == RAJA_OpenMPTarget) {
+
+    seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) {
+
+      addVariantTuningName(vid, decltype(dispatch_helper)::get_name());
+
+    });
+
+  }
+}
+
+} // end namespace comm
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_TARGET_OPENMP
diff --git a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp
new file mode 100644
index 000000000..bca51de0d
--- /dev/null
+++ b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp
@@ -0,0 +1,399 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "HALO_EXCHANGE_FUSED.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace comm
+{
+
+
+void HALO_EXCHANGE_FUSED::runSeqVariantDirect(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  HALO_EXCHANGE_FUSED_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Index_type len = unpack_index_list_lengths[l];
+          MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+              mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+        }
+
+        Index_type pack_index = 0;
+
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Real_ptr buffer = pack_buffers[l];
+          Int_ptr list = pack_index_lists[l];
+          Index_type len = pack_index_list_lengths[l];
+          for (Index_type v = 0; v < num_vars; ++v) {
+            Real_ptr var = vars[v];
+            pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var};
+            pack_lens[pack_index] = len;
+            pack_index += 1;
+            buffer += len;
+          }
+        }
+        for (Index_type j = 0; j < pack_index; j++) {
+          Real_ptr buffer = pack_ptr_holders[j].buffer;
+          Int_ptr list = pack_ptr_holders[j].list;
+          Real_ptr var = pack_ptr_holders[j].var;
+          Index_type len = pack_lens[j];
+          for (Index_type i = 0; i < len; i++) {
+            HALO_PACK_BODY;
+          }
+        }
+        if (separate_buffers) {
+          for (Index_type l = 0; l < num_neighbors; ++l) {
+            Index_type len = pack_index_list_lengths[l];
+            copyData(DataSpace::Host, send_buffers[l],
+                     dataSpace, pack_buffers[l],
+                     len*num_vars);
+          }
+        }
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Index_type len = pack_index_list_lengths[l];
+          MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+              mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+        }
+
+        MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+        Index_type unpack_index = 0;
+
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Real_ptr buffer = unpack_buffers[l];
+          Int_ptr list = unpack_index_lists[l];
+          Index_type len = unpack_index_list_lengths[l];
+          if (separate_buffers) {
+            copyData(dataSpace, unpack_buffers[l],
+                     DataSpace::Host, recv_buffers[l],
+                     len*num_vars);
+          }
+
+          for (Index_type v = 0; v < num_vars; ++v) {
+            Real_ptr var = vars[v];
+            unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var};
+            unpack_lens[unpack_index] = len;
+            unpack_index += 1;
+            buffer += len;
+          }
+ } + for (Index_type j = 0; j < unpack_index; j++) { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALO_UNPACK_BODY; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } + for (Index_type j = 0; j < pack_index; j++) { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } + for (Index_type j = 0; j < unpack_index; j++) { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + unpack_lambda(i); + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +template < typename dispatch_helper > +void HALO_EXCHANGE_FUSED::runSeqVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALO_EXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder 
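+      // The mempool-backed allocator below provides storage for the
+      // WorkPool/WorkGroup so the per-rep enqueue/instantiate cycle reuses
+      // memory rather than reallocating; reserve() pre-sizes it for
+      // num_neighbors * num_vars loops with a 1024*1024 storage hint.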
allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::seq_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +void HALO_EXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_Seq || vid == Lambda_Seq) { + + if (tune_idx == t) { + + runSeqVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_Seq) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runSeqVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_EXCHANGE_FUSED::setSeqTuningDefinitions(VariantID vid) +{ + if (vid == Base_Seq || vid == Lambda_Seq) { + + 
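+    // Registration order here must mirror the tune_idx order in
+    // runSeqVariant above: the single "direct" tuning shared by the Base
+    // and Lambda variants, then one name per dispatch helper for RAJA_Seq.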
addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_Seq) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp new file mode 100644 index 000000000..be76571a2 --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED.cpp @@ -0,0 +1,165 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_EXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +namespace rajaperf +{ +namespace comm +{ + +HALO_EXCHANGE_FUSED::HALO_EXCHANGE_FUSED(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_EXCHANGE_FUSED, params) +{ + m_mpi_size = params.getMPISize(); + m_my_mpi_rank = params.getMPIRank(); + m_mpi_dims = params.getMPI3DDivision(); + + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 ); + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // send + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // recv + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Workgroup); + setUsesFeature(MPI); + + if (params.validMPI3DDivision()) { + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + } +} + +HALO_EXCHANGE_FUSED::~HALO_EXCHANGE_FUSED() +{ +} + +void HALO_EXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) +{ + setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], 
buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void HALO_EXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + +void HALO_EXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_EXCHANGE_FUSED.hpp b/src/comm/HALO_EXCHANGE_FUSED.hpp new file mode 100644 index 000000000..a0962be3a --- /dev/null +++ b/src/comm/HALO_EXCHANGE_FUSED.hpp @@ -0,0 +1,209 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
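
The buffer setup above hinges on separate_buffers: when getMPIDataSpace(vid) is DataSpace::Copy, the packing buffer and the buffer handed to MPI are distinct allocations with an explicit staging copy between them; otherwise the two pointers alias and no copy is needed. A minimal sketch of that aliasing decision, with plain new[] standing in for the suite's allocAndInitData and DataSpace machinery:

#include <cstddef>
#include <vector>

// pack[l] is written by the pack kernel; send[l] is handed to MPI_Isend.
struct HaloBuffers {
  std::vector<double*> pack;
  std::vector<double*> send;
};

HaloBuffers make_buffers(std::size_t num_neighbors, std::size_t len,
                         bool separate_buffers) {
  HaloBuffers b;
  b.pack.resize(num_neighbors, nullptr);
  b.send.resize(num_neighbors, nullptr);
  for (std::size_t l = 0; l < num_neighbors; ++l) {
    b.pack[l] = new double[len];                    // e.g. device-accessible
    b.send[l] = separate_buffers ? new double[len]  // host staging buffer
                                 : b.pack[l];       // alias: no extra copy
  }
  return b;
}
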
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// HALO_EXCHANGE_FUSED kernel reference implementation:
+///
+/// // post a recv for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Index_type len = unpack_index_list_lengths[l];
+///   MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+///       mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+/// }
+///
+/// // pack buffers for neighbors
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Real_ptr buffer = pack_buffers[l];
+///   Int_ptr list = pack_index_lists[l];
+///   Index_type len = pack_index_list_lengths[l];
+///   // pack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       buffer[i] = var[list[i]];
+///     }
+///     buffer += len;
+///   }
+/// }
+///
+/// // send buffers to neighbors
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Index_type len = pack_index_list_lengths[l];
+///   MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+///       mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+/// }
+///
+/// // wait for all recvs to complete
+/// MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+///
+/// // unpack buffers for neighbors
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Real_ptr buffer = unpack_buffers[l];
+///   Int_ptr list = unpack_index_lists[l];
+///   Index_type len = unpack_index_list_lengths[l];
+///   // unpack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       var[list[i]] = buffer[i];
+///     }
+///     buffer += len;
+///   }
+/// }
+///
+/// // wait for all sends to complete
+/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+///
+
+#ifndef RAJAPerf_Comm_HALO_EXCHANGE_FUSED_HPP
+#define RAJAPerf_Comm_HALO_EXCHANGE_FUSED_HPP
+
+#define HALO_EXCHANGE_FUSED_DATA_SETUP \
+  HALO_BASE_DATA_SETUP \
+  \
+  Index_type num_vars = m_num_vars; \
+  std::vector<Real_ptr> vars = m_vars; \
+  \
+  std::vector<int> mpi_ranks = m_mpi_ranks; \
+  \
+  std::vector<MPI_Request> pack_mpi_requests(num_neighbors); \
+  std::vector<MPI_Request> unpack_mpi_requests(num_neighbors); \
+  \
+  const DataSpace dataSpace = getDataSpace(vid); \
+  \
+  const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \
+  \
+  std::vector<Real_ptr> pack_buffers = m_pack_buffers; \
+  std::vector<Real_ptr> unpack_buffers = m_unpack_buffers; \
+  \
+  std::vector<Real_ptr> send_buffers = m_send_buffers; \
+  std::vector<Real_ptr> recv_buffers = m_recv_buffers;
+
+#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP \
+  struct ptr_holder { \
+    Real_ptr buffer; \
+    Int_ptr list; \
+    Real_ptr var; \
+  }; \
+  ptr_holder* pack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \
+  Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \
+  ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \
+  Index_type* unpack_lens = new Index_type[num_neighbors * num_vars];
+
+#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \
+  delete[] pack_ptr_holders; \
+  delete[] pack_lens; \
+  delete[] unpack_ptr_holders; \
+  delete[] unpack_lens;
+
+
+#define HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \
+  auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \
+    return [=](Index_type i) { \
+      HALO_PACK_BODY; \
+    }; \
+  }; \
+  using pack_lambda_type = decltype(make_pack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \
+  pack_lambda_type* pack_lambdas =
reinterpret_cast( \ + malloc(sizeof(pack_lambda_type) * (num_neighbors * num_vars))); \ + Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \ + auto make_unpack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ + return [=](Index_type i) { \ + HALO_UNPACK_BODY; \ + }; \ + }; \ + using unpack_lambda_type = decltype(make_unpack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ + unpack_lambda_type* unpack_lambdas = reinterpret_cast( \ + malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \ + Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; + +#define HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ + free(pack_lambdas); \ + delete[] pack_lens; \ + free(unpack_lambdas); \ + delete[] unpack_lens; + + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +namespace rajaperf +{ +namespace comm +{ + +class HALO_EXCHANGE_FUSED : public HALO_base +{ +public: + + HALO_EXCHANGE_FUSED(const RunParams& params); + + ~HALO_EXCHANGE_FUSED(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + void runSeqVariantDirect(VariantID vid); + void runOpenMPVariantDirect(VariantID vid); + void runOpenMPTargetVariantDirect(VariantID vid); + template < size_t block_size > + void runCudaVariantDirect(VariantID vid); + template < size_t block_size > + void runHipVariantDirect(VariantID vid); + + template < typename dispatch_helper > + void runSeqVariantWorkGroup(VariantID vid); + template < typename dispatch_helper > + void runOpenMPVariantWorkGroup(VariantID vid); + template < typename dispatch_helper > + void runOpenMPTargetVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_helper > + void runCudaVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_helper > + void runHipVariantWorkGroup(VariantID vid); + +private: + static const size_t default_gpu_block_size = 1024; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + int m_mpi_size = -1; + int m_my_mpi_rank = -1; + std::array m_mpi_dims = {-1, -1, -1}; + + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_vars; + + std::vector m_pack_buffers; + std::vector m_unpack_buffers; + + std::vector m_send_buffers; + std::vector m_recv_buffers; +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif +#endif // closing endif for header file include guard diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/comm/HALO_PACKING-Cuda.cpp similarity index 54% rename from src/apps/HALOEXCHANGE-Cuda.cpp rename to src/comm/HALO_PACKING-Cuda.cpp index 3a8ae049b..6e09d0805 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/comm/HALO_PACKING-Cuda.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence 
Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,42 +18,42 @@ namespace rajaperf { -namespace apps +namespace comm { template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) +void HALO_PACKING::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -61,32 +61,49 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - haloexchange_pack<<>>(buffer, list, var, len); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (halo_packing_pack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); } - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - haloexchange_unpack<<>>(buffer, list, var, len); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (halo_packing_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); buffer += len; } } @@ -103,34 +120,47 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - 
Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=] __device__ (Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + res.wait(); } - res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=] __device__ (Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -140,13 +170,13 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_PACKING, Cuda) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/comm/HALO_PACKING-Hip.cpp similarity index 54% rename from src/apps/HALOEXCHANGE-Hip.cpp rename to src/comm/HALO_PACKING-Hip.cpp index 9831b6a69..583804396 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/comm/HALO_PACKING-Hip.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
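
The CUDA variant above launches one thread per index-list element, and HALO_PACK_BODY expands to a gather through the list. A self-contained sketch of that kernel shape and its ceiling-division launch math; the names are hypothetical, and a raw cudaStream_t stands in for the suite's resource object and RPlaunchCudaKernel wrapper:

#include <cuda_runtime.h>

// Gather one element of the halo through the index list.
template <int block_size>
__launch_bounds__(block_size)
__global__ void pack_kernel(double* buffer, const int* list,
                            const double* var, int len)
{
  int i = threadIdx.x + blockIdx.x * block_size;
  if (i < len) {
    buffer[i] = var[list[i]];  // the HALO_PACK_BODY shape
  }
}

void launch_pack(double* buffer, const int* list, const double* var,
                 int len, cudaStream_t stream)
{
  constexpr int block_size = 256;
  int nblocks = (len + block_size - 1) / block_size;  // ceiling division
  pack_kernel<block_size><<<nblocks, block_size, 0, stream>>>(
      buffer, list, var, len);
}
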
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,42 +18,42 @@ namespace rajaperf { -namespace apps +namespace comm { template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE::runHipVariantImpl(VariantID vid) +void HALO_PACKING::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_HIP ) { @@ -61,34 +61,49 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), - buffer, list, var, len); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (halo_packing_pack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), - buffer, list, var, len); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (halo_packing_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); buffer += len; } } @@ -105,34 +120,47 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - 
Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=] __device__ (Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + res.wait(); } - res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=] __device__ (Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -142,13 +170,13 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_PACKING, Hip) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_HIP diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/comm/HALO_PACKING-OMP.cpp similarity index 55% rename from src/apps/HALOEXCHANGE-OMP.cpp rename to src/comm/HALO_PACKING-OMP.cpp index 050046479..bb760f479 100644 --- a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/comm/HALO_PACKING-OMP.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
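
The OpenMP variant of HALO_PACKING that follows parallelizes each (neighbor, variable) pair's loop with a flat parallel-for; the pack body gathers through the index list and the unpack body scatters back. A minimal sketch of those two loop shapes as hypothetical free functions:

// Gather halo values into a contiguous buffer (the HALO_PACK_BODY shape).
void pack_one(double* buffer, const int* list, const double* var, int len)
{
  #pragma omp parallel for
  for (int i = 0; i < len; i++) {
    buffer[i] = var[list[i]];
  }
}

// Scatter received values back into the variable (the HALO_UNPACK_BODY shape).
void unpack_one(const double* buffer, const int* list, double* var, int len)
{
  #pragma omp parallel for
  for (int i = 0; i < len; i++) {
    var[list[i]] = buffer[i];
  }
}
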
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -14,17 +14,17 @@ namespace rajaperf { -namespace apps +namespace comm { -void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; switch ( vid ) { @@ -34,28 +34,40 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -73,34 +85,46 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); + halo_packing_pack_base_lam(i); } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - 
haloexchange_unpack_base_lam(i); + halo_packing_unpack_base_lam(i); } buffer += len; } @@ -120,33 +144,45 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -158,7 +194,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } default : { - getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown variant id = " << vid << std::endl; } } @@ -168,5 +204,5 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu #endif } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/comm/HALO_PACKING-OMPTarget.cpp similarity index 60% rename from src/apps/HALOEXCHANGE-OMPTarget.cpp rename to src/comm/HALO_PACKING-OMPTarget.cpp index e4f0f561e..d25f4f747 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/comm/HALO_PACKING-OMPTarget.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
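
The OpenMP target variant that follows offloads the same loops; is_device_ptr tells the runtime the pointers are already device-resident so it must not remap them, and device(did) selects the target device. A sketch of that offload shape, assuming the device allocations happen elsewhere:

// Offload one pack loop; buffer/list/var are assumed device-allocated.
void pack_one_target(double* buffer, const int* list, const double* var,
                     int len, int did)
{
  #pragma omp target is_device_ptr(buffer, list, var) device(did)
  #pragma omp teams distribute parallel for schedule(static, 1)
  for (int i = 0; i < len; i++) {
    buffer[i] = var[list[i]];
  }
}
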
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { // @@ -27,11 +27,11 @@ namespace apps const size_t threads_per_team = 256; -void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -39,30 +39,42 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp target is_device_ptr(buffer, list, var) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp target is_device_ptr(buffer, list, var) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -79,33 +91,45 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto 
haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -114,11 +138,11 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown OMP Target variant id = " << vid << std::endl; } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/comm/HALO_PACKING-Seq.cpp similarity index 53% rename from src/apps/HALOEXCHANGE-Seq.cpp rename to src/comm/HALO_PACKING-Seq.cpp index fa9eb591f..066116433 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/comm/HALO_PACKING-Seq.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -14,15 +14,15 @@ namespace rajaperf { -namespace apps +namespace comm { -void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_PACKING_DATA_SETUP; switch ( vid ) { @@ -31,27 +31,40 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -70,32 +83,44 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len 
= pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); + halo_packing_pack_base_lam(i); } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; for (Index_type i = 0; i < len; i++) { - haloexchange_unpack_base_lam(i); + halo_packing_unpack_base_lam(i); } buffer += len; } @@ -115,33 +140,45 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + auto halo_packing_pack_base_lam = [=](Index_type i) { + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + auto halo_packing_unpack_base_lam = [=](Index_type i) { + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } @@ -154,12 +191,12 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown variant id = " << vid << std::endl; } } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp new file mode 100644 index 000000000..f1569d3aa --- /dev/null +++ b/src/comm/HALO_PACKING.cpp @@ -0,0 +1,163 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 
2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_PACKING.hpp" + +#include "RAJA/RAJA.hpp" + +namespace rajaperf +{ +namespace comm +{ + +HALO_PACKING::HALO_PACKING(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_PACKING, params) +{ + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Forall); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +HALO_PACKING::~HALO_PACKING() +{ +} + +void HALO_PACKING::setUp(VariantID vid, size_t tune_idx) +{ + int my_mpi_rank = 0; + const int mpi_dims[3] = {1,1,1}; + setUp_base(my_mpi_rank, mpi_dims, vid, tune_idx); + + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void HALO_PACKING::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; 
++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += calcChecksum(DataSpace::Host, m_send_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_send_buffers[l], buffer_len, vid); + } + } +} + +void HALO_PACKING::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf diff --git a/src/comm/HALO_PACKING.hpp b/src/comm/HALO_PACKING.hpp new file mode 100644 index 000000000..7b4531e74 --- /dev/null +++ b/src/comm/HALO_PACKING.hpp @@ -0,0 +1,118 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HALO_PACKING kernel reference implementation: +/// +/// // pack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = pack_buffers[l]; +/// Int_ptr list = pack_index_lists[l]; +/// Index_type len = pack_index_list_lengths[l]; +/// // pack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// buffer[i] = var[list[i]]; +/// } +/// buffer += len; +/// } +/// } +/// +/// // unpack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = unpack_buffers[l]; +/// Int_ptr list = unpack_index_lists[l]; +/// Index_type len = unpack_index_list_lengths[l]; +/// // unpack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// var[list[i]] = buffer[i]; +/// } +/// buffer += len; +/// } +/// } +/// + +#ifndef RAJAPerf_Comm_HALO_PACKING_HPP +#define RAJAPerf_Comm_HALO_PACKING_HPP + +#define HALO_PACKING_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ + \ + Index_type num_vars = m_num_vars; \ + std::vector vars = m_vars; \ + \ + const DataSpace dataSpace = getDataSpace(vid); \ + \ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \ + \ + std::vector pack_buffers = m_pack_buffers; \ + std::vector unpack_buffers = m_unpack_buffers; \ + \ + std::vector send_buffers = m_send_buffers; \ + std::vector recv_buffers = m_recv_buffers; + + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + +class HALO_PACKING : public HALO_base +{ +public: + + 
HALO_PACKING(const RunParams& params); + + ~HALO_PACKING(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_vars; + + std::vector m_pack_buffers; + std::vector m_unpack_buffers; + + std::vector m_send_buffers; + std::vector m_recv_buffers; +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_PACKING_FUSED-Cuda.cpp similarity index 54% rename from src/apps/HALOEXCHANGE_FUSED-Cuda.cpp rename to src/comm/HALO_PACKING_FUSED-Cuda.cpp index 791742b72..7541a30ef 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALO_PACKING_FUSED-Cuda.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
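
The fused GPU variants that follow collapse every (neighbor, variable) pack loop into a single launch: pointer and length arrays are staged in pinned memory, blockIdx.y selects the fused work item, and the x dimension grid-strides so the block count can be sized for the average list length rather than the longest one. A self-contained sketch of that kernel shape with hypothetical names:

#include <cuda_runtime.h>

// One launch covers all fused work items: blockIdx.y picks the item,
// the x dimension grid-strides over that item's elements.
template <int block_size>
__launch_bounds__(block_size)
__global__ void fused_pack(double* const* buffers, const int* const* lists,
                           const double* const* vars, const int* lens)
{
  int j = blockIdx.y;            // which fused (neighbor, variable) item
  double* buffer = buffers[j];
  const int* list = lists[j];
  const double* var = vars[j];
  int len = lens[j];

  for (int i = threadIdx.x + blockIdx.x * block_size;
       i < len;
       i += block_size * gridDim.x) {  // grid-stride: short lists finish
    buffer[i] = var[list[i]];          // early, long lists keep looping
  }
}

// Launch: x blocks sized for the average length, y = number of work items.
void launch_fused(double* const* buffers, const int* const* lists,
                  const double* const* vars, const int* lens,
                  int num_items, int len_ave, cudaStream_t stream)
{
  constexpr int block_size = 1024;
  dim3 nblocks((len_ave + block_size - 1) / block_size, num_items);
  fused_pack<block_size><<<nblocks, block_size, 0, stream>>>(
      buffers, lists, vars, lens);
}

Sizing the x dimension for the average length keeps short lists from wasting blocks while the grid-stride loop still covers the longest ones.
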
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -18,10 +18,10 @@ namespace rajaperf { -namespace apps +namespace comm { -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace apps allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ @@ -51,8 +51,10 @@ namespace apps template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, - Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) +__global__ void halo_packing_fused_pack(Real_ptr* pack_buffer_ptrs, + Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, + Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -64,14 +66,16 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; } } template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, - Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) +__global__ void halo_packing_fused_unpack(Real_ptr* unpack_buffer_ptrs, + Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, + Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -83,23 +87,23 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) +void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; if ( vid == Base_CUDA ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -110,9 +114,9 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_buffer_ptrs[pack_index] = buffer; @@ -127,18 +131,36 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - haloexchange_fused_pack<<>>( - 
pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (halo_packing_fused_pack), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs ); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); Index_type unpack_index = 0; Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_buffer_ptrs[unpack_index] = buffer; @@ -150,30 +172,57 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) buffer += len; } } - Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / + unpack_index; dim3 unpack_nthreads_per_block(block_size); - dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - haloexchange_fused_unpack<<>>( - unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); - cudaErrchk( cudaGetLastError() ); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, + unpack_index); + RPlaunchCudaKernel( (halo_packing_fused_unpack), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n HALO_PACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename dispatch_helper > +void HALO_PACKING_FUSED::runCudaVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALO_PACKING_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { using AllocatorHolder = RAJAPoolAllocatorHolder; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::cuda_work_async, RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -199,36 +248,40 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto 
using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -199,36 +248,40 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment<Index_type>(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment<Index_type>(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -240,13 +293,102 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Cuda) +void HALO_PACKING_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantDirect<block_size>(vid); + + } + + t += 1; + + } + + }); + + } + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runCudaVariantWorkGroup<block_size, decltype(dispatch_helper)>(vid); + + } + + t += 1; + + }); + + } + + }); + + } + +} + +void HALO_PACKING_FUSED::setCudaTuningDefinitions(VariantID vid) +{ + if (vid == Base_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + } + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); + + } + + }); + + } + +} -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA
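// runCudaVariant and setCudaTuningDefinitions above walk identical seq_for
// nests, so tune_idx enumerates tunings in exactly the order their names are
// registered. A standalone illustration of that enumeration, with hypothetical
// block sizes and dispatch-helper names (the real lists come from RunParams
// and the workgroup dispatch helpers):
#include <cstdio>
#include <string>
#include <vector>
int main() {
  const std::vector<int> block_sizes = {256, 1024};
  const std::vector<std::string> helpers = {"direct", "funcptr"};
  size_t t = 0;
  for (int bs : block_sizes) {        // outer seq_for over block sizes
    for (const auto& h : helpers) {   // inner seq_for over dispatch helpers
      std::printf("tune_idx %zu -> %s_%d\n", t++, h.c_str(), bs);
    }
  }
  return 0;
}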
diff --git a/src/comm/HALO_PACKING_FUSED-Hip.cpp b/src/comm/HALO_PACKING_FUSED-Hip.cpp new file mode 100644 index 000000000..7b4d9b064 --- /dev/null +++ b/src/comm/HALO_PACKING_FUSED-Hip.cpp @@ -0,0 +1,391 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_PACKING_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace comm +{ + +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP \ + Real_ptr* pack_buffer_ptrs; \ + Int_ptr* pack_list_ptrs; \ + Real_ptr* pack_var_ptrs; \ + Index_type* pack_len_ptrs; \ + allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs; \ + Int_ptr* unpack_list_ptrs; \ + Real_ptr* unpack_var_ptrs; \ + Index_type* unpack_len_ptrs; \ + allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); + +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ + deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_packing_fused_pack(Real_ptr* pack_buffer_ptrs, + Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, + Index_type* pack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void halo_packing_fused_unpack(Real_ptr* unpack_buffer_ptrs, + Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, + Index_type* unpack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALO_UNPACK_BODY; + } +} + +
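// The argument tables set up above live in pinned, coarse-grained host memory
// (DataSpace::HipPinnedCoarse) so the host can refill them every repetition
// while the kernels read them directly. Roughly what that data space maps to,
// as a sketch -- the suite's allocData/deallocData wrap this, and the exact
// flag choice here is an assumption:
#include <hip/hip_runtime.h>
int main() {
  double** pack_buffer_ptrs = nullptr;
  // Non-coherent ("coarse-grained") pinned allocation, synchronized at
  // stream/event boundaries rather than per access.
  hipHostMalloc(reinterpret_cast<void**>(&pack_buffer_ptrs),
                26 * 3 * sizeof(double*),   // e.g. num_neighbors * num_vars
                hipHostMallocNonCoherent);
  hipHostFree(pack_buffer_ptrs);
  return 0;
}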
+template < size_t block_size > +void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_PACKING_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + constexpr size_t shmem = 0; + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_buffer_ptrs[pack_index] = buffer; + pack_list_ptrs[pack_index] = list; + pack_var_ptrs[pack_index] = var; + pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + dim3 pack_nthreads_per_block(block_size); + dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); + RPlaunchHipKernel( (halo_packing_fused_pack<block_size>), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs ); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_buffer_ptrs[unpack_index] = buffer; + unpack_list_ptrs[unpack_index] = list; + unpack_var_ptrs[unpack_index] = var; + unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / + unpack_index; + dim3 unpack_nthreads_per_block(block_size); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, + unpack_index); + RPlaunchHipKernel( (halo_packing_fused_unpack<block_size>), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + + } else { + getCout() << "\n HALO_PACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename dispatch_helper > +void HALO_PACKING_FUSED::runHipVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALO_PACKING_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder<RAJA::hip::device_mempool_type>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment<Index_type>; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list<range_segment, Packer>, + camp::list<range_segment, UnPacker>>; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async<block_size>, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + dispatch_policy >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; +
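// The pool objects constructed next follow a reserve/enqueue/instantiate/run
// lifecycle: many small loops are recorded, then replayed as one fused launch.
// A toy, RAJA-free model of that flow (std::function stands in for the pool's
// stored loop bodies; the real pool stores them contiguously and runs them in
// a single kernel):
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>
int main() {
  std::vector<std::pair<int, std::function<void(int)>>> pool;  // (len, body)
  double buf[4] = {0}, var[4] = {10, 11, 12, 13};
  int list[4] = {3, 2, 1, 0};
  pool.emplace_back(4, [&](int i) { buf[i] = var[list[i]]; }); // cf. enqueue
  for (auto& [len, body] : pool) {       // cf. instantiate() + run()
    for (int i = 0; i < len; ++i) { body(i); }
  }
  pool.clear();                          // cf. the pool emptied by instantiate()
  std::printf("buf[0] = %.0f\n", buf[0]);  // gathers var[list[0]] = 13
  return 0;
}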
+ workpool pool_pack (allocatorHolder.template getAllocator<char>()); + workpool pool_unpack(allocatorHolder.template getAllocator<char>()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + res.wait(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_PACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +void HALO_PACKING_FUSED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantDirect<block_size>(vid); + + } + + t += 1; + + } + + }); + + } + + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runHipVariantWorkGroup<block_size, decltype(dispatch_helper)>(vid); + + } + + t += 1; + + }); + + } + + }); + + } +} + +void HALO_PACKING_FUSED::setHipTuningDefinitions(VariantID vid) +{ + if (vid == Base_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + } + + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); + + } + + }); + + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_PACKING_FUSED-OMP.cpp similarity index 64% rename from src/apps/HALOEXCHANGE_FUSED-OMP.cpp rename to src/comm/HALO_PACKING_FUSED-OMP.cpp index 6f228a8f6..143a65501 100644 ---
a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMP.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -14,23 +14,23 @@ namespace rajaperf { -namespace apps +namespace comm { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runOpenMPVariantDirect(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; switch ( vid ) { case Base_OpenMP : { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -38,9 +38,9 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; @@ -61,7 +61,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; } } } @@ -73,17 +73,31 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; } } #endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; @@ -104,7 +118,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } } @@ -116,7 +130,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - 
HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } #endif @@ -124,14 +138,14 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN; break; } case Lambda_OpenMP : { - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -139,9 +153,9 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); @@ -174,13 +188,27 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } } #endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); @@ -217,11 +245,33 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } + default : { + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +template < typename dispatch_helper > +void HALO_PACKING_FUSED::runOpenMPVariantWorkGroup(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_PACKING_FUSED_DATA_SETUP; + + switch ( vid ) { + case RAJA_OpenMP : { using AllocatorHolder = RAJAPoolAllocatorHolder< @@ -230,10 +280,17 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_work, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -259,35 +316,39 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = 
pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -301,7 +362,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; } } @@ -311,5 +372,57 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ #endif } -} // end namespace apps +void HALO_PACKING_FUSED::runOpenMPVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { + + if (tune_idx == t) { + + runOpenMPVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_OpenMP) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runOpenMPVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_PACKING_FUSED::setOpenMPTuningDefinitions(VariantID vid) +{ + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { + + addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_OpenMP) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + + } +} + +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp similarity index 68% rename from src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp rename to src/comm/HALO_PACKING_FUSED-OMPTarget.cpp index 4dd2dad31..ab0b075b4 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
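// Throughout these variants the packing target depends on the MPI data space:
// when it is DataSpace::Copy, the pack/unpack buffers and the send/recv
// buffers handed to MPI are distinct allocations and copyData stages between
// them; otherwise they alias. A toy model of that aliasing decision (memcpy
// standing in for the suite's copyData):
#include <cstdio>
#include <cstring>
int main() {
  const bool separate_buffers = true;   // cf. getMPIDataSpace(vid) == DataSpace::Copy
  double pack_buffer[4] = {1, 2, 3, 4}; // filled by the pack kernel
  double staging[4] = {0};
  double* send_buffer = separate_buffers ? staging : pack_buffer;
  if (separate_buffers) {
    std::memcpy(send_buffer, pack_buffer, sizeof(pack_buffer)); // cf. copyData
  }
  std::printf("send_buffer[3] = %.0f\n", send_buffer[3]);
  return 0;
}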
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { // @@ -26,7 +26,7 @@ namespace apps // //const size_t threads_per_team = 256; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ void** pack_ptrs; \ allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ Real_ptr* pack_buffer_ptrs = reinterpret_cast<Real_ptr*>(pack_ptrs) + 0 * num_neighbors * num_vars; \ @@ -50,28 +50,28 @@ namespace apps Real_ptr* h_unpack_var_ptrs = reinterpret_cast<Real_ptr*>(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ Index_type* h_unpack_len_ptrs = reinterpret_cast<Index_type*>(h_unpack_ptrs) + 3 * num_neighbors * num_vars; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ deallocData(DataSpace::OmpTarget, pack_ptrs); \ delete[] h_pack_ptrs; \ deallocData(DataSpace::OmpTarget, unpack_ptrs); \ delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runOpenMPTargetVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -80,7 +80,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -94,7 +94,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U buffer += len; } } - HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -107,18 +107,32 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = pack_len_ptrs[j]; for (Index_type i = ii; i < len; i += pack_len_ave) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; } } } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; +
Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; h_unpack_buffer_ptrs[unpack_index] = buffer; @@ -130,7 +144,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U buffer += len; } } - HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -143,7 +157,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = unpack_len_ptrs[j]; for (Index_type i = ii; i < len; i += unpack_len_ave) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } } @@ -151,9 +165,21 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n HALO_PACKING_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +template < typename dispatch_helper > +void HALO_PACKING_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) +{ + const Index_type run_reps = getRunReps(); - } else if ( vid == RAJA_OpenMPTarget ) { + HALO_PACKING_FUSED_DATA_SETUP; + + if ( vid == RAJA_OpenMPTarget ) { using AllocatorHolder = RAJAPoolAllocatorHolder< RAJA::basic_mempool::MemPool>; @@ -161,10 +187,17 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_target_work /**/, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -190,35 +223,39 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = 
unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -229,11 +266,63 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +void HALO_PACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_OpenMPTarget) { + + if (tune_idx == t) { + + runOpenMPTargetVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_OpenMPTarget) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runOpenMPTargetVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_PACKING_FUSED::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + if (vid == Base_OpenMPTarget) { + + addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_OpenMPTarget) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_PACKING_FUSED-Seq.cpp similarity index 58% rename from src/apps/HALOEXCHANGE_FUSED-Seq.cpp rename to src/comm/HALO_PACKING_FUSED-Seq.cpp index e6aa5fdbe..f7c16e253 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALO_PACKING_FUSED-Seq.cpp @@ -1,12 +1,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
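// The Base_OpenMPTarget fuser above pairs a collapse(2) loop over (fused loop
// j, chunk offset ii) with an inner stride of the average length -- the same
// shape the GPU kernels get from their 2D grids. A host-only sketch of that
// iteration pattern (same arithmetic, no offload needed to follow it):
#include <cstdio>
int main() {
  const int lens[] = {5, 13};          // two fused loops
  const int num_fused = 2;
  const int len_ave = (5 + 13 + num_fused - 1) / num_fused;   // ceiling mean: 9
  for (int j = 0; j < num_fused; ++j) {       // collapse(2), dimension 1
    for (int ii = 0; ii < len_ave; ++ii) {    // collapse(2), dimension 2
      for (int i = ii; i < lens[j]; i += len_ave) {
        std::printf("fused loop %d: index %d (chunk %d)\n", j, i, ii);
      }
    }
  }
  return 0;
}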
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -14,21 +14,21 @@ namespace rajaperf { -namespace apps +namespace comm { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runSeqVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; switch ( vid ) { case Base_Seq : { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -36,9 +36,9 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; @@ -53,16 +53,30 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALO_PACK_BODY; + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); } } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; @@ -77,14 +91,14 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALO_UNPACK_BODY; } } } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN; break; } @@ -92,7 +106,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -100,9 +114,9 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); @@ -118,13 +132,27 @@ void 
HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG pack_lambda(i); } } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); @@ -144,23 +172,49 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +template < typename dispatch_helper > +void HALO_PACKING_FUSED::runSeqVariantWorkGroup(VariantID vid) +{ + switch ( vid ) { case RAJA_Seq : { +#if defined(RUN_RAJA_SEQ) + const Index_type run_reps = getRunReps(); + + HALO_PACKING_FUSED_DATA_SETUP; + using AllocatorHolder = RAJAPoolAllocatorHolder< RAJA::basic_mempool::MemPool>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::seq_work, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -186,35 +240,39 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 
0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -223,18 +281,70 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); +#endif // RUN_RAJA_SEQ break; } -#endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; } } } -} // end namespace apps +void HALO_PACKING_FUSED::runSeqVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (vid == Base_Seq || vid == Lambda_Seq) { + + if (tune_idx == t) { + + runSeqVariantDirect(vid); + + } + + t += 1; + + } + + if (vid == RAJA_Seq) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + if (tune_idx == t) { + + runSeqVariantWorkGroup(vid); + + } + + t += 1; + + }); + + } +} + +void HALO_PACKING_FUSED::setSeqTuningDefinitions(VariantID vid) +{ + if (vid == Base_Seq || vid == Lambda_Seq) { + + addVariantTuningName(vid, "direct"); + + } + + if (vid == RAJA_Seq) { + + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); + + }); + + } +} + +} // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp new file mode 100644 index 000000000..93d29dfbc --- /dev/null +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -0,0 +1,163 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
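// The manual "lambda fuser" used by the Seq and OpenMP variants stores the
// pack/unpack lambdas by value: lambdas are not default-constructible, so the
// setup macros malloc raw storage and placement-new each lambda into a slot.
// A minimal, self-contained sketch of that idiom (make_lambda is hypothetical;
// the real macros appear earlier in this diff):
#include <cstdio>
#include <cstdlib>
#include <new>
int main() {
  auto make_lambda = [](double* v) { return [=](int i) { v[i] += 1.0; }; };
  using lambda_t = decltype(make_lambda(nullptr));
  const int n = 3;
  lambda_t* slots = static_cast<lambda_t*>(std::malloc(sizeof(lambda_t) * n));
  double data[3] = {0, 0, 0};
  for (int j = 0; j < n; ++j) {
    new (&slots[j]) lambda_t(make_lambda(data));    // placement new into slot j
  }
  for (int j = 0; j < n; ++j) { slots[j](j); }      // run the fused batch
  for (int j = 0; j < n; ++j) { slots[j].~lambda_t(); }
  std::free(slots);
  std::printf("%.0f %.0f %.0f\n", data[0], data[1], data[2]);  // 1 1 1
  return 0;
}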
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_PACKING_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +namespace rajaperf +{ +namespace comm +{ + +HALO_PACKING_FUSED::HALO_PACKING_FUSED(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_PACKING_FUSED, params) +{ + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 ); + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Workgroup); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +HALO_PACKING_FUSED::~HALO_PACKING_FUSED() +{ +} + +void HALO_PACKING_FUSED::setUp(VariantID vid, size_t tune_idx) +{ + int my_mpi_rank = 0; + const int mpi_dims[3] = {1,1,1}; + setUp_base(my_mpi_rank, mpi_dims, vid, tune_idx); + + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void HALO_PACKING_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += 
calcChecksum(DataSpace::Host, m_send_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_send_buffers[l], buffer_len, vid); + } + } +} + +void HALO_PACKING_FUSED::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp similarity index 50% rename from src/apps/HALOEXCHANGE_FUSED.hpp rename to src/comm/HALO_PACKING_FUSED.hpp index b0af7e60e..065c0be3a 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALO_PACKING_FUSED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -7,56 +7,59 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOEXCHANGE_FUSED kernel reference implementation: +/// HALO_PACKING_FUSED kernel reference implementation: /// -/// // pack message for each neighbor +/// // pack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; +/// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_FUSED_PACK_BODY; +/// buffer[i] = var[list[i]]; /// } /// buffer += len; /// } -/// // send message to neighbor /// } /// -/// // unpack messages for each neighbor +/// // unpack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; +/// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_FUSED_UNPACK_BODY; +/// var[list[i]] = buffer[i]; /// } /// buffer += len; /// } /// } /// -#ifndef RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP -#define RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP +#ifndef RAJAPerf_Comm_HALO_PACKING_FUSED_HPP +#define RAJAPerf_Comm_HALO_PACKING_FUSED_HPP -#define HALOEXCHANGE_FUSED_DATA_SETUP \ - std::vector vars = m_vars; \ - std::vector 
<Real_ptr> buffers = m_buffers; \ -\ - Index_type num_neighbors = s_num_neighbors; \ +#define HALO_PACKING_FUSED_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ + \ Index_type num_vars = m_num_vars; \ - std::vector<Int_ptr> pack_index_lists = m_pack_index_lists; \ - std::vector<Index_type> pack_index_list_lengths = m_pack_index_list_lengths; \ - std::vector<Int_ptr> unpack_index_lists = m_unpack_index_lists; \ - std::vector<Index_type> unpack_index_list_lengths = m_unpack_index_list_lengths; - -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ + std::vector<Real_ptr> vars = m_vars; \ + \ + const DataSpace dataSpace = getDataSpace(vid); \ + \ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \ + \ + std::vector<Real_ptr> pack_buffers = m_pack_buffers; \ + std::vector<Real_ptr> unpack_buffers = m_unpack_buffers; \ + \ + std::vector<Real_ptr> send_buffers = m_send_buffers; \ + std::vector<Real_ptr> recv_buffers = m_recv_buffers; + +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ Real_ptr buffer; \ Int_ptr list; \ @@ -67,23 +70,17 @@ ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN \ delete[] pack_ptr_holders; \ delete[] pack_lens; \ delete[] unpack_ptr_holders; \ delete[] unpack_lens; -#define HALOEXCHANGE_FUSED_PACK_BODY \ - buffer[i] = var[list[i]]; -#define HALOEXCHANGE_FUSED_UNPACK_BODY \ - var[list[i]] = buffer[i]; - - -#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ +#define HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_FUSED_PACK_BODY; \ + HALO_PACK_BODY; \ }; \ }; \ using pack_lambda_type = decltype(make_pack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -92,7 +89,7 @@ Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \ auto make_unpack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_FUSED_UNPACK_BODY; \ + HALO_UNPACK_BODY; \ }; \ }; \ using unpack_lambda_type = decltype(make_unpack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -100,14 +97,14 @@ malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ +#define HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ free(pack_lambdas); \ delete[] pack_lens; \ free(unpack_lambdas); \ delete[] unpack_lens; -#include "common/KernelBase.hpp" +#include "HALO_base.hpp" #include "RAJA/RAJA.hpp" @@ -117,16 +114,16 @@ namespace rajaperf { class RunParams; -namespace apps +namespace comm { -class HALOEXCHANGE_FUSED : public KernelBase +class HALO_PACKING_FUSED : public HALO_base { public: - HALOEXCHANGE_FUSED(const RunParams& params); + HALO_PACKING_FUSED(const RunParams& params); - ~HALOEXCHANGE_FUSED(); + ~HALO_PACKING_FUSED(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); @@ -138,58 +135,48 @@ class HALOEXCHANGE_FUSED : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + void runSeqVariantDirect(VariantID vid);
+ void runOpenMPVariantDirect(VariantID vid); + void runOpenMPTargetVariantDirect(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantDirect(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runHipVariantDirect(VariantID vid); + + template < typename dispatch_helper > + void runSeqVariantWorkGroup(VariantID vid); + template < typename dispatch_helper > + void runOpenMPVariantWorkGroup(VariantID vid); + template < typename dispatch_helper > + void runOpenMPTargetVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_helper > + void runCudaVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_helper > + void runHipVariantWorkGroup(VariantID vid); private: static const size_t default_gpu_block_size = 1024; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; - static const int s_num_neighbors = 26; - - Index_type m_grid_dims[3]; - Index_type m_halo_width; Index_type m_num_vars; - - Index_type m_grid_dims_default[3]; - Index_type m_halo_width_default; - Index_type m_num_vars_default; - - Index_type m_grid_plus_halo_dims[3]; Index_type m_var_size; - Index_type m_var_halo_size; std::vector<Real_ptr> m_vars; - std::vector<Real_ptr> m_buffers; - - std::vector<Int_ptr> m_pack_index_lists; - std::vector<Index_type> m_pack_index_list_lengths; - std::vector<Int_ptr> m_unpack_index_lists; - std::vector<Index_type> m_unpack_index_list_lengths; - - void create_pack_lists(std::vector<Int_ptr>& pack_index_lists, - std::vector<Index_type>& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_pack_lists(std::vector<Int_ptr>& pack_index_lists, - const Index_type num_neighbors, - VariantID vid); - void create_unpack_lists(std::vector<Int_ptr>& unpack_index_lists, - std::vector<Index_type>& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_unpack_lists(std::vector<Int_ptr>& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid); + + std::vector<Real_ptr> m_pack_buffers; + std::vector<Real_ptr> m_unpack_buffers; + + std::vector<Real_ptr> m_send_buffers; + std::vector<Real_ptr> m_recv_buffers; }; -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // closing endif for header file include guard diff --git a/src/comm/HALO_SENDRECV-Cuda.cpp b/src/comm/HALO_SENDRECV-Cuda.cpp new file mode 100644 index 000000000..6d8d1bf56 --- /dev/null +++ b/src/comm/HALO_SENDRECV-Cuda.cpp @@ -0,0 +1,63 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_SENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_SENDRECV : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/comm/HALO_SENDRECV-Hip.cpp b/src/comm/HALO_SENDRECV-Hip.cpp new file mode 100644 index 000000000..7db6baf83 --- /dev/null +++ b/src/comm/HALO_SENDRECV-Hip.cpp @@ -0,0 +1,63 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_SENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_SENDRECV : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/comm/HALO_SENDRECV-OMP.cpp b/src/comm/HALO_SENDRECV-OMP.cpp new file mode 100644 index 000000000..347756d81 --- /dev/null +++ b/src/comm/HALO_SENDRECV-OMP.cpp @@ -0,0 +1,74 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_SENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALO_SENDRECV : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_SENDRECV-OMPTarget.cpp b/src/comm/HALO_SENDRECV-OMPTarget.cpp new file mode 100644 index 000000000..42f62289f --- /dev/null +++ b/src/comm/HALO_SENDRECV-OMPTarget.cpp @@ -0,0 +1,68 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void HALO_SENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n HALO_SENDRECV : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/comm/HALO_SENDRECV-Seq.cpp b/src/comm/HALO_SENDRECV-Seq.cpp new file mode 100644 index 000000000..ab64c9415 --- /dev/null +++ b/src/comm/HALO_SENDRECV-Seq.cpp @@ -0,0 +1,69 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void HALO_SENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + HALO_SENDRECV_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALO_SENDRECV : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_SENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp new file mode 100644 index 000000000..0c57b2c3a --- /dev/null +++ b/src/comm/HALO_SENDRECV.cpp @@ -0,0 +1,128 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_SENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +namespace rajaperf +{ +namespace comm +{ + +HALO_SENDRECV::HALO_SENDRECV(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_SENDRECV, params) +{ + m_mpi_size = params.getMPISize(); + m_my_mpi_rank = params.getMPIRank(); + m_mpi_dims = params.getMPI3DDivision(); + + setDefaultReps(200); + + m_num_vars = params.getHaloNumVars(); + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 0 ); + setBytesReadPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // send + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // recv + setBytesAtomicModifyWrittenPerRep( 0 ); + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(MPI); + + if (params.validMPI3DDivision()) { + setVariantDefined( Base_Seq ); + + setVariantDefined( Base_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + + setVariantDefined( Base_HIP ); + } +} + +HALO_SENDRECV::~HALO_SENDRECV() +{ +} + +void HALO_SENDRECV::setUp(VariantID vid, size_t tune_idx) +{ + setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_send_buffers[l], buffer_len); + } + } + + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_recv_buffers[l], buffer_len); + } + } +} + +void HALO_SENDRECV::updateChecksum(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += calcChecksum(DataSpace::Host, m_recv_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_recv_buffers[l], buffer_len, vid); + } + } +} + +void HALO_SENDRECV::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_recv_buffers[l]); + } + } + m_recv_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_send_buffers[l]); + } + } + m_send_buffers.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/HALO_SENDRECV.hpp b/src/comm/HALO_SENDRECV.hpp new file mode 100644 index 000000000..da2a1d1cc --- /dev/null +++ b/src/comm/HALO_SENDRECV.hpp @@ 
-0,0 +1,124 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HALO_SENDRECV kernel reference implementation: +/// +/// // post a recv for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Index_type len = unpack_index_list_lengths[l]; +/// MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); +/// } +/// +/// // pack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = pack_buffers[l]; +/// Int_ptr list = pack_index_lists[l]; +/// Index_type len = pack_index_list_lengths[l]; +/// // pack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// buffer[i] = var[list[i]]; +/// } +/// buffer += len; +/// } +/// // send buffer to neighbor +/// MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); +/// } +/// +/// // unpack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// // receive buffer from neighbor +/// MPI_Wait(&unpack_mpi_requests[l], MPI_STATUS_IGNORE); +/// Real_ptr buffer = unpack_buffers[l]; +/// Int_ptr list = unpack_index_lists[l]; +/// Index_type len = unpack_index_list_lengths[l]; +/// // unpack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// var[list[i]] = buffer[i]; +/// } +/// buffer += len; +/// } +/// } +/// +/// // wait for all sends to complete +/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); +/// + + +#ifndef RAJAPerf_Comm_HALO_SENDRECV_HPP +#define RAJAPerf_Comm_HALO_SENDRECV_HPP + +#define HALO_SENDRECV_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ + \ + Index_type num_vars = m_num_vars; \ + \ + std::vector mpi_ranks = m_mpi_ranks; \ + \ + std::vector pack_mpi_requests(num_neighbors); \ + std::vector unpack_mpi_requests(num_neighbors); \ + \ + std::vector send_buffers = m_send_buffers; \ + std::vector recv_buffers = m_recv_buffers; + + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include +#include + +namespace rajaperf +{ +namespace comm +{ + +class HALO_SENDRECV : public HALO_base +{ +public: + + HALO_SENDRECV(const RunParams& params); + + ~HALO_SENDRECV(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + +private: + int m_mpi_size = -1; + int m_my_mpi_rank = -1; + std::array m_mpi_dims = {-1, -1, -1}; + + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_send_buffers; + std::vector m_recv_buffers; +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif +#endif // closing endif for header file 
include guard diff --git a/src/comm/HALO_base.cpp b/src/comm/HALO_base.cpp new file mode 100644 index 000000000..84845114c --- /dev/null +++ b/src/comm/HALO_base.cpp @@ -0,0 +1,311 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include +#include + +namespace rajaperf +{ +namespace comm +{ + +Index_type HALO_base::s_grid_dims_default[3] {100, 100, 100}; + +HALO_base::HALO_base(KernelID kid, const RunParams& params) + : KernelBase(kid, params) +{ + setDefaultProblemSize( s_grid_dims_default[0] * + s_grid_dims_default[1] * + s_grid_dims_default[2] ); + + double cbrt_run_size = std::cbrt(getTargetProblemSize()) + std::cbrt(3)-1; + + m_grid_dims[0] = cbrt_run_size; + m_grid_dims[1] = cbrt_run_size; + m_grid_dims[2] = cbrt_run_size; + m_halo_width = params.getHaloWidth(); + + m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; + m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; + m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; + m_grid_plus_halo_size = m_grid_plus_halo_dims[0] * + m_grid_plus_halo_dims[1] * + m_grid_plus_halo_dims[2] ; + + setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); +} + +HALO_base::~HALO_base() +{ +} + +void HALO_base::setUp_base(const int my_mpi_rank, const int* mpi_dims, + VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + m_mpi_ranks.resize(s_num_neighbors, -1); + m_send_tags.resize(s_num_neighbors, -1); + m_pack_index_lists.resize(s_num_neighbors, nullptr); + m_pack_index_list_lengths.resize(s_num_neighbors, 0); + m_recv_tags.resize(s_num_neighbors, -1); + m_unpack_index_lists.resize(s_num_neighbors, nullptr); + m_unpack_index_list_lengths.resize(s_num_neighbors, 0); + create_lists(my_mpi_rank, mpi_dims, m_mpi_ranks, + m_send_tags, m_pack_index_lists, m_pack_index_list_lengths, + m_recv_tags, m_unpack_index_lists, m_unpack_index_list_lengths, + m_halo_width, m_grid_dims, + s_num_neighbors, vid); +} + +void HALO_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + destroy_lists(m_pack_index_lists, m_unpack_index_lists, s_num_neighbors, vid); + m_unpack_index_list_lengths.clear(); + m_unpack_index_lists.clear(); + m_recv_tags.clear(); + m_pack_index_list_lengths.clear(); + m_pack_index_lists.clear(); + m_send_tags.clear(); + m_mpi_ranks.clear(); +} + + +const int HALO_base::s_boundary_offsets[HALO_base::s_num_neighbors][3]{ + + // faces + {-1, 0, 0}, + { 1, 0, 0}, + { 0, -1, 0}, + { 0, 1, 0}, + { 0, 0, -1}, + { 0, 0, 1}, + + // edges + {-1, -1, 0}, + {-1, 1, 0}, + { 1, -1, 0}, + { 1, 1, 0}, + {-1, 0, -1}, + {-1, 0, 1}, + { 1, 0, -1}, + { 1, 0, 1}, + { 0, -1, -1}, + { 0, -1, 1}, + { 0, 1, -1}, + { 0, 1, 1}, + + // corners + {-1, -1, -1}, + {-1, -1, 1}, + {-1, 1, -1}, + {-1, 1, 1}, + { 1, -1, -1}, + { 1, -1, 1}, + { 1, 1, -1}, + { 1, 1, 1} + +}; + +HALO_base::Extent HALO_base::make_boundary_extent( + const HALO_base::message_type msg_type, + const int (&boundary_offset)[3], + const Index_type halo_width, const Index_type* grid_dims) +{ + if (msg_type != message_type::send && + msg_type != message_type::recv) { + throw std::runtime_error("make_boundary_extent: Invalid message type"); + } + 
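+  // Per-dimension bounds follow the neighbor offset: offset 0 spans the
+  // interior [halo_width, halo_width + dim_size); for offsets -1/+1 a send
+  // extent covers the outermost halo_width layer of owned cells, while a
+  // recv extent covers the ghost layer just outside it. For example, with
+  // halo_width = 2 and offset -1: send = [2, 4), recv = [0, 2).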
auto get_bounds = [&](int offset, Index_type dim_size) { + std::pair bounds; + switch (offset) { + case -1: + if (msg_type == message_type::send) { + bounds.first = halo_width; + bounds.second = halo_width + halo_width; + } else { // (msg_type == message_type::recv) + bounds.first = 0; + bounds.second = halo_width; + } + break; + case 0: + bounds.first = halo_width; + bounds.second = halo_width + dim_size; + break; + case 1: + if (msg_type == message_type::send) { + bounds.first = halo_width + dim_size - halo_width; + bounds.second = halo_width + dim_size; + } else { // (msg_type == message_type::recv) + bounds.first = halo_width + dim_size; + bounds.second = halo_width + dim_size + halo_width; + } + break; + default: + throw std::runtime_error("make_extent: Invalid location"); + } + return bounds; + }; + auto x_bounds = get_bounds(boundary_offset[0], grid_dims[0]); + auto y_bounds = get_bounds(boundary_offset[1], grid_dims[1]); + auto z_bounds = get_bounds(boundary_offset[2], grid_dims[2]); + return {x_bounds.first, x_bounds.second, + y_bounds.first, y_bounds.second, + z_bounds.first, z_bounds.second}; +} + + +// +// Function to generate mpi decomposition and index lists for packing and unpacking. +// +void HALO_base::create_lists( + int my_mpi_rank, + const int* mpi_dims, + std::vector& mpi_ranks, + std::vector& send_tags, + std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + std::vector& recv_tags, + std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid) +{ + int my_mpi_idx[3]{-1,-1,-1}; + my_mpi_idx[2] = my_mpi_rank / (mpi_dims[0]*mpi_dims[1]); + my_mpi_idx[1] = (my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1])) / mpi_dims[0]; + my_mpi_idx[0] = my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1]) - my_mpi_idx[1]*mpi_dims[0]; + + auto get_boundary_idx = [&](const int (&boundary_offset)[3]) { + return (boundary_offset[0]+1) + 3*(boundary_offset[1]+1) + 9*(boundary_offset[2]+1); + }; + + std::map boundary_idx_to_tag; + for (Index_type l = 0; l < num_neighbors; ++l) { + boundary_idx_to_tag[get_boundary_idx(s_boundary_offsets[l])] = l; + } + + const Index_type grid_i_stride = 1; + const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; + const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + + for (Index_type l = 0; l < num_neighbors; ++l) { + + const int (&boundary_offset)[3] = s_boundary_offsets[l]; + + int neighbor_boundary_offset[3]{-1, -1, -1}; + for (int dim = 0; dim < 3; ++dim) { + neighbor_boundary_offset[dim] = -boundary_offset[dim]; + } + + int neighbor_mpi_idx[3] = {my_mpi_idx[0]+boundary_offset[0], + my_mpi_idx[1]+boundary_offset[1], + my_mpi_idx[2]+boundary_offset[2]}; + + // fix neighbor mpi index on periodic boundaries + for (int dim = 0; dim < 3; ++dim) { + if (neighbor_mpi_idx[dim] >= mpi_dims[dim]) { + neighbor_mpi_idx[dim] = 0; + } else if (neighbor_mpi_idx[dim] < 0) { + neighbor_mpi_idx[dim] = mpi_dims[dim]-1; + } + } + + mpi_ranks[l] = neighbor_mpi_idx[0] + mpi_dims[0]*(neighbor_mpi_idx[1] + mpi_dims[1]*neighbor_mpi_idx[2]); + + { + // pack and send + send_tags[l] = boundary_idx_to_tag[get_boundary_idx(boundary_offset)]; + Extent extent = make_boundary_extent(message_type::send, + boundary_offset, + halo_width, grid_dims); + + pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + 
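+      // The list length is the volume of the boundary extent: with halo
+      // width w on a g x g x g grid, a face neighbor packs w*g*g indices,
+      // an edge neighbor w*w*g, and a corner neighbor w*w*w.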
allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); + + Int_ptr pack_list = pack_index_lists[l]; + + Index_type list_idx = 0; + for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { + for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { + for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { + + Index_type pack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + pack_list[list_idx] = pack_idx; + + list_idx += 1; + } + } + } + } + + { + // receive and unpack + recv_tags[l] = boundary_idx_to_tag[get_boundary_idx(neighbor_boundary_offset)]; + Extent extent = make_boundary_extent(message_type::recv, + boundary_offset, + halo_width, grid_dims); + + unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + + Int_ptr unpack_list = unpack_index_lists[l]; + + Index_type list_idx = 0; + for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { + for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { + for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { + + Index_type unpack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + unpack_list[list_idx] = unpack_idx; + + list_idx += 1; + } + } + } + } + } +} + +// +// Function to destroy packing and unpacking index lists. +// +void HALO_base::destroy_lists( + std::vector& pack_index_lists, + std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid) +{ + for (Index_type l = 0; l < num_neighbors; ++l) { + deallocData(pack_index_lists[l], vid); + } + for (Index_type l = 0; l < num_neighbors; ++l) { + deallocData(unpack_index_lists[l], vid); + } +} + +} // end namespace comm +} // end namespace rajaperf diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp new file mode 100644 index 000000000..fea021a87 --- /dev/null +++ b/src/comm/HALO_base.hpp @@ -0,0 +1,176 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HALO_base provides a common starting point for the other HALO_ classes. +/// + +#ifndef RAJAPerf_Comm_HALO_BASE_HPP +#define RAJAPerf_Comm_HALO_BASE_HPP + +#define HALO_BASE_DATA_SETUP \ + Index_type num_neighbors = s_num_neighbors; \ + std::vector send_tags = m_send_tags; \ + std::vector pack_index_lists = m_pack_index_lists; \ + std::vector pack_index_list_lengths = m_pack_index_list_lengths; \ + std::vector recv_tags = m_recv_tags; \ + std::vector unpack_index_lists = m_unpack_index_lists; \ + std::vector unpack_index_list_lengths = m_unpack_index_list_lengths; + +#define HALO_PACK_BODY \ + buffer[i] = var[list[i]]; + +#define HALO_UNPACK_BODY \ + var[list[i]] = buffer[i]; + + +#include "common/KernelBase.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +class RunParams; + +struct direct_dispatch_helper +{ + template < typename... 
Ts > + using dispatch_policy = RAJA::direct_dispatch; + static std::string get_name() { return "direct"; } +}; + +struct indirect_function_call_dispatch_helper +{ + template < typename... Ts > + using dispatch_policy = RAJA::indirect_function_call_dispatch; + static std::string get_name() { return "funcptr"; } +}; + +struct indirect_virtual_function_dispatch_helper +{ + template < typename... Ts > + using dispatch_policy = RAJA::indirect_virtual_function_dispatch; + static std::string get_name() { return "virtfunc"; } +}; + +using workgroup_dispatch_helpers = camp::list< + direct_dispatch_helper, + indirect_function_call_dispatch_helper, + indirect_virtual_function_dispatch_helper >; + +using hip_workgroup_dispatch_helpers = camp::list< + direct_dispatch_helper +#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL + ,indirect_function_call_dispatch_helper + ,indirect_virtual_function_dispatch_helper +#endif + >; + +namespace comm +{ + +class HALO_base : public KernelBase +{ +public: + + HALO_base(KernelID kid, const RunParams& params); + + ~HALO_base(); + + void setUp_base(const int my_mpi_rank, const int* mpi_dims, + VariantID vid, size_t tune_idx); + void tearDown_base(VariantID vid, size_t tune_idx); + + struct Packer { + Real_ptr buffer; + Real_ptr var; + Int_ptr list; + RAJA_HOST_DEVICE void operator()(Index_type i) const { + HALO_PACK_BODY; + } + }; + + struct UnPacker { + Real_ptr buffer; + Real_ptr var; + Int_ptr list; + RAJA_HOST_DEVICE void operator()(Index_type i) const { + HALO_UNPACK_BODY; + } + }; + +protected: + enum struct message_type : int + { + send, + recv + }; + + struct Extent + { + Index_type i_min; + Index_type i_max; + Index_type j_min; + Index_type j_max; + Index_type k_min; + Index_type k_max; + }; + + static const int s_num_neighbors = 26; + static const int s_boundary_offsets[s_num_neighbors][3]; + + static Index_type s_grid_dims_default[3]; + + Index_type m_grid_dims[3]; + Index_type m_halo_width; + + Index_type m_grid_plus_halo_dims[3]; + Index_type m_grid_plus_halo_size; + + std::vector m_mpi_ranks; + + std::vector m_send_tags; + std::vector m_pack_index_lists; + std::vector m_pack_index_list_lengths; + + std::vector m_recv_tags; + std::vector m_unpack_index_lists; + std::vector m_unpack_index_list_lengths; + + Extent make_boundary_extent( + const message_type msg_type, + const int (&boundary_offset)[3], + const Index_type halo_width, const Index_type* grid_dims); + + void create_lists( + int my_mpi_rank, + const int* mpi_dims, + std::vector& mpi_ranks, + std::vector& send_tags, + std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + std::vector& recv_tags, + std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid); + + void destroy_lists( + std::vector& pack_index_lists, + std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid); +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 9dff522bd..f14076398 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. 
# See the RAJAPerf/LICENSE file for details. # diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 0c3504d69..dea54acf2 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -42,6 +42,33 @@ __device__ __forceinline__ unsigned long long device_timer() return global_timer; } +/*! + * \brief Method for launching a CUDA kernel with given configuration. + * + * Note: method checks whether number of args and their types in + * kernel signature matches args passed to this method. + */ +template +void RPlaunchCudaKernel(void (*kernel)(KernArgs...), + const dim3& numBlocks, const dim3& dimBlocks, + std::uint32_t sharedMemBytes, cudaStream_t stream, + Args const&... args) +{ + static_assert(sizeof...(KernArgs) == sizeof...(Args), + "Number of kernel args doesn't match what's passed to method"); + + static_assert(conjunction, std::decay_t>...>::value, + "Kernel arg types don't match what's passed to method"); + + constexpr size_t count = sizeof...(Args); + void* arg_arr[count]{(void*)&args...}; + + auto k = reinterpret_cast(kernel); + cudaErrchk( cudaLaunchKernel(k, numBlocks, dimBlocks, + arg_arr, + sharedMemBytes, stream) ); +} + /*! * \brief Simple forall cuda kernel that runs a lambda. */ @@ -84,19 +111,56 @@ __global__ void lambda_cuda(Lambda body) namespace detail { +/*! + * \brief Get current cuda device. + */ +inline int getCudaDevice() +{ + int device = -1; + cudaErrchk( cudaGetDevice( &device ) ); + return device; +} + +/*! + * \brief Get properties of the current cuda device. + */ +inline cudaDeviceProp getCudaDeviceProp() +{ + cudaDeviceProp prop; + cudaErrchk(cudaGetDeviceProperties(&prop, getCudaDevice())); + return prop; +} + +/*! + * \brief Get max occupancy in blocks for the given kernel for the current + * cuda device. + */ +template < typename Func > +RAJA_INLINE +int getCudaOccupancyMaxBlocks(Func&& func, int num_threads, size_t shmem_size) +{ + int max_blocks = -1; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks, func, num_threads, shmem_size)); + + size_t multiProcessorCount = getCudaDeviceProp().multiProcessorCount; + + return max_blocks * multiProcessorCount; +} + /* * Copy memory len bytes from src to dst. */ -inline void copyCudaData(void* dst_ptr, const void* src_ptr, size_t len) +inline void copyCudaData(void* dst_ptr, const void* src_ptr, Size_type len) { cudaErrchk( cudaMemcpy( dst_ptr, src_ptr, len, cudaMemcpyDefault ) ); } /*! - * \brief Allocate CUDA device data array (dptr). + * \brief Allocate CUDA device data array. */ -inline void* allocCudaDeviceData(size_t len) +inline void* allocCudaDeviceData(Size_type len) { void* dptr = nullptr; cudaErrchk( cudaMalloc( &dptr, len ) ); @@ -104,19 +168,65 @@ inline void* allocCudaDeviceData(size_t len) } /*! - * \brief Allocate CUDA managed data array (dptr). + * \brief Allocate CUDA managed data array. + */ +inline void* allocCudaManagedData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA managed host preferred data array. 
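+ *
+ * Note: per standard CUDA managed-memory semantics, the
+ * cudaMemAdviseSetPreferredLocation / cudaCpuDeviceId advice below keeps
+ * pages resident on the host while they remain accessible from the device.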
+ */ +inline void* allocCudaManagedHostPreferredData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA managed device preferred data array. */ -inline void* allocCudaManagedData(size_t len) +inline void* allocCudaManagedDevicePreferredData(Size_type len) { void* mptr = nullptr; cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, getCudaDevice() ) ); return mptr; } /*! - * \brief Allocate CUDA pinned data array (pptr). + * \brief Allocate CUDA managed host preferred host accessed data array. */ -inline void* allocCudaPinnedData(size_t len) +inline void* allocCudaManagedHostPreferredDeviceAccessedData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetAccessedBy, getCudaDevice() ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA managed device preferred host accessed data array. + */ +inline void* allocCudaManagedDevicePreferredHostAccessedData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, getCudaDevice() ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA pinned data array. + */ +inline void* allocCudaPinnedData(Size_type len) { void* pptr = nullptr; cudaErrchk( cudaHostAlloc( &pptr, len, cudaHostAllocMapped ) ); @@ -125,7 +235,7 @@ inline void* allocCudaPinnedData(size_t len) /*! - * \brief Free device data array. + * \brief Free CUDA device data array. */ inline void deallocCudaDeviceData(void* dptr) { @@ -133,7 +243,7 @@ inline void deallocCudaDeviceData(void* dptr) } /*! - * \brief Free managed data array. + * \brief Free CUDA managed data array. */ inline void deallocCudaManagedData(void* mptr) { @@ -141,7 +251,39 @@ inline void deallocCudaManagedData(void* mptr) } /*! - * \brief Free pinned data array. + * \brief Free CUDA managed host preferred data array. + */ +inline void deallocCudaManagedHostPreferredData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed device preferred data array. + */ +inline void deallocCudaManagedDevicePreferredData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed host preferred host accessed data array. + */ +inline void deallocCudaManagedHostPreferredDeviceAccessedData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed device preferred host accessed data array. + */ +inline void deallocCudaManagedDevicePreferredHostAccessedData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA pinned data array. 
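+ * (Pinned memory obtained with cudaHostAlloc must be released with
+ * cudaFreeHost.)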
 */
 inline void deallocCudaPinnedData(void* pptr)
 {
diff --git a/src/common/CudaGridScan.hpp b/src/common/CudaGridScan.hpp
new file mode 100644
index 000000000..f2c8f2cd1
--- /dev/null
+++ b/src/common/CudaGridScan.hpp
@@ -0,0 +1,245 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#if defined(RAJA_ENABLE_CUDA)
+
+#include <cub/block/block_scan.cuh>
+#include <cub/block/block_exchange.cuh>
+#include <cub/warp/warp_reduce.cuh>
+#include <type_traits>
+
+namespace rajaperf
+{
+namespace detail
+{
+namespace cuda
+{
+
+//
+// Define magic numbers for CUDA execution
+//
+const size_t warp_size = 32;
+const size_t max_static_shmem = 49152;
+
+
+// Perform a grid scan on val and return the results at each thread
+// in exclusive and inclusive; note that val is used as scratch space.
+template < typename DataType, size_t block_size, size_t items_per_thread >
+struct GridScan
+{
+  using BlockScan = cub::BlockScan<DataType, block_size>; //, cub::BLOCK_SCAN_WARP_SCANS>;
+  using BlockExchange = cub::BlockExchange<DataType, block_size, items_per_thread>;
+  using WarpReduce = cub::WarpReduce<DataType>;
+
+  union SharedStorage {
+    typename BlockScan::TempStorage block_scan_storage;
+    typename BlockExchange::TempStorage block_exchange_storage;
+    typename WarpReduce::TempStorage warp_reduce_storage;
+    volatile DataType prev_grid_count;
+  };
+
+  static constexpr size_t shmem_size = sizeof(SharedStorage);
+
+  __device__
+  static void grid_scan(const int block_id,
+                        DataType (&val)[items_per_thread],
+                        DataType (&exclusive)[items_per_thread],
+                        DataType (&inclusive)[items_per_thread],
+                        DataType* block_counts,
+                        DataType* grid_counts,
+                        unsigned* block_readys)
+  {
+    const bool first_block = (block_id == 0);
+    const bool last_block = (block_id == gridDim.x-1);
+    const bool last_thread = (threadIdx.x == block_size-1);
+    const bool last_warp = (threadIdx.x >= block_size - warp_size);
+    const int warp_index = (threadIdx.x % warp_size);
+    const unsigned warp_index_mask = (1u << warp_index);
+    const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u);
+
+    __shared__ SharedStorage s_temp_storage;
+
+
+    BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val);
+    __syncthreads();
+
+
+    BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive);
+    __syncthreads();
+
+    for (size_t ti = 0; ti < items_per_thread; ++ti) {
+      inclusive[ti] = exclusive[ti] + val[ti];
+    }
+
+    BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive);
+    __syncthreads();
+    BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive);
+    __syncthreads();
+    if (first_block) {
+
+      if (!last_block && last_thread) {
+        block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block
+        grid_counts[block_id] = inclusive[items_per_thread-1];  // write inclusive scan result for grid through block
+        __threadfence();                         // ensure block_counts, grid_counts ready (release)
+        atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready
+      }
+
+    } else {
+
+      if (!last_block && last_thread) {
+        block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block
+        __threadfence();                         // ensure block_counts ready (release)
+        atomicExch(&block_readys[block_id], 1u); // write block_counts is ready
+      }
+
+      // get prev_grid_count using last warp in block
+      if (last_warp) {
+
+        DataType prev_grid_count = 0;
+
+        // accumulate previous block counts into registers of warp
+
+        int prev_block_base_id = block_id - warp_size;
+
+        unsigned prev_block_ready = 0u;
+        unsigned prev_blocks_ready_ballot = 0u;
+        unsigned prev_grids_ready_ballot = 0u;
+
+        // accumulate full warp worths of block counts;
+        // stop if we run out of full warps or a grid count is ready
+        while (prev_block_base_id >= 0) {
+
+          const int prev_block_id = prev_block_base_id + warp_index;
+
+          // ensure previous block_counts are ready
+          do {
+            prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u);
+
+            prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u);
+
+          } while (prev_blocks_ready_ballot != 0xffffffffu);
+
+          prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u);
+
+          if (prev_grids_ready_ballot != 0u) {
+            break;
+          }
+
+          __threadfence(); // ensure block_counts or grid_counts ready (acquire)
+
+          // accumulate block_counts for prev_block_id
+          prev_grid_count += block_counts[prev_block_id];
+
+          prev_block_ready = 0u;
+
+          prev_block_base_id -= warp_size;
+        }
+
+        const int prev_block_id = prev_block_base_id + warp_index;
+
+        // ensure previous block_counts are ready;
+        // this checks that block_counts are ready for all blocks above
+        // the highest grid count that is ready
+        while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) {
+
+          if (prev_block_id >= 0) {
+            prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u);
+          }
+
+          prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u);
+          prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u);
+        }
+        __threadfence(); // ensure block_counts or grid_counts ready (acquire)
+
+        // read one grid_count from a block with id grid_count_ready_id
+        // and read the block_counts from blocks with higher ids.
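+        // prev_grids_ready_ballot has one bit per lane; a set bit means that
+        // lane's block published a full grid count. Lanes strictly above the
+        // highest set bit add their block's own count, the lane at the
+        // highest set bit adds that block's grid count (the whole prefix
+        // through it), and lower lanes add nothing, so the warp reduction
+        // below yields this block's scan prefix.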
+ if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } + + + prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); + prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_temp_storage.prev_grid_count = prev_grid_count; + } + } + + __syncthreads(); + DataType prev_grid_count = s_temp_storage.prev_grid_count; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } + } + } + +}; + + +namespace detail +{ + +template < typename T, size_t block_size, size_t max_items_per_thread > +struct grid_scan_max_items_per_thread + : std::conditional_t< (GridScan::shmem_size <= max_static_shmem), + grid_scan_max_items_per_thread, + std::integral_constant > +{ +}; + +} + +template < typename T, size_t block_size > +struct grid_scan_max_items_per_thread + : detail::grid_scan_max_items_per_thread +{ +}; + + +// tune grid scan to maximize throughput while minimizing items_per_thread + +// default tuning for unknown DataType or cuda_arch +template < typename DataType, size_t block_size, size_t cuda_arch, typename enable = void > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + grid_scan_max_items_per_thread::value / 2; +}; + +// tuning for sm_70 +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 700, std::enable_if_t > +{ + static constexpr size_t value = + (block_size <= 64) ? 13 : + (block_size <= 128) ? 9 : + (block_size <= 256) ? 6 : + (block_size <= 512) ? 5 : + (block_size <= 1024) ? 5 : 1; +}; + +} // end namespace cuda +} // end namespace detail +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index f1831cc1f..607f5aa00 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -10,7 +10,9 @@ #include "CudaDataUtils.hpp" #include "HipDataUtils.hpp" #include "OpenMPTargetDataUtils.hpp" +#include "SyclDataUtils.hpp" +#include "KernelBase.hpp" #include "RAJA/internal/MemUtils_CPU.hpp" @@ -72,6 +74,10 @@ bool isCudaDataSpace(DataSpace dataSpace) switch (dataSpace) { case DataSpace::CudaPinned: case DataSpace::CudaManaged: + case DataSpace::CudaManagedHostPreferred: + case DataSpace::CudaManagedDevicePreferred: + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + case DataSpace::CudaManagedDevicePreferredHostAccessed: case DataSpace::CudaDevice: return true; default: @@ -101,6 +107,21 @@ bool isHipDataSpace(DataSpace dataSpace) } } +/*! + * \brief Get if the data space is a sycl DataSpace. + */ +bool isSyclDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::SyclPinned: + case DataSpace::SyclManaged: + case DataSpace::SyclDevice: + return true; + default: + return false; + } +} + static int data_init_count = 0; @@ -123,7 +144,7 @@ void incDataInitCount() /* * Copy memory len bytes from src to dst. */ -void copyHostData(void* dst_ptr, const void* src_ptr, size_t len) +void copyHostData(void* dst_ptr, const void* src_ptr, Size_type len) { std::memcpy(dst_ptr, src_ptr, len); } @@ -132,7 +153,7 @@ void copyHostData(void* dst_ptr, const void* src_ptr, size_t len) /* * Allocate data arrays of given type. */ -void* allocHostData(size_t len, size_t align) +void* allocHostData(Size_type len, Size_type align) { return RAJA::allocate_aligned_type( align, len); @@ -153,7 +174,7 @@ void deallocHostData(void* ptr) /* * Allocate data arrays of given dataSpace. */ -void* allocData(DataSpace dataSpace, int nbytes, int align) +void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align) { void* ptr = nullptr; @@ -186,6 +207,22 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) { ptr = detail::allocCudaManagedData(nbytes); } break; + case DataSpace::CudaManagedHostPreferred: + { + ptr = detail::allocCudaManagedHostPreferredData(nbytes); + } break; + case DataSpace::CudaManagedDevicePreferred: + { + ptr = detail::allocCudaManagedDevicePreferredData(nbytes); + } break; + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + { + ptr = detail::allocCudaManagedHostPreferredDeviceAccessedData(nbytes); + } break; + case DataSpace::CudaManagedDevicePreferredHostAccessed: + { + ptr = detail::allocCudaManagedDevicePreferredHostAccessedData(nbytes); + } break; case DataSpace::CudaDevice: { ptr = detail::allocCudaDeviceData(nbytes); @@ -243,6 +280,25 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) } break; #endif +#if defined(RAJA_ENABLE_SYCL) + case DataSpace::SyclPinned: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclPinnedData(nbytes, qu); + } break; + case DataSpace::SyclManaged: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclManagedData(nbytes, qu); + } break; + case DataSpace::SyclDevice: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclDeviceData(nbytes, qu); + } break; +#endif + + default: { throw std::invalid_argument("allocData : Unknown data space"); @@ -257,10 +313,10 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) */ void copyData(DataSpace dst_dataSpace, void* dst_ptr, DataSpace src_dataSpace, const void* src_ptr, - size_t nbytes) + Size_type nbytes) { - if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && - hostAccessibleDataSpace(src_dataSpace) == 
src_dataSpace) { + if (hostCopyDataSpace(dst_dataSpace) == dst_dataSpace && + hostCopyDataSpace(src_dataSpace) == src_dataSpace) { detail::copyHostData(dst_ptr, src_ptr, nbytes); } @@ -290,6 +346,14 @@ void copyData(DataSpace dst_dataSpace, void* dst_ptr, } #endif +#if defined(RAJA_ENABLE_SYCL) + else if (isSyclDataSpace(dst_dataSpace) || + isSyclDataSpace(src_dataSpace)) { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::copySyclData(dst_ptr, src_ptr, nbytes, qu); + } +#endif + else { throw std::invalid_argument("copyData : Unknown data space"); } @@ -329,6 +393,22 @@ void deallocData(DataSpace dataSpace, void* ptr) { detail::deallocCudaManagedData(ptr); } break; + case DataSpace::CudaManagedHostPreferred: + { + detail::deallocCudaManagedHostPreferredData(ptr); + } break; + case DataSpace::CudaManagedDevicePreferred: + { + detail::deallocCudaManagedDevicePreferredData(ptr); + } break; + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + { + detail::deallocCudaManagedHostPreferredDeviceAccessedData(ptr); + } break; + case DataSpace::CudaManagedDevicePreferredHostAccessed: + { + detail::deallocCudaManagedDevicePreferredHostAccessedData(ptr); + } break; case DataSpace::CudaDevice: { detail::deallocCudaDeviceData(ptr); @@ -357,6 +437,26 @@ void deallocData(DataSpace dataSpace, void* ptr) } break; #endif +#if defined(RAJA_ENABLE_SYCL) + case DataSpace::SyclPinned: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclPinnedData(ptr, qu); + } break; + case DataSpace::SyclManaged: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclManagedData(ptr, qu); + } break; + case DataSpace::SyclDevice: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclDeviceData(ptr, qu); + } break; +#endif + + + default: { throw std::invalid_argument("deallocData : Unknown data space"); @@ -369,23 +469,23 @@ void deallocData(DataSpace dataSpace, void* ptr) * \brief Initialize Int_type data array to * randomly signed positive and negative values. */ -void initData(Int_ptr& ptr, int len) +void initData(Int_ptr& ptr, Size_type len) { srand(4793); Real_type signfact = 0.0; - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { signfact = Real_type(rand())/RAND_MAX; ptr[i] = ( signfact < 0.5 ? -1 : 1 ); }; signfact = Real_type(rand())/RAND_MAX; - Int_type ilo = len * signfact; + Size_type ilo = len * signfact; ptr[ilo] = -58; signfact = Real_type(rand())/RAND_MAX; - Int_type ihi = len * signfact; + Size_type ihi = len * signfact; ptr[ihi] = 19; incDataInitCount(); @@ -396,11 +496,11 @@ void initData(Int_ptr& ptr, int len) * positive values (0.0, 1.0) based on their array position * (index) and the order in which this method is called. */ -void initData(Real_ptr& ptr, int len) +void initData(Real_ptr& ptr, Size_type len) { Real_type factor = ( data_init_count % 2 ? 0.1 : 0.2 ); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); } @@ -410,9 +510,9 @@ void initData(Real_ptr& ptr, int len) /* * Initialize Real_type data array to constant values. */ -void initDataConst(Real_ptr& ptr, int len, Real_type val) +void initDataConst(Real_ptr& ptr, Size_type len, Real_type val) { - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = val; }; @@ -422,9 +522,9 @@ void initDataConst(Real_ptr& ptr, int len, Real_type val) /* * Initialize Index_type data array to constant values. 
*/ -void initDataConst(Index_type*& ptr, int len, Index_type val) +void initDataConst(Index_type*& ptr, Size_type len, Index_type val) { - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = val; }; @@ -434,13 +534,13 @@ void initDataConst(Index_type*& ptr, int len, Index_type val) /* * Initialize Real_type data array with random sign. */ -void initDataRandSign(Real_ptr& ptr, int len) +void initDataRandSign(Real_ptr& ptr, Size_type len) { Real_type factor = ( data_init_count % 2 ? 0.1 : 0.2 ); srand(4793); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { Real_type signfact = Real_type(rand())/RAND_MAX; signfact = ( signfact < 0.5 ? -1.0 : 1.0 ); ptr[i] = signfact*factor*(i + 1.1)/(i + 1.12345); @@ -452,11 +552,11 @@ void initDataRandSign(Real_ptr& ptr, int len) /* * Initialize Real_type data array with random values. */ -void initDataRandValue(Real_ptr& ptr, int len) +void initDataRandValue(Real_ptr& ptr, Size_type len) { srand(4793); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = Real_type(rand())/RAND_MAX; }; @@ -466,12 +566,12 @@ void initDataRandValue(Real_ptr& ptr, int len) /* * Initialize Complex_type data array. */ -void initData(Complex_ptr& ptr, int len) +void initData(Complex_ptr& ptr, Size_type len) { Complex_type factor = ( data_init_count % 2 ? Complex_type(0.1,0.2) : Complex_type(0.2,0.3) ); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); } @@ -492,13 +592,14 @@ void initData(Real_type& d) /* * Calculate and return checksum for data arrays. */ -long double calcChecksum(Int_ptr ptr, int len, - Real_type scale_factor) +template < typename Data_getter > +long double calcChecksumImpl(Data_getter data, Size_type len, + Real_type scale_factor) { long double tchk = 0.0; long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; + for (Size_type j = 0; j < len; ++j) { + long double x = (std::abs(std::sin(j+1.0))+0.5) * data(j); long double y = x - ckahan; volatile long double t = tchk + y; volatile long double z = t - tchk; @@ -514,84 +615,138 @@ long double calcChecksum(Int_ptr ptr, int len, return tchk; } -long double calcChecksum(Real_ptr ptr, int len, +long double calcChecksum(Int_ptr ptr, Size_type len, Real_type scale_factor) { - long double tchk = 0.0; - long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; - long double y = x - ckahan; - volatile long double t = tchk + y; - volatile long double z = t - tchk; - ckahan = z - y; - tchk = t; -#if 0 // RDH DEBUG - if ( (j % 100) == 0 ) { - getCout() << "j : tchk = " << j << " : " << tchk << std::endl; - } -#endif - } - tchk *= scale_factor; - return tchk; + return calcChecksumImpl([=](Size_type j) { + return static_cast(ptr[j]); + }, len, scale_factor); } -long double calcChecksum(Complex_ptr ptr, int len, +long double calcChecksum(unsigned long long* ptr, Size_type len, Real_type scale_factor) { - long double tchk = 0.0; - long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * (real(ptr[j])+imag(ptr[j])); - long double y = x - ckahan; - volatile long double t = tchk + y; - volatile long double z = t - tchk; - ckahan = z - y; - tchk = t; -#if 0 // RDH DEBUG - if ( (j % 100) == 0 ) { - getCout() << "j : tchk = " << j << " : " << tchk << std::endl; - } -#endif - } - tchk *= 
scale_factor;
-  return tchk;
+  return calcChecksumImpl([=](Size_type j) {
+    return static_cast<long double>(ptr[j]);
+  }, len, scale_factor);
+}
+
+long double calcChecksum(Real_ptr ptr, Size_type len,
+                         Real_type scale_factor)
+{
+  return calcChecksumImpl([=](Size_type j) {
+    return static_cast<long double>(ptr[j]);
+  }, len, scale_factor);
+}
+
+long double calcChecksum(Complex_ptr ptr, Size_type len,
+                         Real_type scale_factor)
+{
+  return calcChecksumImpl([=](Size_type j) {
+    return static_cast<long double>(real(ptr[j])+imag(ptr[j]));
+  }, len, scale_factor);
 }
 
 } // closing brace for detail namespace
 
 /*!
- * \brief Get an host accessible data space for this dataSpace.
+ * \brief Get a host data space to use when making a host copy of data in the given
+ *        dataSpace.
+ *
+ * The returned host data space should reside in memory attached to the host.
+ *
+ * The intention is to get a data space with high performance on the host.
+ * Return the given data space if it's already performant and fall back on a
+ * host data space that performs well in explicit copy operations with the
+ * given space.
  */
-DataSpace hostAccessibleDataSpace(DataSpace dataSpace)
+DataSpace hostCopyDataSpace(DataSpace dataSpace)
 {
   switch (dataSpace) {
     case DataSpace::Host:
     case DataSpace::Omp:
     case DataSpace::CudaPinned:
+    case DataSpace::CudaManagedHostPreferred:
+    case DataSpace::CudaManagedHostPreferredDeviceAccessed:
     case DataSpace::HipHostAdviseFine:
     case DataSpace::HipHostAdviseCoarse:
     case DataSpace::HipPinned:
     case DataSpace::HipPinnedFine:
     case DataSpace::HipPinnedCoarse:
+    case DataSpace::HipManaged:
+    case DataSpace::HipManagedAdviseFine:
+    case DataSpace::HipManagedAdviseCoarse:
+    case DataSpace::SyclPinned:
       return dataSpace;
 
     case DataSpace::OmpTarget:
       return DataSpace::Host;
 
     case DataSpace::CudaManaged:
+    case DataSpace::CudaManagedDevicePreferred:
+    case DataSpace::CudaManagedDevicePreferredHostAccessed:
     case DataSpace::CudaDevice:
       return DataSpace::CudaPinned;
 
+    case DataSpace::HipDevice:
+    case DataSpace::HipDeviceFine:
+      return DataSpace::HipPinned;
+
+    case DataSpace::SyclManaged:
+    case DataSpace::SyclDevice:
+      return DataSpace::SyclPinned;
+
+    default:
+    {
+      throw std::invalid_argument("hostCopyDataSpace : Unknown data space");
+    } break;
+  }
+}
+
+/*!
+ * \brief Get a data space accessible to the host for the given dataSpace.
+ *
+ * The returned host data space may reside in memory attached to another device.
+ *
+ * The intention is to get a data space accessible on the host even if it is not
+ * performant. Return the given data space if it's already accessible and fall
+ * back on a space that is host accessible and performs well in explicit copy
+ * operations with the given space.
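+ * For example, DataSpace::CudaDevice falls back to DataSpace::CudaPinned and
+ * DataSpace::SyclDevice to DataSpace::SyclPinned below, while pinned and
+ * managed spaces are returned unchanged.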
+ */
+DataSpace hostAccessibleDataSpace(DataSpace dataSpace)
+{
+  switch (dataSpace) {
+
+    case DataSpace::Host:
+    case DataSpace::Omp:
+    case DataSpace::CudaPinned:
+    case DataSpace::CudaManaged:
+    case DataSpace::CudaManagedHostPreferred:
+    case DataSpace::CudaManagedHostPreferredDeviceAccessed:
+    case DataSpace::CudaManagedDevicePreferred:
+    case DataSpace::CudaManagedDevicePreferredHostAccessed:
+    case DataSpace::HipHostAdviseFine:
+    case DataSpace::HipHostAdviseCoarse:
+    case DataSpace::HipPinned:
+    case DataSpace::HipPinnedFine:
+    case DataSpace::HipPinnedCoarse:
     case DataSpace::HipManaged:
     case DataSpace::HipManagedAdviseFine:
     case DataSpace::HipManagedAdviseCoarse:
-      return dataSpace;
-
-    case DataSpace::HipDevice:
     case DataSpace::HipDeviceFine:
-      return DataSpace::HipPinned;
+    case DataSpace::SyclPinned:
+    case DataSpace::SyclManaged:
+      return dataSpace;
+
+    case DataSpace::OmpTarget:
+      return DataSpace::Host;
+
+    case DataSpace::CudaDevice:
+      return DataSpace::CudaPinned;
+
+    case DataSpace::SyclDevice:
+      return DataSpace::SyclPinned;

     default:
     {
diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp
index 1b233e574..b2fd990af 100644
--- a/src/common/DataUtils.hpp
+++ b/src/common/DataUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -27,6 +27,9 @@
 #if defined(RAJA_ENABLE_HIP)
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #endif
+#if defined(RAJA_ENABLE_SYCL)
+#include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
+#endif

 namespace rajaperf
 {
@@ -44,12 +47,12 @@ void resetDataInitCount();
  */
 void incDataInitCount();

-void copyHostData(void* dst_ptr, const void* src_ptr, size_t len);
+void copyHostData(void* dst_ptr, const void* src_ptr, Size_type len);

 /*!
  * \brief Allocate data arrays.
  */
-void* allocHostData(size_t len, size_t align);
+void* allocHostData(Size_type len, Size_type align);

 /*!
  * \brief Free data arrays.
  */
@@ -60,14 +63,14 @@ void deallocHostData(void* ptr);
 /*!
  * \brief Allocate data array in dataSpace.
  */
-void* allocData(DataSpace dataSpace, int nbytes, int align);
+void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align);

 /*!
  * \brief Copy data from one dataSpace to another.
  */
 void copyData(DataSpace dst_dataSpace, void* dst_ptr,
               DataSpace src_dataSpace, const void* src_ptr,
-              size_t nbytes);
+              Size_type nbytes);

 /*!
  * \brief Free data arrays in dataSpace.
  */
@@ -82,7 +85,7 @@ void deallocData(DataSpace dataSpace, void* ptr);
  * Then, two randomly-chosen entries are reset, one to
  * a value > 1, one to a value < -1.
  */
-void initData(Int_ptr& ptr, int len);
+void initData(Int_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Real_type data array.
  *
 * Array entries are initialized (non-randomly) positive values
 * in the interval (0.0, 1.0) based on their array position (index)
 * and the order in which this method is called.
 */
-void initData(Real_ptr& ptr, int len);
+void initData(Real_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Real_type data array.
  *
  * Array entries are set to given constant value.
  */
-void initDataConst(Real_ptr& ptr, int len, Real_type val);
+void initDataConst(Real_ptr& ptr, Size_type len, Real_type val);

 /*!
  * \brief Initialize Index_type data array.
  *
  * Array entries are set to given constant value.
 */
-void initDataConst(Index_type*& ptr, int len, Index_type val);
+void initDataConst(Index_type*& ptr, Size_type len, Index_type val);

 /*!
  * \brief Initialize Real_type data array with random sign.
@@ -113,14 +116,14 @@ void initDataConst(Index_type*& ptr, int len, Index_type val);
  * Array entries are initialized in the same way as the method
  * initData(Real_ptr& ptr...) above, but with random sign.
  */
-void initDataRandSign(Real_ptr& ptr, int len);
+void initDataRandSign(Real_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Real_type data array with random values.
  *
  * Array entries are initialized with random values in the interval [0.0, 1.0].
  */
-void initDataRandValue(Real_ptr& ptr, int len);
+void initDataRandValue(Real_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Complex_type data array.
@@ -128,7 +131,7 @@ void initDataRandValue(Real_ptr& ptr, int len);
  * Real and imaginary array entries are initialized in the same way as the
  * method allocAndInitData(Real_ptr& ptr...) above.
  */
-void initData(Complex_ptr& ptr, int len);
+void initData(Complex_ptr& ptr, Size_type len);

 /*!
  * \brief Initialize Real_type scalar data.
@@ -147,23 +150,43 @@ void initData(Real_type& d);
  *
  * Checksum is multiplied by given scale factor.
  */
-long double calcChecksum(Int_ptr d, int len,
+long double calcChecksum(Int_ptr d, Size_type len,
                          Real_type scale_factor);
 ///
-long double calcChecksum(Real_ptr d, int len,
+long double calcChecksum(unsigned long long* d, Size_type len,
                          Real_type scale_factor);
 ///
-long double calcChecksum(Complex_ptr d, int len,
+long double calcChecksum(Real_ptr d, Size_type len,
+                         Real_type scale_factor);
+///
+long double calcChecksum(Complex_ptr d, Size_type len,
                          Real_type scale_factor);

 } // closing brace for detail namespace

 /*!
- * \brief Get an host accessible data space for this dataSpace.
+ * \brief Get a host data space to use when making a host copy of data in the
+ *        given dataSpace.
+ *
+ * The returned host data space should reside in memory attached to the host.
+ *
+ * The intention is to get a data space with high performance on the host.
+ * Return the given data space if it's already performant and fall back on a
+ * host data space that performs well in explicit copy operations with the
+ * given space.
+ */
+DataSpace hostCopyDataSpace(DataSpace dataSpace);
+
+/*!
+ * \brief Get a data space accessible to the host for the given dataSpace.
+ *
+ * The returned host data space may reside in memory attached to another device.
  *
- * Intended to be a space that is quick to copy to from the given space if
- * the given space is not accessible on the Host.
+ * The intention is to get a data space accessible on the host even if it is not
+ * performant. Return the given data space if it's already accessible and fall
+ * back on a space that is host accessible and performs well in explicit copy
+ * operations with the given space.
  */
 DataSpace hostAccessibleDataSpace(DataSpace dataSpace);

@@ -171,16 +194,16 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace);
  * \brief Allocate data array (ptr).
 */
template <typename T>
-inline void allocData(DataSpace dataSpace, T*& ptr_ref, int len, int align)
+inline void allocData(DataSpace dataSpace, T*& ptr_ref, Size_type len, Size_type align)
 {
-  size_t nbytes = len*sizeof(T);
+  Size_type nbytes = len*sizeof(T);
   T* ptr = static_cast<T*>(detail::allocData(dataSpace, nbytes, align));

 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
   if (dataSpace == DataSpace::Omp) {
     // perform first touch on Omp Data
     #pragma omp parallel for
-    for (int i = 0; i < len; ++i) {
+    for (Size_type i = 0; i < len; ++i) {
       ptr[i] = T{};
     };
   }
@@ -205,9 +228,9 @@ inline void deallocData(DataSpace dataSpace, T*& ptr)
 template <typename T>
 inline void copyData(DataSpace dst_dataSpace, T* dst_ptr,
                      DataSpace src_dataSpace, const T* src_ptr,
-                     int len)
+                     Size_type len)
 {
-  size_t nbytes = len*sizeof(T);
+  Size_type nbytes = len*sizeof(T);
   detail::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, nbytes);
 }

@@ -216,7 +239,7 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr,
  */
 template <typename T>
 inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace,
-                     T*& ptr, int len, int align)
+                     T*& ptr, Size_type len, Size_type align)
 {
   if (new_dataSpace != old_dataSpace) {

@@ -237,7 +260,7 @@ template <typename T>
 struct AutoDataMover
 {
   AutoDataMover(DataSpace new_dataSpace, DataSpace old_dataSpace,
-                T*& ptr, int len, int align)
+                T*& ptr, Size_type len, Size_type align)
     : m_ptr(&ptr)
     , m_new_dataSpace(new_dataSpace)
     , m_old_dataSpace(old_dataSpace)
@@ -284,17 +307,17 @@ struct AutoDataMover
   T** m_ptr;
   DataSpace m_new_dataSpace;
   DataSpace m_old_dataSpace;
-  int m_len;
-  int m_align;
+  Size_type m_len;
+  Size_type m_align;
 };

 /*!
  * \brief Allocate and initialize data array.
  */
 template <typename T>
-inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align)
+inline void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align)
 {
-  DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace init_dataSpace = hostCopyDataSpace(dataSpace);

   allocData(init_dataSpace, ptr, len, align);

@@ -310,10 +333,10 @@ inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align)
  * Array entries are initialized using the method initDataConst.
  */
 template <typename T>
-inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int align,
+inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align,
                                   T val)
 {
-  DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace init_dataSpace = hostCopyDataSpace(dataSpace);

   allocData(init_dataSpace, ptr, len, align);

@@ -328,9 +351,9 @@ inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int ali
  * Array is initialized using method initDataRandSign.
  */
 template <typename T>
-inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int align)
+inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align)
 {
-  DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace init_dataSpace = hostCopyDataSpace(dataSpace);

   allocData(init_dataSpace, ptr, len, align);

@@ -346,9 +369,9 @@ inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int
  * Array is initialized using method initDataRandValue.
 */
template <typename T>
-inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int align)
+inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align)
 {
-  DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace init_dataSpace = hostCopyDataSpace(dataSpace);

   allocData(init_dataSpace, ptr, len, align);

@@ -361,13 +384,13 @@ inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int
  * Calculate and return checksum for arrays.
  */
 template <typename T>
-inline long double calcChecksum(DataSpace dataSpace, T* ptr, int len, int align,
+inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size_type align,
                                 Real_type scale_factor)
 {
   T* check_ptr = ptr;
   T* copied_ptr = nullptr;

-  DataSpace check_dataSpace = hostAccessibleDataSpace(dataSpace);
+  DataSpace check_dataSpace = hostCopyDataSpace(dataSpace);
   if (check_dataSpace != dataSpace) {
     allocData(check_dataSpace, copied_ptr, len, align);

@@ -428,9 +451,9 @@ struct RAJAPoolAllocatorHolder
     }

     /*[[nodiscard]]*/
-    value_type* allocate(size_t num)
+    value_type* allocate(Size_type num)
     {
-      if (num > std::numeric_limits<size_t>::max() / sizeof(value_type)) {
+      if (num > std::numeric_limits<Size_type>::max() / sizeof(value_type)) {
        throw std::bad_alloc();
      }

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index d730fb21d..6a951334a 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
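A minimal sketch of how the Size_type-based DataUtils API above is driven end
to end; the length, alignment, and HIP data space here are illustrative
assumptions, not suite defaults (real kernels take these from RunParams):

    // Sketch only: exercises the allocation/init/checksum entry points above.
    #include "common/DataUtils.hpp"

    void dataUtilsSketch()
    {
      using namespace rajaperf;

      Real_ptr a = nullptr;
      Size_type len = 1024;   // illustrative problem size
      Size_type align = 64;   // illustrative; suite uses RunParams::getDataAlignment()

      // Initialization runs in hostCopyDataSpace(DataSpace::HipDevice),
      // i.e. HipPinned, before the array lands in the device space.
      allocAndInitDataConst(DataSpace::HipDevice, a, len, align, Real_type(0.0));

      // ... run a kernel variant that updates a ...

      // calcChecksum copies back through the same host copy space and then
      // applies the Kahan-summed checksum of calcChecksumImpl.
      long double cksum = calcChecksum(DataSpace::HipDevice, a, len, align,
                                       Real_type(1.0));
      (void) cksum;

      deallocData(DataSpace::HipDevice, a);
    }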
// @@ -24,7 +24,10 @@ #include "basic/REDUCE3_INT.hpp" #include "basic/INDEXLIST_3LOOP.hpp" #include "algorithm/SORT.hpp" -#include "apps/HALOEXCHANGE_FUSED.hpp" +#include "comm/HALO_PACKING_FUSED.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "comm/HALO_EXCHANGE_FUSED.hpp" +#endif #include #include @@ -121,12 +124,13 @@ Executor::Executor(int argc, char** argv) { #if defined(RAJA_PERFSUITE_USE_CALIPER) configuration cc; - adiak::init(NULL); - adiak::user(); - adiak::launchdate(); - adiak::libraries(); - adiak::cmdline(); - adiak::clustername(); + #if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI_Comm adiak_comm = MPI_COMM_WORLD; + adiak::init(&adiak_comm); + #else + adiak::init(nullptr); + #endif + adiak::collect_all(); adiak::value("perfsuite_version", cc.adiak_perfsuite_version); adiak::value("raja_version", cc.adiak_raja_version); adiak::value("cmake_build_type", cc.adiak_cmake_build_type); @@ -165,8 +169,20 @@ Executor::Executor(int argc, char** argv) if (strlen(cc.adiak_cmake_hip_architectures) > 0) { adiak::value("cmake_hip_architectures", cc.adiak_cmake_hip_architectures); } - if (cc.adiak_gpu_targets_block_sizes.size() > 0) { - adiak::value("gpu_targets_block_sizes", cc.adiak_gpu_targets_block_sizes); + if (strlen(cc.adiak_tuning_cuda_arch) > 0) { + adiak::value("tuning_cuda_arch", cc.adiak_tuning_cuda_arch); + } + if (strlen(cc.adiak_tuning_hip_arch) > 0) { + adiak::value("tuning_hip_arch", cc.adiak_tuning_hip_arch); + } + if (cc.adiak_gpu_block_sizes.size() > 0) { + adiak::value("gpu_block_sizes", cc.adiak_gpu_block_sizes); + } + if (cc.adiak_atomic_replications.size() > 0) { + adiak::value("atomic_replications", cc.adiak_atomic_replications); + } + if (cc.adiak_gpu_items_per_thread.size() > 0) { + adiak::value("gpu_items_per_thread", cc.adiak_gpu_items_per_thread); } if (cc.adiak_raja_hipcc_flags.size() > 0) { adiak::value("raja_hipcc_flags", cc.adiak_raja_hipcc_flags); @@ -316,7 +332,8 @@ void Executor::setupSuite() KernelBase::setCaliperMgrVariantTuning(vid, tstr, run_params.getOutputDirName(), - run_params.getAddToSpotConfig()); + run_params.getAddToSpotConfig(), + run_params.getAddToCaliperConfig()); #endif } @@ -392,6 +409,13 @@ void Executor::reportRunSummary(ostream& str) const str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; str << "\t Output files will be named " << ofiles << endl; +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + str << "\nRunning with " << run_params.getMPISize() << " MPI procs" << endl; + auto div3d = run_params.getMPI3DDivision(); + const char* valid3d = run_params.validMPI3DDivision() ? 
"" : "invalid"; + str << "\t 3D division = " << div3d[0] << " x " << div3d[1] << " x " << div3d[2] << " " << valid3d << endl; +#endif + str << "\nThe following kernels and variants (when available for a kernel) will be run:" << endl; str << "\nData Spaces" @@ -409,11 +433,55 @@ void Executor::reportRunSummary(ostream& str) const if (isVariantAvailable(VariantID::Base_HIP)) { str << "\nHip - " << getDataSpaceName(run_params.getHipDataSpace()); } + if (isVariantAvailable(VariantID::Base_SYCL)) { + str << "\nSycl - " << getDataSpaceName(run_params.getSyclDataSpace()); + } if (isVariantAvailable(VariantID::Kokkos_Lambda)) { str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosDataSpace()); } str << endl; + str << "\nReduction Data Spaces" + << "\n--------"; + str << "\nSeq - " << getDataSpaceName(run_params.getSeqReductionDataSpace()); + if (isVariantAvailable(VariantID::Base_OpenMP)) { + str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_OpenMPTarget)) { + str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_CUDA)) { + str << "\nCuda - " << getDataSpaceName(run_params.getCudaReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_HIP)) { + str << "\nHip - " << getDataSpaceName(run_params.getHipReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Kokkos_Lambda)) { + str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosReductionDataSpace()); + } + str << endl; + + str << "\nMPI Data Spaces" + << "\n--------"; + str << "\nSeq - " << getDataSpaceName(run_params.getSeqMPIDataSpace()); + if (isVariantAvailable(VariantID::Base_OpenMP)) { + str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_OpenMPTarget)) { + str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_CUDA)) { + str << "\nCuda - " << getDataSpaceName(run_params.getCudaMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_HIP)) { + str << "\nHip - " << getDataSpaceName(run_params.getHipMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Kokkos_Lambda)) { + str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosMPIDataSpace()); + } + str << endl; + + str << "\nVariants and Tunings" << "\n--------\n"; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { @@ -456,15 +524,21 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const Index_type itsrep_width = 0; Index_type bytesrep_width = 0; Index_type flopsrep_width = 0; + Index_type bytesReadrep_width = 0; + Index_type bytesWrittenrep_width = 0; + Index_type bytesAtomicModifyWrittenrep_width = 0; Index_type dash_width = 0; for (size_t ik = 0; ik < kernels.size(); ++ik) { kercol_width = max(kercol_width, kernels[ik]->getName().size()); psize_width = max(psize_width, kernels[ik]->getActualProblemSize()); reps_width = max(reps_width, kernels[ik]->getRunReps()); - itsrep_width = max(reps_width, kernels[ik]->getItsPerRep()); + itsrep_width = max(itsrep_width, kernels[ik]->getItsPerRep()); bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep()); - flopsrep_width = max(bytesrep_width, kernels[ik]->getFLOPsPerRep()); + flopsrep_width = max(flopsrep_width, kernels[ik]->getFLOPsPerRep()); + bytesReadrep_width = max(bytesReadrep_width, kernels[ik]->getBytesReadPerRep()); + bytesWrittenrep_width = 
max(bytesWrittenrep_width, kernels[ik]->getBytesWrittenPerRep());
+    bytesAtomicModifyWrittenrep_width =
+      max(bytesAtomicModifyWrittenrep_width, kernels[ik]->getBytesAtomicModifyWrittenPerRep());
   }

   const string sepchr(" , ");
@@ -508,6 +582,24 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
                           static_cast<Index_type>(frsize) ) + 3;
   dash_width += flopsrep_width + static_cast<Index_type>(sepchr.size());

+  double brrsize = log10( static_cast<double>(bytesReadrep_width) );
+  string bytesReadrep_head("BytesRead/rep");
+  bytesReadrep_width = max( static_cast<Index_type>(bytesReadrep_head.size()),
+                            static_cast<Index_type>(brrsize) ) + 3;
+  dash_width += bytesReadrep_width + static_cast<Index_type>(sepchr.size());
+
+  double bwrsize = log10( static_cast<double>(bytesWrittenrep_width) );
+  string bytesWrittenrep_head("BytesWritten/rep");
+  bytesWrittenrep_width = max( static_cast<Index_type>(bytesWrittenrep_head.size()),
+                               static_cast<Index_type>(bwrsize) ) + 3;
+  dash_width += bytesWrittenrep_width + static_cast<Index_type>(sepchr.size());
+
+  double bamrrsize = log10( static_cast<double>(bytesAtomicModifyWrittenrep_width) );
+  string bytesAtomicModifyWrittenrep_head("BytesAtomicModifyWritten/rep");
+  bytesAtomicModifyWrittenrep_width = max( static_cast<Index_type>(bytesAtomicModifyWrittenrep_head.size()),
+                                           static_cast<Index_type>(bamrrsize) ) + 3;
+  dash_width += bytesAtomicModifyWrittenrep_width + static_cast<Index_type>(sepchr.size());
+
   str <<setw(kernsrep_width) << kernels[ik]->getKernelsPerRep()
       << sepchr <<setw(bytesrep_width) << kernels[ik]->getBytesPerRep()
       << sepchr <<setw(flopsrep_width) << kernels[ik]->getFLOPsPerRep()
+      << sepchr <<setw(bytesReadrep_width) << kernels[ik]->getBytesReadPerRep()
+      << sepchr <<setw(bytesWrittenrep_width) << kernels[ik]->getBytesWrittenPerRep()
+      << sepchr <<setw(bytesAtomicModifyWrittenrep_width) << kernels[ik]->getBytesAtomicModifyWrittenPerRep()
       << endl;
 }

@@ -632,59 +730,77 @@ void Executor::runWarmupKernels()
   getCout() << "\n\nRun warmup kernels...\n";

   //
-  // For kernels to be run, assemble a set of feature IDs
+  // Get warmup kernels to run from input
   //
-  std::set<FeatureID> feature_ids;
-  for (size_t ik = 0; ik < kernels.size(); ++ik) {
-    KernelBase* kernel = kernels[ik];
+  std::set<KernelID> kernel_ids = run_params.getWarmupKernelIDsToRun();
+
+  if ( kernel_ids.empty() ) {
+
+    //
+    // If no warmup kernels were given, choose a warmup kernel for each feature
+    //
+
+    //
+    // For kernels to be run, assemble a set of feature IDs
+    //
+    std::set<FeatureID> feature_ids;
+    for (size_t ik = 0; ik < kernels.size(); ++ik) {
+      KernelBase* kernel = kernels[ik];

-    for (size_t fid = 0; fid < NumFeatures; ++fid) {
-      FeatureID tfid = static_cast<FeatureID>(fid);
-      if (kernel->usesFeature(tfid) ) {
-        feature_ids.insert( tfid );
+      for (size_t fid = 0; fid < NumFeatures; ++fid) {
+        FeatureID tfid = static_cast<FeatureID>(fid);
+        if (kernel->usesFeature(tfid) ) {
+          feature_ids.insert( tfid );
+        }
       }
-    }
-
-  } // iterate over kernels

-  //
-  // Map feature IDs to set of warmup kernel IDs
-  //
-  std::set<KernelID> kernel_ids;
-  for ( auto fid = feature_ids.begin(); fid != feature_ids.end(); ++ fid ) {
+    } // iterate over kernels

-    switch (*fid) {
+    //
+    // Map feature IDs to set of warmup kernel IDs
+    //
+    for ( auto fid = feature_ids.begin(); fid != feature_ids.end(); ++ fid ) {

-      case Forall:
-      case Kernel:
-      case Launch:
-        kernel_ids.insert(Basic_DAXPY); break;
+      switch (*fid) {

-      case Sort:
-        kernel_ids.insert(Algorithm_SORT); break;
-
-      case Scan:
-        kernel_ids.insert(Basic_INDEXLIST_3LOOP); break;
+      case Forall:
+      case Kernel:
+      case Launch:
+        kernel_ids.insert(Basic_DAXPY); break;

-      case Workgroup:
-        kernel_ids.insert(Apps_HALOEXCHANGE_FUSED); break;
+      case Sort:
+        kernel_ids.insert(Algorithm_SORT); break;

-      case Reduction:
-        kernel_ids.insert(Basic_REDUCE3_INT); break;
+      case Scan:
+        kernel_ids.insert(Basic_INDEXLIST_3LOOP); break;

-      case Atomic:
-        kernel_ids.insert(Basic_PI_ATOMIC); break;
+      case Workgroup:
+        kernel_ids.insert(Comm_HALO_PACKING_FUSED); break;

-      case View:
-        break;
-
-      default:
-        break;
+      case Reduction:
+        kernel_ids.insert(Basic_REDUCE3_INT); break;
+
+      case Atomic:
+        kernel_ids.insert(Basic_PI_ATOMIC); break;
+
+      case View:
+        break;
+
+      #ifdef RAJA_PERFSUITE_ENABLE_MPI
+      case MPI:
+        kernel_ids.insert(Comm_HALO_EXCHANGE_FUSED); break;
+      #endif
+
+      default:
+        break;
+
+      }
     }
   }

+
   //
   // Run warmup kernels
   //
diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp
index c16286700..348fb44b7 100644
--- a/src/common/Executor.hpp
+++ b/src/common/Executor.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index 8d6012a6d..dcf309ec9 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -16,10 +16,12 @@

 #include "rajaperf_config.hpp"

+#include
+
 namespace rajaperf
 {

-namespace gpu_block_size
+namespace integer
 {

 namespace detail
 {

@@ -44,50 +46,40 @@ constexpr size_t lesser_of_squarest_factor_pair_helper(size_t n, size_t guess)
       : lesser_of_squarest_factor_pair_helper(n, guess - 1); // continue searching
 }

-// class to get the size of a camp::int_seq
-template < typename IntSeq >
-struct SizeOfIntSeq;
-///
-template < size_t... Is >
-struct SizeOfIntSeq<camp::int_seq<size_t, Is...>>
-{
-  static const size_t size = sizeof...(Is);
-};
-
 // class to help prepend integers to a list
-// this is used for the false case where I is not prepended to IntSeq
-template < bool B, size_t I, typename IntSeq >
+// this is used for the false case where T is not prepended to List
+template < bool B, typename T, typename List >
 struct conditional_prepend
 {
-  using type = IntSeq;
+  using type = List;
 };
-/// this is used for the true case where I is prepended to IntSeq
-template < size_t I, size_t... Is >
-struct conditional_prepend<true, I, camp::int_seq<size_t, Is...>>
+/// this is used for the true case where T is prepended to List
+template < typename T, typename... Ts >
+struct conditional_prepend<true, T, camp::list<Ts...>>
 {
-  using type = camp::int_seq<size_t, I, Is...>;
+  using type = camp::list<T, Ts...>;
 };

-// class to help create a sequence that is only the valid values in IntSeq
-template < typename validity_checker, typename IntSeq >
+// class to help create a sequence that is only the valid values in List
+template < typename validity_checker, typename List >
 struct remove_invalid;

 // base case where the list is empty, use the empty list
 template < typename validity_checker >
-struct remove_invalid<validity_checker, camp::int_seq<size_t>>
+struct remove_invalid<validity_checker, camp::list<>>
 {
-  using type = camp::int_seq<size_t>;
+  using type = camp::list<>;
 };

 // check validity of T and conditionally prepend T to a recursively generated
 // list of valid values
-template < typename validity_checker, size_t I, size_t... Is >
-struct remove_invalid<validity_checker, camp::int_seq<size_t, I, Is...>>
+template < typename validity_checker, typename T, typename... Ts >
+struct remove_invalid<validity_checker, camp::list<T, Ts...>>
 {
   using type = typename conditional_prepend<
-      validity_checker::template valid<I>(),
-      I,
-      typename remove_invalid<validity_checker, camp::int_seq<size_t, Is...>>::type
+      validity_checker::valid(T{}),
+      T,
+      typename remove_invalid<validity_checker, camp::list<Ts...>>::type
     >::type;
 };

@@ -119,55 +111,236 @@ constexpr size_t greater_of_squarest_factor_pair(size_t n)
 // always true
 struct AllowAny
 {
-  template < size_t I >
-  static constexpr bool valid() { return true; }
+  static constexpr bool valid(size_t RAJAPERF_UNUSED_ARG(i)) { return true; }
+};
+
+// true only if i > 0
+struct PositiveOnly
+{
+  static constexpr bool valid(size_t i) { return i > 0; }
 };

-// true if I is a multiple of N, false otherwise
+// true if i is a multiple of N, false otherwise
 template < size_t N >
 struct MultipleOf
 {
-  template < size_t I >
-  static constexpr bool valid() { return (I/N)*N == I; }
+  static constexpr bool valid(size_t i) { return (i/N)*N == i; }
 };

-// true if the sqrt of I is representable as a size_t, false otherwise
+// true if the sqrt of i is representable as a size_t, false otherwise
 struct ExactSqrt
 {
-  template < size_t I >
-  static constexpr bool valid() { return sqrt(I)*sqrt(I) == I; }
+  static constexpr bool valid(size_t i) { return sqrt(i)*sqrt(i) == i; }
 };

-template < size_t... block_sizes >
-using list_type = camp::int_seq<size_t, block_sizes...>;
+template < size_t N >
+struct LessEqual
+{
+  static constexpr bool valid(size_t i) { return i <= N; }
+};

-// A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes
-// if rajaperf::configuration::gpu_block_sizes is not empty
-// and a camp::int_seq of default_block_size otherwise
-// with invalid entries removed according to validity_checker
-template < size_t default_block_size, typename validity_checker = AllowAny >
-using make_list_type =
+// A camp::list of camp::integral_constant<size_t, I> types.
+// If gpu_block_sizes from the configuration is not empty it is those gpu_block_sizes,
+// otherwise it is a list containing just default_block_size.
+// Invalid entries are removed according to validity_checker in either case.
+template < size_t default_block_size, typename validity_checker = PositiveOnly >
+using make_gpu_block_size_list_type =
   typename detail::remove_invalid<validity_checker,
-    typename std::conditional< (SizeOfIntSeq<rajaperf::configuration::gpu_block_sizes>::size > 0),
+    typename std::conditional< (camp::size<rajaperf::configuration::gpu_block_sizes>::value > 0),
       rajaperf::configuration::gpu_block_sizes,
       list_type<default_block_size>
     >::type
   >::type;

-} // closing brace for gpu_block_size namespace
+// A camp::list of camp::integral_constant<size_t, I> types.
+// If atomic_replications from the configuration is not empty it is those atomic_replications,
+// otherwise it is a list containing just default_atomic_replication.
+// Invalid entries are removed according to validity_checker in either case.
+template < size_t default_atomic_replication, typename validity_checker = PositiveOnly >
+using make_atomic_replication_list_type =
+  typename detail::remove_invalid<validity_checker,
+    typename std::conditional< (camp::size<rajaperf::configuration::atomic_replications>::value > 0),
+      rajaperf::configuration::atomic_replications,
+      list_type<default_atomic_replication>
+    >::type
+  >::type;
+
+// A camp::list of camp::integral_constant<size_t, I> types.
+// If gpu_items_per_thread from the configuration is not empty it is those gpu_items_per_thread,
+// otherwise it is a list containing just default_gpu_items_per_thread.
+// Invalid entries are removed according to validity_checker in either case.
+template < size_t default_gpu_items_per_thread, typename validity_checker = PositiveOnly >
+using make_gpu_items_per_thread_list_type =
+  typename detail::remove_invalid<validity_checker,
+    typename std::conditional< (camp::size<rajaperf::configuration::gpu_items_per_thread>::value > 0),
+      rajaperf::configuration::gpu_items_per_thread,
+      list_type<default_gpu_items_per_thread>
+    >::type
+  >::type;
+
+} // closing brace for integer namespace

-//compile time loop over an integer sequence
-//this allows for creating a loop over a compile time constant variable
-template <typename IndexType, IndexType... Is, typename Func>
-inline void seq_for(camp::int_seq<IndexType, Is...> const&, Func&& func)
+namespace gpu_algorithm {
+
+struct block_atomic_helper
 {
-  // braced init lists are evaluated in order
-  int seq_unused_array[] = {(func(camp::integral_constant<IndexType, Is>{}), 0)...};
-  RAJAPERF_UNUSED_VAR(seq_unused_array);
-}
+  static constexpr bool atomic = true;
+  static std::string get_name() { return "blkatm"; }
+};
+
+struct block_device_helper
+{
+  static constexpr bool atomic = false;
+  static std::string get_name() { return "blkdev"; }
+};
+
+struct block_host_helper
+{
+  static constexpr bool atomic = false;
+  static std::string get_name() { return "blkhst"; }
+};
+
+using reducer_helpers = camp::list<
+                          block_atomic_helper,
+                          block_device_helper >;
+
+} // closing brace for gpu_algorithm namespace
+
+namespace gpu_mapping {
+
+struct global_direct_helper
+{
+  static constexpr bool direct = true;
+  static std::string get_name() { return "direct"; }
+};
+
+struct global_loop_occupancy_grid_stride_helper
+{
+  static constexpr bool direct = false;
+  static std::string get_name() { return "occgs"; }
+};
+
+using reducer_helpers = camp::list<
+                          global_direct_helper,
+                          global_loop_occupancy_grid_stride_helper >;
+
+} // closing brace for gpu_mapping namespace

 } // closing brace for rajaperf namespace

+// Get the max number of blocks to launch with the given MappingHelper
+// for kernel func with the given block_size and shmem.
+// This will use the occupancy calculator if MappingHelper::direct is false
+#define RAJAPERF_CUDA_GET_MAX_BLOCKS(MappingHelper, func, block_size, shmem) \
+  MappingHelper::direct \
+      ? std::numeric_limits<size_t>::max() \
+      : detail::getCudaOccupancyMaxBlocks( \
+            (func), (block_size), (shmem));
+///
+#define RAJAPERF_HIP_GET_MAX_BLOCKS(MappingHelper, func, block_size, shmem) \
+  MappingHelper::direct \
+      ? std::numeric_limits<size_t>::max() \
+      : detail::getHipOccupancyMaxBlocks( \
+            (func), (block_size), (shmem));
+
+// allocate pointer of pointer_type with length
+// device_ptr_name gets memory in the reduction data space for the current variant
+// host_ptr_name is set to either device_ptr_name if the reduction data space is
+// host accessible or a new allocation in a host accessible data space otherwise
+#define RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) \
+  DataSpace reduction_data_space = getReductionDataSpace(vid); \
+  DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); \
+  \
+  pointer_type device_ptr_name; \
+  allocData(reduction_data_space, device_ptr_name, (length)*(replication)); \
+  pointer_type host_ptr_name = device_ptr_name; \
+  if (reduction_data_space != host_data_space) { \
+    allocData(host_data_space, host_ptr_name, (length)*(replication)); \
+  }
+
+// deallocate device_ptr_name and host_ptr_name
+// must be in the same scope as RAJAPERF_GPU_REDUCER_SETUP_IMPL
+#define RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) \
+  deallocData(reduction_data_space, device_ptr_name); \
+  if (reduction_data_space != host_data_space) { \
+    deallocData(host_data_space, host_ptr_name); \
+  }
+
+// Initialize device_ptr_name with length copies of init_value
+// host_ptr_name will be used as an intermediary with an explicit copy
+// if the reduction data space is not host accessible
+#define RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(gpu_type, init_value, device_ptr_name, host_ptr_name, length, replication) \
+  if (device_ptr_name != host_ptr_name) { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+      for (size_t r = 0; r < static_cast<size_t>(replication); ++r) { \
+        host_ptr_name[i*(replication) + r] = (init_value); \
+      } \
+    } \
+    gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \
+        (length)*(replication)*sizeof(device_ptr_name[0]), \
+        gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \
+  } else { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+      for (size_t r = 0; r < static_cast<size_t>(replication); ++r) { \
+        device_ptr_name[i*(replication) + r] = (init_value); \
+      } \
+    } \
+  }
+
+// Initialize device_ptr_name with values in init_ptr
+// host_ptr_name will be used as an intermediary with an explicit copy
+// if the reduction data space is not host accessible
+#define RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(gpu_type, init_ptr, device_ptr_name, host_ptr_name, length, replication) \
+  if (device_ptr_name != host_ptr_name) { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+      for (size_t r = 0; r < static_cast<size_t>(replication); ++r) { \
+        host_ptr_name[i*(replication) + r] = (init_ptr)[i]; \
+      } \
+    } \
+    gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \
+        (length)*(replication)*sizeof(device_ptr_name[0]), \
+        gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \
+  } else { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+      for (size_t r = 0; r < static_cast<size_t>(replication); ++r) { \
+        device_ptr_name[i*(replication) + r] = (init_ptr)[i]; \
+      } \
+    } \
+  }
+
+// Copy back data from device_ptr_name into host_ptr_name
+// if the reduction data space is not host accessible
+#define RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(gpu_type, device_ptr_name, host_ptr_name, length, replication) \
+  if (device_ptr_name != host_ptr_name) { \
+    gpu_type##Errchk( gpu_type##MemcpyAsync( host_ptr_name, device_ptr_name, \
(length)*(replication)*sizeof(device_ptr_name[0]), \ + gpu_type##MemcpyDeviceToHost, res.get_stream() ) ); \ + } \ + gpu_type##Errchk( gpu_type##StreamSynchronize( res.get_stream() ) ); + +#define RAJAPERF_CUDA_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_CUDA_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ + RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) +#define RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(cuda, init_value, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_CUDA_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(cuda, init_ptr, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_CUDA_REDUCER_COPY_BACK(device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, device_ptr_name, host_ptr_name, length, replication) + +#define RAJAPERF_HIP_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_HIP_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ + RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) +#define RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(hip, init_value, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_HIP_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(hip, init_ptr, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_HIP_REDUCER_COPY_BACK(device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, device_ptr_name, host_ptr_name, length, replication) + + // #define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(kernel, variant) \ void kernel::run##variant##Variant(VariantID vid, size_t tune_idx) \ @@ -190,7 +363,11 @@ inline void seq_for(camp::int_seq const&, Func&& func) seq_for(gpu_block_sizes_type{}, [&](auto block_size) { \ if (run_params.numValidGPUBlockSize() == 0u || \ run_params.validGPUBlockSize(block_size)) { \ - addVariantTuningName(vid, "block_"+std::to_string(block_size)); \ + if (block_size == 0u) { \ + addVariantTuningName(vid, "block_auto"); \ + } else { \ + addVariantTuningName(vid, "block_"+std::to_string(block_size)); \ + } \ } \ }); \ } diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 8046fe785..14c1b7381 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -29,6 +29,33 @@ namespace rajaperf { +/*! + * \brief Method for launching a HIP kernel with given configuration. + * + * Note: method checks whether number of args and their types in + * kernel signature matches args passed to this method. 
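+ *
+ * For example, given a kernel declared as
+ *   __global__ void axpy(Real_ptr y, Real_ptr x, Real_type a, Index_type n);
+ * a launch through this helper would look like
+ *   RPlaunchHipKernel( axpy, dim3(nblocks), dim3(block_size),
+ *                      shmem, res.get_stream(),
+ *                      y, x, a, n );
+ * (axpy and its arguments are illustrative, not part of the suite).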
+ */
+template <typename... KernArgs, typename... Args>
+void RPlaunchHipKernel(void (*kernel)(KernArgs...),
+                       const dim3& numBlocks, const dim3& dimBlocks,
+                       std::uint32_t sharedMemBytes, hipStream_t stream,
+                       Args const&... args)
+{
+  static_assert(sizeof...(KernArgs) == sizeof...(Args),
+                "Number of kernel args doesn't match what's passed to method");
+
+  static_assert(conjunction<std::is_same<std::decay_t<KernArgs>, std::decay_t<Args>>...>::value,
+                "Kernel arg types don't match what's passed to method");
+
+  constexpr size_t count = sizeof...(Args);
+  void* arg_arr[count]{(void*)&args...};
+
+  auto k = reinterpret_cast<const void*>(kernel);
+  hipErrchk( hipLaunchKernel(k, numBlocks, dimBlocks,
+                             arg_arr,
+                             sharedMemBytes, stream) );
+}
+
 /*!
  * \brief Simple forall hip kernel that runs a lambda.
  */
@@ -81,10 +108,37 @@ inline int getHipDevice()
   return device;
 }

+/*!
+ * \brief Get properties of the current hip device.
+ */
+inline hipDeviceProp_t getHipDeviceProp()
+{
+  hipDeviceProp_t prop;
+  hipErrchk(hipGetDeviceProperties(&prop, getHipDevice()));
+  return prop;
+}
+
+/*!
+ * \brief Get max occupancy in blocks for the given kernel for the current
+ *        hip device.
+ */
+template < typename Func >
+RAJA_INLINE
+int getHipOccupancyMaxBlocks(Func&& func, int num_threads, size_t shmem_size)
+{
+  int max_blocks = -1;
+  hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &max_blocks, func, num_threads, shmem_size));
+
+  size_t multiProcessorCount = getHipDeviceProp().multiProcessorCount;
+
+  return max_blocks * multiProcessorCount;
+}
+
 /*
  * Copy memory len bytes from src to dst.
  */
-inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len)
+inline void copyHipData(void* dst_ptr, const void* src_ptr, Size_type len)
 {
   hipErrchk( hipMemcpy( dst_ptr, src_ptr, len, hipMemcpyDefault ) );

@@ -93,7 +147,7 @@ inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len)
 /*!
  * \brief Allocate HIP device data array (dptr).
  */
-inline void* allocHipDeviceData(size_t len)
+inline void* allocHipDeviceData(Size_type len)
 {
   void* dptr = nullptr;
   hipErrchk( hipMalloc( &dptr, len ) );
@@ -103,7 +157,7 @@ inline void* allocHipDeviceData(size_t len)
 /*!
  * \brief Allocate HIP fine-grained device data array (dfptr).
  */
-inline void* allocHipDeviceFineData(size_t len)
+inline void* allocHipDeviceFineData(Size_type len)
 {
   void* dfptr = nullptr;
   hipErrchk( hipExtMallocWithFlags( &dfptr, len,
@@ -114,7 +168,7 @@ inline void* allocHipDeviceFineData(size_t len)
 /*!
  * \brief Allocate HIP managed data array (mptr).
  */
-inline void* allocHipManagedData(size_t len)
+inline void* allocHipManagedData(Size_type len)
 {
   void* mptr = nullptr;
   hipErrchk( hipMallocManaged( &mptr, len,
@@ -125,7 +179,7 @@ inline void* allocHipManagedData(size_t len)
 /*!
  * \brief Allocate HIP pinned data array (pptr).
  */
-inline void* allocHipPinnedData(size_t len)
+inline void* allocHipPinnedData(Size_type len)
 {
   void* pptr = nullptr;
   hipErrchk( hipHostMalloc( &pptr, len,
@@ -136,7 +190,7 @@ inline void* allocHipPinnedData(size_t len)
 /*!
  * \brief Allocate HIP fine-grained pinned data array (pfptr).
  */
-inline void* allocHipPinnedFineData(size_t len)
+inline void* allocHipPinnedFineData(Size_type len)
 {
   void* pfptr = nullptr;
   hipErrchk( hipHostMalloc( &pfptr, len,
@@ -147,7 +201,7 @@ inline void* allocHipPinnedFineData(size_t len)
 /*!
  * \brief Allocate HIP coarse-grained pinned data array (pcptr).
 */
-inline void* allocHipPinnedCoarseData(size_t len)
+inline void* allocHipPinnedCoarseData(Size_type len)
 {
   void* pcptr = nullptr;
   hipErrchk( hipHostMalloc( &pcptr, len,
@@ -158,7 +212,7 @@ inline void* allocHipPinnedCoarseData(size_t len)
 /*!
  * \brief Apply mem advice to HIP data array (ptr).
  */
-inline void adviseHipData(void* ptr, int len, hipMemoryAdvise advice, int device)
+inline void adviseHipData(void* ptr, size_t len, hipMemoryAdvise advice, int device)
 {
   hipErrchk( hipMemAdvise( ptr, len, advice, device ) );
 }
diff --git a/src/common/HipGridScan.hpp b/src/common/HipGridScan.hpp
new file mode 100644
index 000000000..c8c0c6e8b
--- /dev/null
+++ b/src/common/HipGridScan.hpp
@@ -0,0 +1,260 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/COPYRIGHT file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#if defined(RAJA_ENABLE_HIP)
+
+#include
+#include
+#include
+#include
+
+#include
+
+namespace rajaperf
+{
+namespace detail
+{
+namespace hip
+{
+
+//
+// Define magic numbers for HIP execution
+//
+const size_t warp_size = 64;
+const size_t max_static_shmem = 65536;
+
+
+// performs a grid scan on val and returns the results at each thread
+// in exclusive and inclusive; note that val is used as scratch space
+template < typename DataType, size_t block_size, size_t items_per_thread >
+struct GridScan
+{
+  using BlockScan = rocprim::block_scan<DataType, block_size>; //, rocprim::block_scan_algorithm::reduce_then_scan>;
+  using BlockExchange = rocprim::block_exchange<DataType, block_size, items_per_thread>;
+  using WarpReduce = rocprim::warp_reduce<DataType, warp_size>;
+
+  union SharedStorage {
+    typename BlockScan::storage_type block_scan_storage;
+    typename BlockExchange::storage_type block_exchange_storage;
+    typename WarpReduce::storage_type warp_reduce_storage;
+    volatile DataType prev_grid_count;
+  };
+
+  static constexpr size_t shmem_size = sizeof(SharedStorage);
+
+  __device__
+  static void grid_scan(const int block_id,
+                        DataType (&val)[items_per_thread],
+                        DataType (&exclusive)[items_per_thread],
+                        DataType (&inclusive)[items_per_thread],
+                        DataType* block_counts,
+                        DataType* grid_counts,
+                        unsigned* block_readys)
+  {
+    const bool first_block = (block_id == 0);
+    const bool last_block = (block_id == static_cast<int>(gridDim.x-1));
+    const bool last_thread = (threadIdx.x == block_size-1);
+    const bool last_warp = (threadIdx.x >= block_size - warp_size);
+    const int warp_index = (threadIdx.x % warp_size);
+    const unsigned long long warp_index_mask = (1ull << warp_index);
+    const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull);
+
+    __shared__ SharedStorage s_temp_storage;
+
+
+    BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage);
+    __syncthreads();
+
+
+    BlockScan().exclusive_scan(val, exclusive, DataType{0}, s_temp_storage.block_scan_storage);
+    __syncthreads();
+
+    for (size_t ti = 0; ti < items_per_thread; ++ti) {
+      inclusive[ti] = exclusive[ti] + val[ti];
+    }
+
+    BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage);
+    __syncthreads();
+    BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage);
+    __syncthreads();
+    if (first_block) {
+
+      if (!last_block && last_thread) {
+        block_counts[block_id] = inclusive[items_per_thread-1]; // write
inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } + + } else { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } + + // get prev_grid_count using last warp in block + if (last_warp) { + + DataType prev_grid_count = 0; + + // accumulate previous block counts into registers of warp + + int prev_block_base_id = block_id - warp_size; + + unsigned prev_block_ready = 0u; + unsigned long long prev_blocks_ready_ballot = 0ull; + unsigned long long prev_grids_ready_ballot = 0ull; + + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + do { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + + } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); + + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + + if (prev_grids_ready_ballot != 0ull) { + break; + } + + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + + prev_block_ready = 0u; + + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + + if (prev_block_id >= 0) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } + + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + } + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. 
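+      // At this point prev_grids_ready_ballot has a bit set for each lane in
+      // this (possibly partial) window whose block has published grid_counts
+      // (ready flag == 2u), and prev_blocks_ready_ballot marks lanes whose
+      // block_counts are ready. In this final window, lanes strictly above
+      // the highest grid-ready lane add that block's block_counts, the
+      // highest grid-ready lane itself adds grid_counts (the inclusive total
+      // through its block), and lower lanes add nothing; the WarpReduce
+      // below then combines these partial sums into prev_grid_count.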
+ if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } + + + WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); + prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_temp_storage.prev_grid_count = prev_grid_count; + } + } + + __syncthreads(); + DataType prev_grid_count = s_temp_storage.prev_grid_count; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } + } + } + +}; + + +namespace detail +{ + +template < typename T, size_t block_size, size_t max_items_per_thread > +struct grid_scan_max_items_per_thread + : std::conditional_t< (GridScan::shmem_size <= max_static_shmem), + grid_scan_max_items_per_thread, + std::integral_constant > +{ +}; + +} + +template < typename T, size_t block_size > +struct grid_scan_max_items_per_thread + : detail::grid_scan_max_items_per_thread +{ +}; + + +// tune grid scan to maximize throughput while minimizing items_per_thread + +// default tuning for unknown DataType or hip_arch +template < typename DataType, size_t block_size, size_t hip_arch, typename enable = void > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + grid_scan_max_items_per_thread::value / 2; +}; + +// tuning for gfx90a +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 910, std::enable_if_t > +{ + static constexpr size_t value = + (block_size <= 64) ? 6 : + (block_size <= 128) ? 4 : + (block_size <= 256) ? 4 : + (block_size <= 512) ? 4 : + (block_size <= 1024) ? 2 : 1; +}; + +// tuning for gfx942 +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 942, std::enable_if_t> +{ + static constexpr size_t value = + (block_size <= 64) ? 22 : + (block_size <= 128) ? 22 : + (block_size <= 256) ? 19 : + (block_size <= 512) ? 13 : + (block_size <= 1024) ? 7 : 1; +}; + +} // end namespace hip +} // end namespace detail +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 646c9bd8d..679001df5 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
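The gfx90a (910) and gfx942 tunings above are resolved at compile time. A
minimal sketch of how a caller might pick items_per_thread and sanity-check
the shared-memory budget; the double/256 combination is an illustrative
assumption, and the selected value depends on which tuning specialization
applies:

    // Sketch only: compile-time use of the HIP grid-scan tuning machinery.
    #include <cstddef>
    #include "common/HipGridScan.hpp"

    namespace sketch
    {
      constexpr size_t block_size = 256;
      constexpr size_t hip_arch = 910;   // gfx90a, as in the tuning above

      constexpr size_t items_per_thread =
        rajaperf::detail::hip::grid_scan_default_items_per_thread<
            double, block_size, hip_arch>::value;

      using grid_scan = rajaperf::detail::hip::GridScan<
          double, block_size, items_per_thread>;

      static_assert(grid_scan::shmem_size
                      <= rajaperf::detail::hip::max_static_shmem,
                    "grid scan shared storage must fit in static shared memory");
    }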
 //
@@ -11,6 +11,7 @@
 #include "RunParams.hpp"
 #include "OpenMPTargetDataUtils.hpp"
+#include "RAJA/RAJA.hpp"
 #include
 #include
 #include
@@ -37,7 +38,9 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)

   its_per_rep = -1;
   kernels_per_rep = -1;
-  bytes_per_rep = -1;
+  bytes_read_per_rep = -1;
+  bytes_written_per_rep = -1;
+  bytes_atomic_modify_written_per_rep = -1;
   FLOPs_per_rep = -1;

   running_variant = NumVariants;
@@ -68,6 +71,18 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                      CALI_ATTR_ASVALUE |
                                      CALI_ATTR_AGGREGATABLE |
                                      CALI_ATTR_SKIP_EVENTS);
+  Bytes_Read_Rep_attr = cali_create_attribute("BytesRead/Rep", CALI_TYPE_DOUBLE,
+                                     CALI_ATTR_ASVALUE |
+                                     CALI_ATTR_AGGREGATABLE |
+                                     CALI_ATTR_SKIP_EVENTS);
+  Bytes_Written_Rep_attr = cali_create_attribute("BytesWritten/Rep", CALI_TYPE_DOUBLE,
+                                     CALI_ATTR_ASVALUE |
+                                     CALI_ATTR_AGGREGATABLE |
+                                     CALI_ATTR_SKIP_EVENTS);
+  Bytes_AtomicModifyWritten_Rep_attr = cali_create_attribute("BytesAtomicModifyWritten/Rep", CALI_TYPE_DOUBLE,
+                                     CALI_ATTR_ASVALUE |
+                                     CALI_ATTR_AGGREGATABLE |
+                                     CALI_ATTR_SKIP_EVENTS);
   Flops_Rep_attr = cali_create_attribute("Flops/Rep", CALI_TYPE_DOUBLE,
                                      CALI_ATTR_ASVALUE |
                                      CALI_ATTR_AGGREGATABLE |
@@ -76,6 +91,14 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                      CALI_ATTR_ASVALUE |
                                      CALI_ATTR_AGGREGATABLE |
                                      CALI_ATTR_SKIP_EVENTS);
+  for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) {
+    FeatureID fid = static_cast<FeatureID>(i);
+    std::string feature = getFeatureName(fid);
+    Feature_attrs[feature] = cali_create_attribute(feature.c_str(), CALI_TYPE_INT,
+                                     CALI_ATTR_ASVALUE |
+                                     CALI_ATTR_AGGREGATABLE |
+                                     CALI_ATTR_SKIP_EVENTS);
+  }
 #endif
 }

@@ -167,13 +190,23 @@ void KernelBase::setVariantDefined(VariantID vid)
 #endif
       break;
     }
+
+    case Base_SYCL:
+    case RAJA_SYCL:
+    {
+#if defined(RAJA_ENABLE_SYCL)
+      setSyclTuningDefinitions(vid);
+#endif
+      break;
+    }
+
     // Required for running Kokkos
     case Kokkos_Lambda :
     {
 #if defined(RUN_KOKKOS)
-    setKokkosTuningDefinitions(vid);
+      setKokkosTuningDefinitions(vid);
 #endif
-    break;
+      break;
     }

     default : {
@@ -194,7 +227,7 @@ void KernelBase::setVariantDefined(VariantID vid)
 #endif
 }

-int KernelBase::getDataAlignment() const
+Size_type KernelBase::getDataAlignment() const
 {
   return run_params.getDataAlignment();
 }

@@ -227,6 +260,10 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const
     case RAJA_HIP :
       return run_params.getHipDataSpace();

+    case Base_SYCL :
+    case RAJA_SYCL :
+      return run_params.getSyclDataSpace();
+
     case Kokkos_Lambda :
       return run_params.getKokkosDataSpace();

@@ -235,9 +272,84 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const
   }
 }

-DataSpace KernelBase::getHostAccessibleDataSpace(VariantID vid) const
+DataSpace KernelBase::getMPIDataSpace(VariantID vid) const
+{
+  switch ( vid ) {
+
+    case Base_Seq :
+    case Lambda_Seq :
+    case RAJA_Seq :
+      return run_params.getSeqMPIDataSpace();
+
+    case Base_OpenMP :
+    case Lambda_OpenMP :
+    case RAJA_OpenMP :
+      return run_params.getOmpMPIDataSpace();
+
+    case Base_OpenMPTarget :
+    case RAJA_OpenMPTarget :
+      return run_params.getOmpTargetMPIDataSpace();
+
+    case Base_CUDA :
+    case Lambda_CUDA :
+    case RAJA_CUDA :
+      return run_params.getCudaMPIDataSpace();
+
+    case Base_HIP :
+    case Lambda_HIP :
+    case RAJA_HIP :
+      return run_params.getHipMPIDataSpace();
+
+    case Base_SYCL :
+    case RAJA_SYCL :
+      return run_params.getSyclMPIDataSpace();
+
+    case Kokkos_Lambda :
+      return run_params.getKokkosMPIDataSpace();
+
+    default:
+      throw std::invalid_argument("getMPIDataSpace : Unknown variant id");
+  }
+}
+
+DataSpace
KernelBase::getReductionDataSpace(VariantID vid) const
 {
-  return hostAccessibleDataSpace(getDataSpace(vid));
+  switch ( vid ) {
+
+    case Base_Seq :
+    case Lambda_Seq :
+    case RAJA_Seq :
+      return run_params.getSeqReductionDataSpace();
+
+    case Base_OpenMP :
+    case Lambda_OpenMP :
+    case RAJA_OpenMP :
+      return run_params.getOmpReductionDataSpace();
+
+    case Base_OpenMPTarget :
+    case RAJA_OpenMPTarget :
+      return run_params.getOmpTargetReductionDataSpace();
+
+    case Base_CUDA :
+    case Lambda_CUDA :
+    case RAJA_CUDA :
+      return run_params.getCudaReductionDataSpace();
+
+    case Base_HIP :
+    case Lambda_HIP :
+    case RAJA_HIP :
+      return run_params.getHipReductionDataSpace();
+
+    case Base_SYCL :
+    case RAJA_SYCL :
+      return run_params.getSyclReductionDataSpace();
+
+    case Kokkos_Lambda :
+      return run_params.getKokkosReductionDataSpace();
+
+    default:
+      throw std::invalid_argument("getReductionDataSpace : Unknown variant id");
+  }
 }

 void KernelBase::execute(VariantID vid, size_t tune_idx)
@@ -339,11 +451,22 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
 #endif
       break;
     }
+
+    case Base_SYCL:
+    case RAJA_SYCL:
+    {
+#if defined(RAJA_ENABLE_SYCL)
+      runSyclVariant(vid, tune_idx);
+#endif
+      break;
+    }
+
     case Kokkos_Lambda :
     {
 #if defined(RUN_KOKKOS)
       runKokkosVariant(vid, tune_idx);
 #endif
+      break;
     }

     default : {
@@ -384,7 +507,9 @@ void KernelBase::print(std::ostream& os) const
   }
   os << "\t\t\t its_per_rep = " << its_per_rep << std::endl;
   os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl;
-  os << "\t\t\t bytes_per_rep = " << bytes_per_rep << std::endl;
+  os << "\t\t\t bytes_read_per_rep = " << bytes_read_per_rep << std::endl;
+  os << "\t\t\t bytes_written_per_rep = " << bytes_written_per_rep << std::endl;
+  os << "\t\t\t bytes_atomic_modify_written_per_rep = " << bytes_atomic_modify_written_per_rep << std::endl;
   os << "\t\t\t FLOPs_per_rep = " << FLOPs_per_rep << std::endl;
   os << "\t\t\t num_exec: " << std::endl;
   for (unsigned j = 0; j < NumVariants; ++j) {
@@ -439,8 +564,16 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
     cali_set_double(Iters_Rep_attr,(double)getItsPerRep());
     cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
     cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
+    cali_set_double(Bytes_Read_Rep_attr,(double)getBytesReadPerRep());
+    cali_set_double(Bytes_Written_Rep_attr,(double)getBytesWrittenPerRep());
+    cali_set_double(Bytes_AtomicModifyWritten_Rep_attr,(double)getBytesAtomicModifyWrittenPerRep());
     cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
     cali_set_double(BlockSize_attr, getBlockSize());
+    for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) {
+      FeatureID fid = static_cast<FeatureID>(i);
+      std::string feature = getFeatureName(fid);
+      cali_set_int(Feature_attrs[feature], usesFeature(fid));
+    }
   }
 }

@@ -454,7 +587,8 @@ void KernelBase::doOnceCaliMetaEnd(VariantID vid, size_t tune_idx)
 void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
                                        std::string tstr,
                                        const std::string& outdir,
-                                       const std::string& addToConfig)
+                                       const std::string& addToSpotConfig,
+                                       const std::string& addToCaliConfig)
 {
   static bool ran_spot_config_check = false;
   bool config_ok = true;
@@ -476,8 +610,21 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
         { "expr": "any(max#Iterations/Rep)", "as": "Iterations/Rep" },
         { "expr": "any(max#Kernels/Rep)", "as": "Kernels/Rep" },
         { "expr": "any(max#Bytes/Rep)", "as": "Bytes/Rep" },
+        { "expr": "any(max#BytesRead/Rep)", "as": "BytesRead/Rep" },
+        { "expr": "any(max#BytesWritten/Rep)", "as":
"BytesWritten/Rep" }, + { "expr": "any(max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" }, { "expr": "any(max#Flops/Rep)", "as": "Flops/Rep" }, - { "expr": "any(max#BlockSize)", "as": "BlockSize" } + { "expr": "any(max#BlockSize)", "as": "BlockSize" }, + { "expr": "any(max#Forall)", "as": "FeatureForall" }, + { "expr": "any(max#Kernel)", "as": "FeatureKernel" }, + { "expr": "any(max#Launch)", "as": "FeatureLaunch" }, + { "expr": "any(max#Sort)", "as": "FeatureSort" }, + { "expr": "any(max#Scan)", "as": "FeatureScan" }, + { "expr": "any(max#Workgroup)", "as": "FeatureWorkgroup" }, + { "expr": "any(max#Reduction)", "as": "FeatureReduction" }, + { "expr": "any(max#Atomic)", "as": "FeatureAtomic" }, + { "expr": "any(max#View)", "as": "FeatureView" }, + { "expr": "any(max#MPI)", "as": "FeatureMPI" } ] }, { @@ -489,21 +636,47 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, { "expr": "any(any#max#Iterations/Rep)", "as": "Iterations/Rep" }, { "expr": "any(any#max#Kernels/Rep)", "as": "Kernels/Rep" }, { "expr": "any(any#max#Bytes/Rep)", "as": "Bytes/Rep" }, + { "expr": "any(any#max#BytesRead/Rep)", "as": "BytesRead/Rep" }, + { "expr": "any(any#max#BytesWritten/Rep)", "as": "BytesWritten/Rep" }, + { "expr": "any(any#max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" }, { "expr": "any(any#max#Flops/Rep)", "as": "Flops/Rep" }, - { "expr": "any(any#max#BlockSize)", "as": "BlockSize" } + { "expr": "any(any#max#BlockSize)", "as": "BlockSize" }, + { "expr": "any(any#max#Forall)", "as": "FeatureForall" }, + { "expr": "any(any#max#Kernel)", "as": "FeatureKernel" }, + { "expr": "any(any#max#Launch)", "as": "FeatureLaunch" }, + { "expr": "any(any#max#Sort)", "as": "FeatureSort" }, + { "expr": "any(any#max#Scan)", "as": "FeatureScan" }, + { "expr": "any(any#max#Workgroup)", "as": "FeatureWorkgroup" }, + { "expr": "any(any#max#Reduction)", "as": "FeatureReduction" }, + { "expr": "any(any#max#Atomic)", "as": "FeatureAtomic" }, + { "expr": "any(any#max#View)", "as": "FeatureView" }, + { "expr": "any(any#max#MPI)", "as": "FeatureMPI" } ] } ] } )json"; - if(!ran_spot_config_check && (!addToConfig.empty())) { + // Skip check if both empty + if ((!addToSpotConfig.empty() || !addToCaliConfig.empty()) && !ran_spot_config_check) { cali::ConfigManager cm; - std::string check_profile = "spot()," + addToConfig; + std::string check_profile; + // If both not empty + if (!addToSpotConfig.empty() && !addToCaliConfig.empty()) { + check_profile = "spot(" + addToSpotConfig + ")," + addToCaliConfig; + } + else if (!addToSpotConfig.empty()) { + check_profile = "spot(" + addToSpotConfig + ")"; + } + // if !addToCaliConfig.empty() + else { + check_profile = addToCaliConfig; + } + std::string msg = cm.check(check_profile.c_str()); if(!msg.empty()) { std::cerr << "Problem with Cali Config: " << check_profile << "\n"; - std::cerr << "Check your command line argument: " << addToConfig << "\n"; + std::cerr << msg << "\n"; config_ok = false; exit(-1); } @@ -519,9 +692,13 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, od = outdir + "/"; } std::string vstr = getVariantName(vid); - std::string profile = "spot(output=" + od + vstr + "-" + tstr + ".cali)"; - if(!addToConfig.empty()) { - profile += "," + addToConfig; + std::string profile = "spot(output=" + od + vstr + "-" + tstr + ".cali"; + if(!addToSpotConfig.empty()) { + profile += "," + addToSpotConfig; + } + profile += ")"; + if (!addToCaliConfig.empty()) { + profile += "," + addToCaliConfig; } std::cout << 
"Profile: " << profile << std::endl; mgr[vid][tstr].add_option_spec(kernel_info_spec); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index b3ce7c3e3..8c7069f3c 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,6 +25,11 @@ #if defined(RAJA_ENABLE_HIP) #include "RAJA/policy/hip/raja_hiperrchk.hpp" #endif +#if defined(RAJA_ENABLE_SYCL) +#include +#endif + +#include "camp/resource.hpp" #include #include @@ -97,7 +102,9 @@ class KernelBase void setDefaultReps(Index_type reps) { default_reps = reps; } void setItsPerRep(Index_type its) { its_per_rep = its; }; void setKernelsPerRep(Index_type nkerns) { kernels_per_rep = nkerns; }; - void setBytesPerRep(Index_type bytes) { bytes_per_rep = bytes;} + void setBytesReadPerRep(Index_type bytes) { bytes_read_per_rep = bytes;} + void setBytesWrittenPerRep(Index_type bytes) { bytes_written_per_rep = bytes;} + void setBytesAtomicModifyWrittenPerRep(Index_type bytes) { bytes_atomic_modify_written_per_rep = bytes;} void setFLOPsPerRep(Index_type FLOPs) { FLOPs_per_rep = FLOPs; } void setBlockSize(Index_type size) { kernel_block_size = size; } @@ -134,6 +141,11 @@ class KernelBase virtual void setKokkosTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } #endif +#if defined(RAJA_ENABLE_SYCL) + virtual void setSyclTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, getDefaultTuningName()); } +#endif + // // Getter methods used to generate kernel execution summary @@ -145,7 +157,10 @@ class KernelBase Index_type getDefaultReps() const { return default_reps; } Index_type getItsPerRep() const { return its_per_rep; }; Index_type getKernelsPerRep() const { return kernels_per_rep; }; - Index_type getBytesPerRep() const { return bytes_per_rep; } + Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count atomic_modify_write operations as a read and a write to match previous counting + Index_type getBytesReadPerRep() const { return bytes_read_per_rep; } + Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep; } + Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; } Index_type getFLOPsPerRep() const { return FLOPs_per_rep; } double getBlockSize() const { return kernel_block_size; } @@ -239,6 +254,17 @@ class KernelBase return camp::resources::Hip::get_default(); } #endif +#if defined(RAJA_ENABLE_SYCL) + camp::resources::Sycl getSyclResource() + { + /* + if (run_params.getGPUStream() == 0) { + return camp::resources::Sycl::SyclFromStream(0); + } + */ + return camp::resources::Sycl::get_default(); + } +#endif void synchronize() { @@ -256,24 +282,54 @@ class KernelBase hipErrchk( hipDeviceSynchronize() ); } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( running_variant == Base_SYCL || + running_variant == RAJA_SYCL ) { + getSyclResource().get_queue()->wait(); + } +#endif + } - int getDataAlignment() const; + Size_type getDataAlignment() const; DataSpace getDataSpace(VariantID vid) const; - DataSpace getHostAccessibleDataSpace(VariantID vid) const; + DataSpace getReductionDataSpace(VariantID vid) const; + 
DataSpace getMPIDataSpace(VariantID vid) const;

   template <typename T>
-  void allocData(DataSpace dataSpace, T& ptr, int len)
+  void allocData(DataSpace dataSpace, T& ptr, Size_type len)
   {
     rajaperf::allocData(dataSpace, ptr, len, getDataAlignment());
   }
+  template <typename T>
+  void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len)
+  {
+    rajaperf::allocAndInitData(dataSpace,
+                               ptr, len, getDataAlignment());
+  }
+
+  template <typename T>
+  void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, T val)
+  {
+    rajaperf::allocAndInitDataConst(dataSpace,
+                                    ptr, len, getDataAlignment(), val);
+  }
+
+  template <typename T>
+  rajaperf::AutoDataMover<T> scopedMoveData(DataSpace dataSpace, T*& ptr, Size_type len)
+  {
+    DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace);
+    rajaperf::moveData(hds, dataSpace, ptr, len, getDataAlignment());
+    return {dataSpace, hds, ptr, len, getDataAlignment()};
+  }
+
   template <typename T>
   void copyData(DataSpace dst_dataSpace, T* dst_ptr,
                 DataSpace src_dataSpace, const T* src_ptr,
-                int len)
+                Size_type len)
   {
     rajaperf::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, len);
   }
@@ -285,46 +341,47 @@ class KernelBase
   }

   template <typename T>
-  void allocData(T*& ptr, int len, VariantID vid)
+  void allocData(T*& ptr, Size_type len, VariantID vid)
   {
     rajaperf::allocData(getDataSpace(vid), ptr, len, getDataAlignment());
   }
   template <typename T>
-  void allocAndInitData(T*& ptr, int len, VariantID vid)
+  void allocAndInitData(T*& ptr, Size_type len, VariantID vid)
   {
     rajaperf::allocAndInitData(getDataSpace(vid), ptr, len, getDataAlignment());
   }
   template <typename T>
-  void allocAndInitDataConst(T*& ptr, int len, T val, VariantID vid)
+  void allocAndInitDataConst(T*& ptr, Size_type len, T val, VariantID vid)
   {
     rajaperf::allocAndInitDataConst(getDataSpace(vid), ptr, len, getDataAlignment(), val);
   }
   template <typename T>
-  void allocAndInitDataRandSign(T*& ptr, int len, VariantID vid)
+  void allocAndInitDataRandSign(T*& ptr, Size_type len, VariantID vid)
   {
     rajaperf::allocAndInitDataRandSign(getDataSpace(vid), ptr, len, getDataAlignment());
   }
   template <typename T>
-  void allocAndInitDataRandValue(T*& ptr, int len, VariantID vid)
+  void allocAndInitDataRandValue(T*& ptr, Size_type len, VariantID vid)
   {
     rajaperf::allocAndInitDataRandValue(getDataSpace(vid), ptr, len, getDataAlignment());
   }
   template <typename T>
-  rajaperf::AutoDataMover<T> scopedMoveData(T*& ptr, int len, VariantID vid)
+  rajaperf::AutoDataMover<T> scopedMoveData(T*& ptr, Size_type len, VariantID vid)
   {
-    rajaperf::moveData(getHostAccessibleDataSpace(vid), getDataSpace(vid),
-                       ptr, len, getDataAlignment());
-    return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()};
+    DataSpace ds = getDataSpace(vid);
+    DataSpace hds = rajaperf::hostCopyDataSpace(ds);
+    rajaperf::moveData(hds, ds, ptr, len, getDataAlignment());
+    return {ds, hds, ptr, len, getDataAlignment()};
   }
   template <typename T>
@@ -341,14 +398,21 @@ class KernelBase
   }

   template <typename T>
-  long double calcChecksum(T* ptr, int len, VariantID vid)
+  long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, VariantID RAJAPERF_UNUSED_ARG(vid))
+  {
+    return rajaperf::calcChecksum(dataSpace,
+                                  ptr, len, getDataAlignment(), 1.0);
+  }
+
+  template <typename T>
+  long double calcChecksum(T* ptr, Size_type len, VariantID vid)
   {
     return rajaperf::calcChecksum(getDataSpace(vid),
                                   ptr, len, getDataAlignment(), 1.0);
   }

   template <typename T>
-  long double calcChecksum(T* ptr, int len, Real_type scale_factor, VariantID vid)
+  long double calcChecksum(T* ptr, Size_type len, Real_type scale_factor, VariantID vid)
   {
     return rajaperf::calcChecksum(getDataSpace(vid),
                                   ptr, len, getDataAlignment(),
                                   scale_factor);
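
The scopedMoveData overloads above return an AutoDataMover so the data migrates back when the caller's scope closes. A condensed, self-contained model of that RAII contract (types here are toy stand-ins, not the suite's):

#include <cstddef>
#include <functional>

// Toy stand-in for rajaperf::AutoDataMover: stage data to a host-copy space on
// construction, restore it to the original space on destruction.
class ScopedMover {
public:
  using MoveFn = std::function<void(int /*dst*/, int /*src*/, double*&, std::size_t)>;
  ScopedMover(int data_space, int host_space, double*& ptr, std::size_t len, MoveFn move)
    : ds_(data_space), hds_(host_space), ptr_(ptr), len_(len), move_(std::move(move))
  { move_(hds_, ds_, ptr_, len_); }                // like moveData(hds, ds, ptr, len, align)
  ~ScopedMover() { move_(ds_, hds_, ptr_, len_); } // move back when the scope closes
private:
  int ds_, hds_; double*& ptr_; std::size_t len_; MoveFn move_;
};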
@@ -406,6 +470,13 @@ class KernelBase
   virtual void runOpenMPTargetVariant(VariantID vid, size_t tune_idx) = 0;
 #endif

+#if defined(RAJA_ENABLE_SYCL)
+  virtual void runSyclVariant(VariantID vid, size_t tune_idx)
+  {
+    getCout() << "\n KernelBase: Unimplemented Sycl variant id = " << vid << std::endl;
+  }
+#endif
+
 #if defined(RUN_KOKKOS)
   virtual void runKokkosVariant(VariantID vid, size_t tune_idx)
   {
@@ -413,6 +484,7 @@ class KernelBase
   }
 #endif

+
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   void caliperOn() { doCaliperTiming = true; }
   void caliperOff() { doCaliperTiming = false; }
@@ -421,7 +493,8 @@ class KernelBase
   static void setCaliperMgrVariantTuning(VariantID vid,
                            std::string tstr,
                            const std::string& outdir,
-                           const std::string& addToConfig);
+                           const std::string& addToSpotConfig,
+                           const std::string& addToCaliConfig);
   static void setCaliperMgrStart(VariantID vid, std::string tstr) { mgr[vid][tstr].start(); }
   static void setCaliperMgrStop(VariantID vid, std::string tstr) { mgr[vid][tstr].stop(); }
@@ -482,7 +555,9 @@ class KernelBase
   //
   Index_type its_per_rep;
   Index_type kernels_per_rep;
-  Index_type bytes_per_rep;
+  Index_type bytes_read_per_rep;
+  Index_type bytes_written_per_rep;
+  Index_type bytes_atomic_modify_written_per_rep;
   Index_type FLOPs_per_rep;
   double kernel_block_size = nan(""); // Set default value for non GPU kernels
@@ -501,8 +576,12 @@ class KernelBase
   cali_id_t Iters_Rep_attr;
   cali_id_t Kernels_Rep_attr;
   cali_id_t Bytes_Rep_attr;
+  cali_id_t Bytes_Read_Rep_attr;
+  cali_id_t Bytes_Written_Rep_attr;
+  cali_id_t Bytes_AtomicModifyWritten_Rep_attr;
   cali_id_t Flops_Rep_attr;
   cali_id_t BlockSize_attr;
+  std::map<std::string, cali_id_t> Feature_attrs;

   // we need a Caliper Manager object per variant
diff --git a/src/common/KokkosViewUtils.hpp b/src/common/KokkosViewUtils.hpp
index 856fcb6f1..65a700030 100644
--- a/src/common/KokkosViewUtils.hpp
+++ b/src/common/KokkosViewUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp
index b5c98cb97..b6875c7f7 100644
--- a/src/common/OpenMPTargetDataUtils.hpp
+++ b/src/common/OpenMPTargetDataUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -47,7 +47,7 @@ namespace detail
 /*
  * Copy memory len bytes from src to dst.
  */
-inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len,
+inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, Size_type len,
                                  int dst_did, int src_did)
 {
   omp_target_memcpy( dst_ptr, const_cast<void*>(src_ptr), len,
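
A hedged usage sketch for the copy helper above and the alloc/init/get helpers that follow, written against the raw OpenMP runtime calls they wrap (byte counts are explicit here to avoid any element-versus-byte ambiguity; assumes an offloading toolchain):

#include <cstddef>
#include <omp.h>
#include <vector>

void roundTrip(std::vector<double>& host)
{
  int hid = omp_get_initial_device();
  int did = omp_get_default_device();
  std::size_t bytes = host.size() * sizeof(double);

  void* dptr = omp_target_alloc(bytes, did);                   // cf. allocOpenMPDeviceData
  omp_target_memcpy(dptr, host.data(), bytes, 0, 0, did, hid); // cf. initOpenMPDeviceData
  // ... run a target region that reads/writes dptr ...
  omp_target_memcpy(host.data(), dptr, bytes, 0, 0, hid, did); // cf. getOpenMPDeviceData
  omp_target_free(dptr, did);
}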
@@ -58,7 +58,7 @@ inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len,
 * \brief Allocate device data array (dptr) and copy given hptr (host)
 * data to device array.
 */
-inline void* allocOpenMPDeviceData(size_t len,
+inline void* allocOpenMPDeviceData(Size_type len,
                                    int did = getOpenMPTargetDevice())
 {
   return omp_target_alloc( len, did);
@@ -83,7 +83,7 @@ inline void deallocOpenMPDeviceData(void* dptr,
 * and of proper size for copy operation to succeed.
 */
 template <typename T>
-void initOpenMPDeviceData(T* dptr, const T* hptr, int len,
+void initOpenMPDeviceData(T* dptr, const T* hptr, Size_type len,
                           int did = getOpenMPTargetDevice(),
                           int hid = getOpenMPTargetHost())
 {
@@ -97,7 +97,7 @@ void initOpenMPDeviceData(T* dptr, const T* hptr, int len,
 * and of proper size for copy operation to succeed.
 */
 template <typename T>
-void getOpenMPDeviceData(T* hptr, const T* dptr, int len,
+void getOpenMPDeviceData(T* hptr, const T* dptr, Size_type len,
                          int hid = getOpenMPTargetHost(),
                          int did = getOpenMPTargetDevice())
 {
diff --git a/src/common/OutputUtils.cpp b/src/common/OutputUtils.cpp
index 87648a545..fbd7f3653 100644
--- a/src/common/OutputUtils.cpp
+++ b/src/common/OutputUtils.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/common/OutputUtils.hpp b/src/common/OutputUtils.hpp
index 5641401e9..197721133 100644
--- a/src/common/OutputUtils.hpp
+++ b/src/common/OutputUtils.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp
index 085c058c4..c55dd83bd 100644
--- a/src/common/RAJAPerfSuite.cpp
+++ b/src/common/RAJAPerfSuite.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -35,6 +35,7 @@
 #include "basic/REDUCE3_INT.hpp"
 #include "basic/REDUCE_STRUCT.hpp"
 #include "basic/TRAP_INT.hpp"
+#include "basic/MULTI_REDUCE.hpp"

 //
 // Lcals kernels...
@@ -86,12 +87,11 @@
 #include "apps/EDGE3D.hpp"
 #include "apps/ENERGY.hpp"
 #include "apps/FIR.hpp"
-#include "apps/HALOEXCHANGE.hpp"
-#include "apps/HALOEXCHANGE_FUSED.hpp"
 #include "apps/LTIMES.hpp"
 #include "apps/LTIMES_NOVIEW.hpp"
 #include "apps/MASS3DEA.hpp"
 #include "apps/MASS3DPA.hpp"
+#include "apps/MATVEC_3D_STENCIL.hpp"
 #include "apps/NODAL_ACCUMULATION_3D.hpp"
 #include "apps/PRESSURE.hpp"
 #include "apps/VOL3D.hpp"
@@ -106,6 +106,19 @@
 #include "algorithm/REDUCE_SUM.hpp"
 #include "algorithm/MEMSET.hpp"
 #include "algorithm/MEMCPY.hpp"
+#include "algorithm/ATOMIC.hpp"
+#include "algorithm/HISTOGRAM.hpp"
+
+//
+// Comm kernels...
+// +#include "comm/HALO_PACKING.hpp" +#include "comm/HALO_PACKING_FUSED.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "comm/HALO_SENDRECV.hpp" +#include "comm/HALO_EXCHANGE.hpp" +#include "comm/HALO_EXCHANGE_FUSED.hpp" +#endif #include @@ -133,6 +146,7 @@ static const std::string GroupNames [] = std::string("Stream"), std::string("Apps"), std::string("Algorithm"), + std::string("Comm"), std::string("Unknown Group") // Keep this at the end and DO NOT remove.... @@ -175,6 +189,7 @@ static const std::string KernelNames [] = std::string("Basic_REDUCE3_INT"), std::string("Basic_REDUCE_STRUCT"), std::string("Basic_TRAP_INT"), + std::string("Basic_MULTI_REDUCE"), // // Lcals kernels... @@ -226,12 +241,11 @@ static const std::string KernelNames [] = std::string("Apps_EDGE3D"), std::string("Apps_ENERGY"), std::string("Apps_FIR"), - std::string("Apps_HALOEXCHANGE"), - std::string("Apps_HALOEXCHANGE_FUSED"), std::string("Apps_LTIMES"), std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DEA"), std::string("Apps_MASS3DPA"), + std::string("Apps_MATVEC_3D_STENCIL"), std::string("Apps_NODAL_ACCUMULATION_3D"), std::string("Apps_PRESSURE"), std::string("Apps_VOL3D"), @@ -246,6 +260,19 @@ static const std::string KernelNames [] = std::string("Algorithm_REDUCE_SUM"), std::string("Algorithm_MEMSET"), std::string("Algorithm_MEMCPY"), + std::string("Algorithm_ATOMIC"), + std::string("Algorithm_HISTOGRAM"), + +// +// Comm kernels... +// + std::string("Comm_HALO_PACKING"), + std::string("Comm_HALO_PACKING_FUSED"), +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + std::string("Comm_HALO_SENDRECV"), + std::string("Comm_HALO_EXCHANGE"), + std::string("Comm_HALO_EXCHANGE_FUSED"), +#endif std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... @@ -288,6 +315,9 @@ static const std::string VariantNames [] = std::string("Kokkos_Lambda"), + std::string("Base_SYCL"), + std::string("RAJA_SYCL"), + std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... }; // END VariantNames @@ -321,6 +351,10 @@ static const std::string FeatureNames [] = std::string("View"), +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + std::string("MPI"), +#endif + std::string("Unknown Feature") // Keep this at the end and DO NOT remove.... }; // END FeatureNames @@ -348,6 +382,10 @@ static const std::string DataSpaceNames [] = std::string("CudaPinned"), std::string("CudaManaged"), + std::string("CudaManagedHostPreferred"), + std::string("CudaManagedDevicePreferred"), + std::string("CudaManagedHostPreferredDeviceAccessed"), + std::string("CudaManagedDevicePreferredHostAccessed"), std::string("CudaDevice"), std::string("HipHostAdviseFine"), @@ -361,6 +399,14 @@ static const std::string DataSpaceNames [] = std::string("HipDevice"), std::string("HipDeviceFine"), + std::string("SyclPinned"), + std::string("SyclManaged"), + std::string("SyclDevice"), + + std::string("Unknown Memory"), // Keep this at the end and DO NOT remove.... + + std::string("Copy"), + std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... }; // END VariantNames @@ -478,6 +524,13 @@ bool isVariantAvailable(VariantID vid) } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( vid == Base_SYCL || + vid == RAJA_SYCL ) { + ret_val = true; + } +#endif + return ret_val; } @@ -539,6 +592,13 @@ bool isVariantGPU(VariantID vid) } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( vid == Base_SYCL || + vid == RAJA_SYCL ) { + ret_val = true; + } +#endif + return ret_val; } @@ -570,7 +630,7 @@ const std::string& getDataSpaceName(DataSpace ds) /*! 
 *******************************************************************************
 *
- * Return true if the allocate associated with DataSpace enum value is available.
+ * Return true if the allocator associated with DataSpace enum value is available.
 *
 *******************************************************************************
 */
@@ -579,24 +639,37 @@ bool isDataSpaceAvailable(DataSpace dataSpace)
 {
   bool ret_val = false;

   switch (dataSpace) {
-    case DataSpace::Host:
-      ret_val = true; break;
+
+    case DataSpace::Host: {
+      ret_val = true;
+      break;
+    }

 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
-    case DataSpace::Omp:
-      ret_val = true; break;
+    case DataSpace::Omp: {
+      ret_val = true;
+      break;
+    }
 #endif

 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-    case DataSpace::OmpTarget:
-      ret_val = true; break;
+    case DataSpace::OmpTarget: {
+      ret_val = true;
+      break;
+    }
 #endif

 #if defined(RAJA_ENABLE_CUDA)
     case DataSpace::CudaPinned:
     case DataSpace::CudaManaged:
-    case DataSpace::CudaDevice:
-      ret_val = true; break;
+    case DataSpace::CudaManagedHostPreferred:
+    case DataSpace::CudaManagedDevicePreferred:
+    case DataSpace::CudaManagedHostPreferredDeviceAccessed:
+    case DataSpace::CudaManagedDevicePreferredHostAccessed:
+    case DataSpace::CudaDevice: {
+      ret_val = true;
+      break;
+    }
 #endif

 #if defined(RAJA_ENABLE_HIP)
@@ -613,17 +686,57 @@ bool isDataSpaceAvailable(DataSpace dataSpace)
     case DataSpace::HipManagedAdviseCoarse:
 #endif
     case DataSpace::HipDevice:
-    case DataSpace::HipDeviceFine:
-      ret_val = true; break;
+    case DataSpace::HipDeviceFine: {
+      ret_val = true;
+      break;
+    }
 #endif

-    default:
-      ret_val = false; break;
-  }
+#if defined(RAJA_ENABLE_SYCL)
+    case DataSpace::SyclPinned:
+    case DataSpace::SyclManaged:
+    case DataSpace::SyclDevice: {
+      ret_val = true;
+      break;
+    }
+#endif
+
+    default: {
+      ret_val = false;
+      break;
+    }
+
+  } // close switch (dataSpace)

   return ret_val;
 }

+/*!
+ *******************************************************************************
+ *
+ * Return true if the DataSpace enum value is a pseudo DataSpace.
+ *
+ *******************************************************************************
+ */
+bool isPseudoDataSpace(DataSpace dataSpace)
+{
+  bool ret_val = false;
+
+  switch (dataSpace) {
+
+    case DataSpace::Copy: {
+      ret_val = true;
+      break;
+    }
+    default: {
+      ret_val = false;
+      break;
+    }
+
+  }
+
+  return ret_val;
+}

 /*
 *******************************************************************************
@@ -714,6 +827,10 @@ KernelBase* getKernelObject(KernelID kid,
       kernel = new basic::TRAP_INT(run_params);
       break;
    }
+    case Basic_MULTI_REDUCE : {
+      kernel = new basic::MULTI_REDUCE(run_params);
+      break;
+    }

 //
 // Lcals kernels...
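
A caller-side sketch combining the two predicates above; per the RunParams handling later in this diff, pseudo spaces such as Copy are accepted for MPI buffer options even though they are not directly allocatable (usableForMPIBuffers is illustrative, not a suite function; it assumes RAJAPerfSuite.hpp is included):

// A space is usable for an MPI buffer option if it is either a real,
// allocatable space in this build, or a pseudo space like DataSpace::Copy.
inline bool usableForMPIBuffers(DataSpace ds)
{
  return isDataSpaceAvailable(ds) || isPseudoDataSpace(ds);
}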
@@ -871,14 +988,6 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::FIR(run_params); break; } - case Apps_HALOEXCHANGE : { - kernel = new apps::HALOEXCHANGE(run_params); - break; - } - case Apps_HALOEXCHANGE_FUSED : { - kernel = new apps::HALOEXCHANGE_FUSED(run_params); - break; - } case Apps_LTIMES : { kernel = new apps::LTIMES(run_params); break; @@ -895,6 +1004,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::MASS3DPA(run_params); break; } + case Apps_MATVEC_3D_STENCIL : { + kernel = new apps::MATVEC_3D_STENCIL(run_params); + break; + } case Apps_NODAL_ACCUMULATION_3D : { kernel = new apps::NODAL_ACCUMULATION_3D(run_params); break; @@ -939,6 +1052,40 @@ KernelBase* getKernelObject(KernelID kid, kernel = new algorithm::MEMCPY(run_params); break; } + case Algorithm_ATOMIC: { + kernel = new algorithm::ATOMIC(run_params); + break; + } + case Algorithm_HISTOGRAM: { + kernel = new algorithm::HISTOGRAM(run_params); + break; + } + +// +// Comm kernels... +// + case Comm_HALO_PACKING : { + kernel = new comm::HALO_PACKING(run_params); + break; + } + case Comm_HALO_PACKING_FUSED : { + kernel = new comm::HALO_PACKING_FUSED(run_params); + break; + } +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + case Comm_HALO_SENDRECV : { + kernel = new comm::HALO_SENDRECV(run_params); + break; + } + case Comm_HALO_EXCHANGE : { + kernel = new comm::HALO_EXCHANGE(run_params); + break; + } + case Comm_HALO_EXCHANGE_FUSED : { + kernel = new comm::HALO_EXCHANGE_FUSED(run_params); + break; + } +#endif default: { getCout() << "\n Unknown Kernel ID = " << kid << std::endl; @@ -949,6 +1096,7 @@ KernelBase* getKernelObject(KernelID kid, return kernel; } + // subclass of streambuf that ignores overflow // never printing anything to the underlying stream struct NullStream : std::streambuf, std::ostream diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 3270a4090..35a400b32 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,6 +52,7 @@ enum GroupID { Stream, Apps, Algorithm, + Comm, NumGroups // Keep this one last and DO NOT remove (!!) @@ -94,6 +95,7 @@ enum KernelID { Basic_REDUCE3_INT, Basic_REDUCE_STRUCT, Basic_TRAP_INT, + Basic_MULTI_REDUCE, // // Lcals kernels... @@ -145,12 +147,11 @@ enum KernelID { Apps_EDGE3D, Apps_ENERGY, Apps_FIR, - Apps_HALOEXCHANGE, - Apps_HALOEXCHANGE_FUSED, Apps_LTIMES, Apps_LTIMES_NOVIEW, Apps_MASS3DEA, Apps_MASS3DPA, + Apps_MATVEC_3D_STENCIL, Apps_NODAL_ACCUMULATION_3D, Apps_PRESSURE, Apps_VOL3D, @@ -165,6 +166,19 @@ enum KernelID { Algorithm_REDUCE_SUM, Algorithm_MEMSET, Algorithm_MEMCPY, + Algorithm_ATOMIC, + Algorithm_HISTOGRAM, + +// +// Comm kernels... +// + Comm_HALO_PACKING, + Comm_HALO_PACKING_FUSED, +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + Comm_HALO_SENDRECV, + Comm_HALO_EXCHANGE, + Comm_HALO_EXCHANGE_FUSED, +#endif NumKernels // Keep this one last and NEVER comment out (!!) @@ -206,6 +220,9 @@ enum VariantID { Kokkos_Lambda, + Base_SYCL, + RAJA_SYCL, + NumVariants // Keep this one last and NEVER comment out (!!) 
}; @@ -238,6 +255,10 @@ enum FeatureID { View, +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI, +#endif + NumFeatures // Keep this one last and NEVER comment out (!!) }; @@ -266,6 +287,10 @@ enum struct DataSpace { CudaPinned, CudaManaged, + CudaManagedHostPreferred, + CudaManagedDevicePreferred, + CudaManagedHostPreferredDeviceAccessed, + CudaManagedDevicePreferredHostAccessed, CudaDevice, HipHostAdviseFine, @@ -279,7 +304,15 @@ enum struct DataSpace { HipDevice, HipDeviceFine, - NumSpaces // Keep this one last and NEVER comment out (!!) + SyclPinned, + SyclManaged, + SyclDevice, + + NumSpaces, // Keep this one here and NEVER comment out (!!) + + Copy, + + EndPseudoSpaces // Keep this one last and NEVER comment out (!!) }; @@ -365,12 +398,21 @@ const std::string& getDataSpaceName(DataSpace cd); /*! ******************************************************************************* * - * Return true if the allocate associated with DataSpace enum value is available. + * Return true if the allocator associated with DataSpace enum value is available. * ******************************************************************************* */ bool isDataSpaceAvailable(DataSpace dataSpace); +/*! + ******************************************************************************* + * + * Return true if the DataSpace enum value is a pseudo DataSpace. + * + ******************************************************************************* + */ +bool isPseudoDataSpace(DataSpace dataSpace); + /*! ******************************************************************************* * diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index b86f6b7b6..9ec2566eb 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -61,6 +61,16 @@ using Index_type = RAJA::Index_type; using Index_ptr = Index_type*; +/*! + ****************************************************************************** + * + * \brief Type used for sizing allocations. + * + ****************************************************************************** + */ +using Size_type = size_t; + + /*! ****************************************************************************** * @@ -95,10 +105,14 @@ using Checksum_type = long double; #if defined(RP_USE_DOUBLE) /// using Real_type = double; +/// +#define Real_MPI_type MPI_DOUBLE #elif defined(RP_USE_FLOAT) /// using Real_type = float; +/// +#define Real_MPI_type MPI_FLOAT #else #error Real_type is undefined! diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 96e6821a8..1665783a9 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -38,12 +38,27 @@ RunParams::RunParams(int argc, char** argv) size(0.0), size_factor(0.0), data_alignment(RAJA::DATA_ALIGN), + multi_reduce_num_bins(10), + multi_reduce_bin_assignment_algorithm(BinAssignmentAlgorithm::RunsRandomSizes), + ltimes_num_d(64), + ltimes_num_g(32), + ltimes_num_m(25), + array_of_ptrs_array_size(ARRAY_OF_PTRS_MAX_ARRAY_SIZE), + halo_width(1), + halo_num_vars(3), gpu_stream(1), gpu_block_sizes(), + atomic_replications(), + items_per_threads(), + mpi_size(1), + mpi_rank(0), + mpi_3d_division({-1, -1, -1}), pf_tol(0.1), checkrun_reps(1), reference_variant(), reference_vid(NumVariants), + warmup_kernel_input(), + invalid_warmup_kernel_input(), kernel_input(), invalid_kernel_input(), exclude_kernel_input(), @@ -115,11 +130,37 @@ void RunParams::print(std::ostream& str) const str << "\n size = " << size; str << "\n size_factor = " << size_factor; str << "\n data_alignment = " << data_alignment; + + str << "\n multi_reduce_num_bins = " << multi_reduce_num_bins; + str << "\n multi_reduce_bin_assignment_algorithm = " << BinAssignmentAlgorithmToStr(multi_reduce_bin_assignment_algorithm); + + str << "\n ltimes_num_d = " << ltimes_num_d; + str << "\n ltimes_num_g = " << ltimes_num_g; + str << "\n ltimes_num_m = " << ltimes_num_m; + + str << "\n array_of_ptrs_array_size = " << array_of_ptrs_array_size; + + str << "\n halo_width = " << halo_width; + str << "\n halo_num_vars = " << halo_num_vars; + str << "\n gpu stream = " << ((gpu_stream == 0) ? "0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { str << "\n\t" << gpu_block_sizes[j]; } + str << "\n atomic_replications = "; + for (size_t j = 0; j < atomic_replications.size(); ++j) { + str << "\n\t" << atomic_replications[j]; + } + str << "\n items_per_threads = "; + for (size_t j = 0; j < items_per_threads.size(); ++j) { + str << "\n\t" << items_per_threads[j]; + } + str << "\n mpi_size = " << mpi_size; + str << "\n mpi_3d_division = "; + for (size_t j = 0; j < 3; ++j) { + str << "\n\t" << mpi_3d_division[j]; + } str << "\n pf_tol = " << pf_tol; str << "\n checkrun_reps = " << checkrun_reps; str << "\n reference_variant = " << reference_variant; @@ -140,6 +181,30 @@ void RunParams::print(std::ostream& str) const str << "\n cuda data space = " << getDataSpaceName(cudaDataSpace); str << "\n hip data space = " << getDataSpaceName(hipDataSpace); str << "\n kokkos data space = " << getDataSpaceName(kokkosDataSpace); + str << "\n sycl data space = " << getDataSpaceName(syclDataSpace); + + str << "\n seq reduction data space = " << getDataSpaceName(seqReductionDataSpace); + str << "\n omp reduction data space = " << getDataSpaceName(ompReductionDataSpace); + str << "\n omp target reduction data space = " << getDataSpaceName(ompTargetReductionDataSpace); + str << "\n cuda reduction data space = " << getDataSpaceName(cudaReductionDataSpace); + str << "\n hip reduction data space = " << getDataSpaceName(hipReductionDataSpace); + str << "\n kokkos reduction data space = " << getDataSpaceName(kokkosReductionDataSpace); + + str << "\n seq MPI data space = " << getDataSpaceName(seqMPIDataSpace); + str << "\n omp MPI data space = " << getDataSpaceName(ompMPIDataSpace); + str << "\n omp target MPI data space = " << getDataSpaceName(ompTargetMPIDataSpace); + str << "\n cuda MPI data space = " << getDataSpaceName(cudaMPIDataSpace); + str << "\n hip MPI data space = " << getDataSpaceName(hipMPIDataSpace); + str << "\n kokkos MPI data space = " << getDataSpaceName(kokkosMPIDataSpace); + 
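
The per-backend reduction and MPI spaces printed above suggest RunParams stores one DataSpace member per backend behind a trivial getter; a hedged shape sketch (member names appear in the option parsing later in this diff, getter names in KernelBase::getReductionDataSpace, but this class body is inferred, not copied from RunParams.hpp):

// Hedged sketch of the accessor pattern; defaults follow the help text below
// (Host for sequential, CudaManagedDevicePreferredHostAccessed for CUDA).
class RunParamsLike {
public:
  DataSpace getSeqReductionDataSpace() const  { return seqReductionDataSpace; }
  DataSpace getCudaReductionDataSpace() const { return cudaReductionDataSpace; }
private:
  DataSpace seqReductionDataSpace  = DataSpace::Host;
  DataSpace cudaReductionDataSpace = DataSpace::CudaManagedDevicePreferredHostAccessed;
};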
+ str << "\n warmup_kernel_input = "; + for (size_t j = 0; j < warmup_kernel_input.size(); ++j) { + str << "\n\t" << warmup_kernel_input[j]; + } + str << "\n invalid_warmup_kernel_input = "; + for (size_t j = 0; j < invalid_warmup_kernel_input.size(); ++j) { + str << "\n\t" << invalid_warmup_kernel_input[j]; + } str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { @@ -232,6 +297,11 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) { getCout() << "\n\nReading command line input..." << std::endl; +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); +#endif + for (int i = 1; i < argc; ++i) { std::string opt(argv[i]); @@ -403,6 +473,192 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--multi_reduce_num_bins") ) { + + i++; + if ( i < argc ) { + long long num_bins = ::atoll( argv[i] ); + long long min_num_bins = 1; + if ( num_bins < min_num_bins ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num_bins + << std::endl; + input_state = BadInput; + } else { + multi_reduce_num_bins = num_bins; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--multi_reduce_bin_assignment_algorithm") ) { + + i++; + if ( i < argc ) { + + std::string bin_assignment_algorithm_name(argv[i]); + + if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Random)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::Random; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsRandomSizes)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::RunsRandomSizes; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsEvenSizes)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::RunsEvenSizes; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Single)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::Single; + } else { + getCout() << "\nBad input:" + << " must give " << opt << " one of the following values\n" + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Random) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsRandomSizes) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsEvenSizes) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Single) + << std::endl; + input_state = BadInput; + invalid_npasses_combiner_input.emplace_back(bin_assignment_algorithm_name); + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (string)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--ltimes_num_d") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_d = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--ltimes_num_g") ) { + + i++; + if ( i < argc ) { + long long num = 
::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_g = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--ltimes_num_m") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_m = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--array_of_ptrs_array_size") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + long long max_num = ARRAY_OF_PTRS_MAX_ARRAY_SIZE; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else if ( num > max_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at most " << max_num + << std::endl; + input_state = BadInput; + } else { + array_of_ptrs_array_size = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--halo_width") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + halo_width = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--halo_num_vars") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + halo_num_vars = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -438,6 +694,99 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--atomic_replication") ) { + + bool got_someting = false; + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + got_someting = true; + int atomic_replication = ::atoi( opt.c_str() ); + if ( atomic_replication <= 0 ) { + getCout() << "\nBad input:" + << " must give --atomic_replication POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else { + atomic_replications.push_back(atomic_replication); + } + ++i; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give --atomic_replication one or more values (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--items_per_thread") ) { + + bool got_someting = false; + bool done = 
false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + got_someting = true; + int items_per_thread = ::atoi( opt.c_str() ); + if ( items_per_thread <= 0 ) { + getCout() << "\nBad input:" + << " must give --items_per_thread POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else { + items_per_threads.push_back(items_per_thread); + } + ++i; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give --items_per_thread one or more values (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--mpi_3d_division") ) { + + int num_got = 0; + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + num_got += 1; + int number = ::atoi( opt.c_str() ); + if ( number <= 0 ) { + getCout() << "\nBad input:" + << " must give --mpi_3d_division POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else if (num_got <= 3) { + mpi_3d_division[num_got-1] = number; + } + ++i; + } + } + if (num_got != 3) { + getCout() << "\nBad input:" + << " must give --mpi_3d_division three values (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--pass-fail-tol") || opt == std::string("-pftol") ) { @@ -451,6 +800,22 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--warmup-kernels") || + opt == std::string("-wk") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + warmup_kernel_input.push_back(opt); + ++i; + } + } + } else if ( opt == std::string("--kernels") || opt == std::string("-k") ) { @@ -525,11 +890,28 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) opt == std::string("-cds") || opt == std::string("--hip-data-space") || opt == std::string("-hds") || + opt == std::string("--sycl-data-space") || + opt == std::string("-syds") || opt == std::string("--kokkos-data-space") || - opt == std::string("-kds") ) { + opt == std::string("-kds") || + opt == std::string("--seq-reduction-data-space") || + opt == std::string("--omp-reduction-data-space") || + opt == std::string("--omptarget-reduction-data-space") || + opt == std::string("--cuda-reduction-data-space") || + opt == std::string("--hip-reduction-data-space") || + opt == std::string("--sycl-reduction-data-space") || + opt == std::string("--kokkos-reduction-data-space") || + opt == std::string("--seq-mpi-data-space") || + opt == std::string("--omp-mpi-data-space") || + opt == std::string("--omptarget-mpi-data-space") || + opt == std::string("--cuda-mpi-data-space") || + opt == std::string("--hip-mpi-data-space") || + opt == std::string("--sycl-mpi-data-space") || + opt == std::string("--kokkos-mpi-data-space") ) { bool got_someting = false; bool got_something_available = false; + bool got_something_pseudo = false; i++; if ( i < argc ) { auto opt_name = std::move(opt); @@ -537,11 +919,12 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( opt.at(0) == '-' ) { i--; } else { - for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { + for (int ids = 0; ids < static_cast(DataSpace::EndPseudoSpaces); ++ids) { DataSpace ds = static_cast(ids); if (getDataSpaceName(ds) == opt) { got_someting = true; got_something_available = isDataSpaceAvailable(ds); + 
got_something_pseudo = isPseudoDataSpace(ds); if ( opt_name == std::string("--seq-data-space") || opt_name == std::string("-sds") ) { seqDataSpace = ds; @@ -557,9 +940,47 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt_name == std::string("--hip-data-space") || opt_name == std::string("-hds") ) { hipDataSpace = ds; + } else if ( opt_name == std::string("--sycl-data-space") || + opt_name == std::string("-syds") ) { + syclDataSpace = ds; } else if ( opt_name == std::string("--kokkos-data-space") || opt_name == std::string("-kds") ) { kokkosDataSpace = ds; + } else if ( opt_name == std::string("--seq-reduction-data-space") ) { + seqReductionDataSpace = ds; + } else if ( opt_name == std::string("--omp-reduction-data-space") ) { + ompReductionDataSpace = ds; + } else if ( opt_name == std::string("--omptarget-reduction-data-space") ) { + ompTargetReductionDataSpace = ds; + } else if ( opt_name == std::string("--cuda-reduction-data-space") ) { + cudaReductionDataSpace = ds; + } else if ( opt_name == std::string("--hip-reduction-data-space") ) { + hipReductionDataSpace = ds; + } else if ( opt_name == std::string("--sycl-reduction-data-space") ) { + syclReductionDataSpace = ds; + } else if ( opt_name == std::string("--kokkos-reduction-data-space") ) { + kokkosReductionDataSpace = ds; + } else if ( opt_name == std::string("--seq-mpi-data-space") ) { + seqMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--omp-mpi-data-space") ) { + ompMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--omptarget-mpi-data-space") ) { + ompTargetMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--cuda-mpi-data-space") ) { + cudaMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--hip-mpi-data-space") ) { + hipMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--sycl-mpi-data-space") ) { + syclMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--kokkos-mpi-data-space") ) { + kokkosMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; } else { got_someting = false; } @@ -580,6 +1001,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } } + } else if ( std::string(argv[i]) == std::string("--tunings") || std::string(argv[i]) == std::string("-t") ) { @@ -727,6 +1149,17 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) add_to_spot_config = std::string( argv[i] ); } } + } else if ( std::string(argv[i]) == std::string("--add-to-cali-config") || + std::string(argv[i]) == std::string("-atcc") ) { + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + add_to_cali_config = std::string( argv[i] ); + } + } #endif } else { @@ -739,6 +1172,10 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } + if (input_state == InfoRequest) { + break; + } + } // Default size and size_meaning if unset @@ -747,6 +1184,55 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) size_factor = 1.0; } +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + + // assumes number is >= 0 + // 
returns {0} if number is 0
+  //         {1} if number is 1
+  //         {prime factors in non-decreasing order} otherwise
+  auto factorize = [](int number) {
+    std::vector<int> prime_factors;
+    int factor = 2;
+    while (factor <= std::sqrt(number)) {
+      int quotient = number / factor;
+      if (quotient * factor == number) {
+        prime_factors.emplace_back(factor);
+        number = quotient;
+      } else {
+        factor++;
+      }
+    }
+    prime_factors.emplace_back(number);
+    return prime_factors;
+  };
+
+  // Uses prime factors to set division
+  // to a relatively square grid
+  auto set_division = [](int* division, const int dims,
+                         std::vector<int> const& prime_factors) {
+    for (int d = 0; d < dims; ++d) {
+      division[d] = 1;
+    }
+
+    for (int factor : prime_factors) {
+
+      int min_d = 0;
+      for (int d = 1; d < dims; ++d) {
+        if (division[d] < division[min_d]) {
+          min_d = d;
+        }
+      }
+
+      division[min_d] *= factor;
+    }
+  };
+
+  if (mpi_3d_division[0] == -1) {
+    std::vector<int> prime_factors = factorize(mpi_size);
+    set_division(mpi_3d_division.data(), 3, prime_factors);
+  }
+#endif
+
   processNpassesCombinerInput();

   processKernelInput();
@@ -755,12 +1241,12 @@
   processTuningInput();

-  if ( input_state != BadInput &&
+  if ( input_state != InfoRequest &&
+       input_state != BadInput &&
        input_state != DryRun &&
-       input_state != CheckRun ) {
+       input_state != CheckRun) {
     input_state = PerfRun;
   }
-
 }
@@ -829,6 +1315,15 @@ void RunParams::printHelpMessage(std::ostream& str) const
   str << "\t --disable-warmup (disable warmup kernels) [Default is run warmup kernels that are relevant to kernels selected to run]\n\n";

+  str << "\t --warmup-kernels, -wk [Default is run warmup kernels that are relevant to kernels selected to run]\n"
+      << "\t (names of individual kernels and/or groups of kernels to warmup)\n"
+      << "\t See '--print-kernels'/'-pk' option for list of valid kernel and group names.\n"
+      << "\t Kernel names are listed as <group name>_<kernel name>.\n";
+  str << "\t\t Examples...\n"
+      << "\t\t --warmup-kernels Polybench (warmup all kernels in Polybench group)\n"
+      << "\t\t -wk INIT3 MULADDSUB (warmup INIT3 and MULADDSUB kernels)\n"
+      << "\t\t -wk INIT3 Apps (warmup INIT3 kernel and all kernels in Apps group)\n\n";
+
   str << "\t --kernels, -k [Default is run all]\n"
      << "\t (names of individual kernels and/or groups of kernels to run)\n"
      << "\t See '--print-kernels'/'-pk' option for list of valid kernel and group names.\n"
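
A standalone check of the mpi_3d_division defaulting heuristic above (the factorize/set_division pair), with the lambdas inlined: 12 ranks factor into {2, 2, 3}, and greedily multiplying the smallest axis yields a 2x2x3 grid.

#include <cmath>
#include <iostream>
#include <vector>

int main()
{
  int number = 12;                       // stand-in for mpi_size
  std::vector<int> factors;
  int factor = 2;
  while (factor <= std::sqrt(number)) {  // same loop shape as factorize above
    if (number % factor == 0) { factors.push_back(factor); number /= factor; }
    else { ++factor; }
  }
  factors.push_back(number);             // factors == {2, 2, 3}

  int division[3] = {1, 1, 1};
  for (int f : factors) {                // same greedy spread as set_division
    int min_d = 0;
    for (int d = 1; d < 3; ++d) if (division[d] < division[min_d]) min_d = d;
    division[min_d] *= f;
  }
  std::cout << division[0] << "x" << division[1] << "x" << division[2] << "\n"; // 2x2x3
}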
<< "\t GPU kernels not supporting gpu_block_size option will be skipped.\n" - << "\t Behavior depends on kernel implementations and \n" - << "\t values give via CMake variable RAJA_PERFSUITE_GPU_BLOCKSIZES.\n"; + << "\t Behavior depends on individual kernel implementations and \n" + << "\t compile configuration values given via CMake variable \n" + << "\t RAJA_PERFSUITE_GPU_BLOCKSIZES.\n"; str << "\t\t Example...\n" << "\t\t --gpu_block_size 128 256 512 (runs kernels with gpu_block_size 128, 256, and 512)\n\n"; + str << "\t --atomic_replication [no default]\n" + << "\t (atomic replications to run for all GPU kernels)\n" + << "\t GPU kernels not supporting atomic_replication option will be skipped.\n" + << "\t Behavior depends on kernel implementations and \n" + << "\t values give via CMake variable RAJA_PERFSUITE_ATOMIC_REPLICATIONS.\n"; + str << "\t\t Example...\n" + << "\t\t --atomic_replication 128 256 512 (runs kernels with atomic_replication 128, 256, and 512)\n\n"; + + str << "\t --items_per_thread [no default]\n" + << "\t (items per thread to run for all GPU kernels)\n" + << "\t GPU kernels not supporting items_per_thread option will be skipped.\n" + << "\t Behavior depends on kernel implementations and \n" + << "\t values give via CMake variable RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD.\n"; + str << "\t\t Example...\n" + << "\t\t --items_per_thread 128 256 512 (runs kernels with items_per_thread 128, 256, and 512)\n\n"; + + str << "\t --mpi_3d_division [no default]\n" + << "\t (number of mpi ranks in each dimension in a 3d grid)\n" + << "\t (3D MPI kernels will be skipped if the product of mpi_3d_division is not equal to the number of ranks)\n"; + str << "\t\t Example...\n" + << "\t\t --mpi_3d_division 2 3 5 (runs 3d MPI kernels on a 2 by 3 by 5 grid)\n\n"; + str << "\t --tunings, -t [Default is run all]\n" << "\t (names of tunings to run)\n" << "\t Note: knowing which tunings are available requires knowledge about the variants,\n" @@ -937,7 +1456,7 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t -et default library (exclude default and library tunings)\n\n"; str << "\t Options for selecting kernel data used in kernels....\n" - << "\t ======================================================\n\n";; + << "\t ======================================================\n\n"; str << "\t --data_alignment, -align [default is RAJA::DATA_ALIGN]\n" << "\t (minimum memory alignment for host allocations)\n" @@ -945,6 +1464,55 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t -align 4096 (allocates memory aligned to 4KiB boundaries)\n\n"; + str << "\t --multi_reduce_num_bins [default is 10]\n" + << "\t (number of bins used in multi-reduce kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --multi_reduce_num_bins 100\n\n"; + + str << "\t --multi_reduce_bin_assignment_algorithm [default is RunsRandomSizes]\n" + << "\t (algorithm used to assign bins to iterates in multi-reduce kernels)\n" + << "\t Valid assignment algorithm names are 'Random', 'RunsRandomSizes', 'RunsEvenSizes', or 'Single'\n"; + str << "\t\t Example...\n" + << "\t\t --multi_reduce_bin_assignment_algorithm Random\n\n"; + + str << "\t --ltimes_num_d [default is 64]\n" + << "\t (num_d used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --ltimes_num_d 32\n\n"; + + str << "\t --ltimes_num_g [default is 32]\n" + << "\t (num_g used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + 
str << "\t\t Example...\n" + << "\t\t --ltimes_num_g 64\n\n"; + + str << "\t --ltimes_num_m [default is 25]\n" + << "\t (num_m used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --ltimes_num_m 100\n\n"; + + str << "\t --array_of_ptrs_array_size [default is " << ARRAY_OF_PTRS_MAX_ARRAY_SIZE << "]\n" + << "\t (array size used in ARRAY_OF_PTRS kernel)\n" + << "\t Must be greater than 0.\n" + << "\t Must be less than or equal to " << ARRAY_OF_PTRS_MAX_ARRAY_SIZE << ".\n"; + str << "\t\t Example...\n" + << "\t\t --array_of_ptrs_array_size 4\n\n"; + + str << "\t --halo_width [default is 1]\n" + << "\t (halo width used in halo kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --halo_width 2\n\n"; + + str << "\t --halo_num_vars [default is 3]\n" + << "\t (num vars used in halo kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --halo_num_vars 10\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; @@ -980,17 +1548,105 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t --hip-data-space HipManaged (run HIP variants with Hip Managed memory)\n" << "\t\t -hds HipPinned (run HIP variants with Hip Pinned memory)\n\n"; + str << "\t --sycl-data-space, -syds [Default is SyclDevice]\n" + << "\t (names of data space to use for SYCL variants)\n" + << "\t Valid data space names are 'SyclDevice', 'SyclPinned', or 'SyclManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --sycl-data-space SyclManaged (run SYCL variants with Sycl Managed memory)\n" + << "\t\t -syds SyclPinned (run SYCL variants with Sycl Pinned memory)\n\n"; + str << "\t --kokkos-data-space, -kds [Default is Host]\n" << "\t (names of data space to use)\n"; str << "\t\t Examples...\n" << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" - << "\t\t -kds HipPinned (run KOKKOS variants with Hip Pinned memory)\n\n"; + << "\t\t -kds HipPinned (run KOKKOS variants with CUDA Pinned memory)\n\n"; + + str << "\t --seq-reduction-data-space [Default is Host]\n" + << "\t (name of data space to use with reductions for sequential variants)\n" + << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; + str << "\t\t Examples...\n" + << "\t\t --seq-reduction-data-space Host (run sequential variants with Host memory)\n\n"; + + str << "\t --omp-reduction-data-space [Default is Omp]\n" + << "\t (names of data space to use with reductions for OpenMP variants)\n" + << "\t Valid data space names are 'Host' or 'Omp'\n"; + str << "\t\t Examples...\n" + << "\t\t --omp-reduction-data-space Omp (run Omp variants with Omp memory)\n\n"; + + str << "\t --omptarget-reduction-data-space [Default is OmpTarget]\n" + << "\t (names of data space to use with reductions for OpenMP Target variants)\n" + << "\t Valid data space names are 'OmpTarget' or 'CudaPinned'\n"; + str << "\t\t Examples...\n" + << "\t\t --omptarget-reduction-data-space OmpTarget (run Omp Target variants with Omp Target memory)\n\n"; + + str << "\t --cuda-reduction-data-space [Default is CudaManagedDevicePreferredHostAccessed]\n" + << "\t (names of data space to use with reductions for CUDA variants)\n" + << "\t Valid data space names are 'CudaDevice', 'CudaPinned', or 'CudaManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --cuda-reduction-data-space CudaManaged (run CUDA variants with Cuda Managed 
memory)\n\n"; + + str << "\t --hip-reduction-data-space [Default is HipDevice]\n" + << "\t (names of data space to use with reductions for HIP variants)\n" + << "\t Valid data space names are 'HipDevice', 'HipPinned', or 'HipManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --hip-reduction-data-space HipManaged (run HIP variants with Hip Managed memory)\n\n"; + + str << "\t --sycl-reduction-data-space [Default is SyclDevice]\n" + << "\t (names of data space to use with reductions for SYCL variants)\n" + << "\t Valid data space names are 'SyclDevice', 'SyclPinned', or 'SyclManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --sycl-reduction-data-space SyclManaged (run SYCL variants with Sycl Managed memory)\n\n"; + + str << "\t --kokkos-reduction-data-space [Default is Host]\n" + << "\t (names of data space to use with reductions)\n"; + str << "\t\t Examples...\n" + << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" + << "\t\t -kds HipPinned (run KOKKOS variants with HIP Pinned memory)\n\n"; + + str << "\t --seq-mpi-data-space [Default is Host]\n" + << "\t (name of data space to use with MPI and sequential execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --seq-mpi-data-space Host (run sequential variants with Host memory for MPI buffers)\n\n"; + + str << "\t --omp-mpi-data-space [Default is Omp]\n" + << "\t (name of data space to use with MPI and OpenMP execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --omp-mpi-data-space Omp (run Omp variants with Omp memory for MPI buffers)\n\n"; + + str << "\t --omptarget-mpi-data-space [Default is Copy]\n" + << "\t (name of data space to use with MPI and OpenMP target execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --omptarget-mpi-data-space Copy (run Omp Target variants and copy to Host memory for MPI buffers)\n\n"; + + str << "\t --cuda-mpi-data-space [Default is CudaPinned]\n" + << "\t (name of data space to use with MPI and CUDA execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --cuda-mpi-data-space CudaPinned (run CUDA variants with Cuda Pinned memory for MPI buffers)\n\n"; + + str << "\t --hip-mpi-data-space [Default is HipPinned]\n" + << "\t (name of data space to use with MPI and HIP execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --hip-mpi-data-space Copy (run HIP variants and copy to Host memory for MPI buffers)\n\n"; + + str << "\t --sycl-mpi-data-space [Default is SyclPinned]\n" + << "\t (name of data space to use with MPI and SYCL execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --sycl-mpi-data-space Copy (run SYCL variants and copy to Host memory for MPI buffers)\n\n"; + + str << "\t --kokkos-mpi-data-space [Default is Copy]\n" + << "\t (name of data space to use with MPI and kokkos execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --kokkos-mpi-data-space Copy (run KOKKOS variants and copy to Host memory for MPI buffers)\n\n"; #if defined(RAJA_PERFSUITE_USE_CALIPER) str << "\t --add-to-spot-config, -atsc [Default is none]\n" - << "\t\t appends additional parameters to the built-in Caliper spot config\n"; + << "\t\t appends additional parameters to the built-in Caliper spot config (CALI_CONFIG=spot(...))\n"; str << "\t\t Example to include some PAPI counters (Intel arch)\n" << "\t\t -atsc topdown.all\n\n"; + str << "\t --add-to-cali-config, -atcc [Default is none]\n" + << "\t\t include parameters in the Caliper config (same as CALI_CONFIG=...)\n"; + str << "\t\t Example to include time spent in MPI functions\n" + << "\t\t -atcc mpi-report\n\n"; #endif str << 
std::endl; @@ -1034,21 +1690,27 @@ void RunParams::printVariantNames(std::ostream& str) const void RunParams::printDataSpaceNames(std::ostream& str) const { str << "\nAvailable data spaces:"; - str << "\n-------------------\n"; + str << "\n----------------------\n"; for (int ids = 0; ids < static_cast<int>(DataSpace::NumSpaces); ++ids) { DataSpace ds = static_cast<DataSpace>(ids); if (isDataSpaceAvailable(ds)) { str << getDataSpaceName(ds) << std::endl; } } - str << "\nUnavailable data spaces:"; - str << "\n-------------------\n"; + str << "\nUnavailable data spaces in current build configuration:"; + str << "\n-------------------------------------------------------\n"; for (int ids = 0; ids < static_cast<int>(DataSpace::NumSpaces); ++ids) { DataSpace ds = static_cast<DataSpace>(ids); if (!isDataSpaceAvailable(ds)) { str << getDataSpaceName(ds) << std::endl; } } + str << "\nPseudo data spaces:"; + str << "\n-------------------\n"; + for (int ids = static_cast<int>(DataSpace::NumSpaces)+1; ids < static_cast<int>(DataSpace::EndPseudoSpaces); ++ids) { + DataSpace ds = static_cast<DataSpace>(ids); + str << getDataSpaceName(ds) << std::endl; + } str.flush(); } @@ -1316,6 +1978,77 @@ void RunParams::processKernelInput() // // ================================================================ + run_warmup_kernels.clear(); + + if ( !warmup_kernel_input.empty() ) { + + // + // Need to parse input to determine which warmup kernels to run + // + + // Make list copy of warmup kernel name input to manipulate for + // processing potential group names and/or kernel names, next + Slist warmup_kern_names(warmup_kernel_input.begin(), warmup_kernel_input.end()); + + // + // Search warmup_kern_names for matching group names. + // warmup_groups2run will contain names of groups to run. + // + Svector warmup_groups2run; + for (Slist::iterator it = warmup_kern_names.begin(); it != warmup_kern_names.end(); ++it) + { + for (size_t ig = 0; ig < NumGroups; ++ig) { + const std::string& group_name = getGroupName(static_cast<GroupID>(ig)); + if ( group_name == *it ) { + warmup_groups2run.push_back(group_name); + } + } + } + + // + // If group name(s) found in warmup_kern_names, assemble kernels in group(s) + // to run and remove those group name(s) from warmup_kern_names list. + // + for (size_t ig = 0; ig < warmup_groups2run.size(); ++ig) { + const std::string& gname(warmup_groups2run[ig]); + + for (size_t kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast<KernelID>(kid); + if ( getFullKernelName(tkid).find(gname) != std::string::npos && + exclude_kernels.find(tkid) == exclude_kernels.end()) { + run_warmup_kernels.insert(tkid); + } + } + + warmup_kern_names.remove(gname); + } + + // + // Look for matching names of individual kernels in remaining warmup_kern_names. + // + for (Slist::iterator it = warmup_kern_names.begin(); it != warmup_kern_names.end(); ++it) + { + bool found_it = false; + + for (size_t kid = 0; kid < NumKernels && !found_it; ++kid) { + KernelID tkid = static_cast<KernelID>(kid); + if ( getKernelName(tkid) == *it || getFullKernelName(tkid) == *it ) { + if (exclude_kernels.find(tkid) == exclude_kernels.end()) { + run_warmup_kernels.insert(tkid); + } + found_it = true; + } + } + + // Assemble invalid input for output message. 
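+      // (Unmatched names collect in invalid_warmup_kernel_input and, once
+      // parsing finishes, flip input_state to BadInput -- mirroring how
+      // invalid_kernel_input is handled for the --kernels option.)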
+ if ( !found_it ) { + invalid_warmup_kernel_input.push_back(*it); + } + + } // iterate over kernel name input + + } + run_kernels.clear(); if ( kernel_input.empty() && feature_input.empty() ) { @@ -1465,7 +2198,8 @@ void RunParams::processKernelInput() // Set BadInput state based on invalid kernel input // - if ( !(invalid_kernel_input.empty()) || + if ( !(invalid_warmup_kernel_input.empty()) || + !(invalid_kernel_input.empty()) || !(invalid_exclude_kernel_input.empty()) ) { input_state = BadInput; } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index bfa8f8896..46cd78f4f 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -12,9 +12,15 @@ #include <string> #include <set> #include <vector> +#include <array> #include "RAJAPerfSuite.hpp" +#include "RPTypes.hpp" + + +#define ARRAY_OF_PTRS_MAX_ARRAY_SIZE 26 + namespace rajaperf { @@ -95,6 +101,37 @@ class RunParams { } } + /*! + * \brief Enumeration for the bin assignment algorithm used in multi-reduce kernels + */ + enum struct BinAssignmentAlgorithm : int { + Random, /*!< random bin for each iterate */ + RunsRandomSizes, /*!< each bin in turn is repeated a random number of times, Ex. 6 bins and 10 iterates [ 0 0 1 2 2 2 2 3 3 5] */ + RunsEvenSizes, /*!< each bin in turn is repeated the same number of times, Ex. 6 bins and 10 iterates [ 0 0 1 1 2 2 3 3 4 5] */ + Single /*!< use bin 0 for each iterate */ + }; + + /*! + * \brief Translate BinAssignmentAlgorithm enum value to string + */ + static std::string BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm baa) + { + switch (baa) { + case BinAssignmentAlgorithm::Random: + return "Random"; + case BinAssignmentAlgorithm::RunsRandomSizes: + return "RunsRandomSizes"; + case BinAssignmentAlgorithm::RunsEvenSizes: + return "RunsEvenSizes"; + case BinAssignmentAlgorithm::Single: + return "Single"; + default: + return "Unknown"; + } + } + /*! * \brief Return state of input parsed to this point. 
*/ @@ -119,7 +156,19 @@ class RunParams { double getSizeFactor() const { return size_factor; } - size_t getDataAlignment() const { return data_alignment; } + Size_type getDataAlignment() const { return data_alignment; } + + Index_type getMultiReduceNumBins() const { return multi_reduce_num_bins; } + BinAssignmentAlgorithm getMultiReduceBinAssignmentAlgorithm() const { return multi_reduce_bin_assignment_algorithm; } + + Index_type getLtimesNumD() const { return ltimes_num_d; } + Index_type getLtimesNumG() const { return ltimes_num_g; } + Index_type getLtimesNumM() const { return ltimes_num_m; } + + Index_type getArrayOfPtrsArraySize() const { return array_of_ptrs_array_size; } + + Index_type getHaloWidth() const { return halo_width; } + Index_type getHaloNumVars() const { return halo_num_vars; } int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } @@ -132,6 +181,31 @@ class RunParams { } return false; } + size_t numValidAtomicReplication() const { return atomic_replications.size(); } + bool validAtomicReplication(size_t atomic_replication) const + { + for (size_t valid_atomic_replication : atomic_replications) { + if (valid_atomic_replication == atomic_replication) { + return true; + } + } + return false; + } + size_t numValidItemsPerThread() const { return items_per_threads.size(); } + bool validItemsPerThread(size_t items_per_thread) const + { + for (size_t valid_items_per_thread : items_per_threads) { + if (valid_items_per_thread == items_per_thread) { + return true; + } + } + return false; + } + + int getMPISize() const { return mpi_size; } + int getMPIRank() const { return mpi_rank; } + bool validMPI3DDivision() const { return (mpi_3d_division[0]*mpi_3d_division[1]*mpi_3d_division[2] == mpi_size); } + std::array<int, 3> const& getMPI3DDivision() const { return mpi_3d_division; } DataSpace getSeqDataSpace() const { return seqDataSpace; } DataSpace getOmpDataSpace() const { return ompDataSpace; } @@ -139,6 +213,23 @@ class RunParams { DataSpace getCudaDataSpace() const { return cudaDataSpace; } DataSpace getHipDataSpace() const { return hipDataSpace; } DataSpace getKokkosDataSpace() const { return kokkosDataSpace; } + DataSpace getSyclDataSpace() const { return syclDataSpace; } + + DataSpace getSeqReductionDataSpace() const { return seqReductionDataSpace; } + DataSpace getOmpReductionDataSpace() const { return ompReductionDataSpace; } + DataSpace getOmpTargetReductionDataSpace() const { return ompTargetReductionDataSpace; } + DataSpace getCudaReductionDataSpace() const { return cudaReductionDataSpace; } + DataSpace getHipReductionDataSpace() const { return hipReductionDataSpace; } + DataSpace getSyclReductionDataSpace() const { return syclReductionDataSpace; } + DataSpace getKokkosReductionDataSpace() const { return kokkosReductionDataSpace; } + + DataSpace getSeqMPIDataSpace() const { return seqMPIDataSpace; } + DataSpace getOmpMPIDataSpace() const { return ompMPIDataSpace; } + DataSpace getOmpTargetMPIDataSpace() const { return ompTargetMPIDataSpace; } + DataSpace getCudaMPIDataSpace() const { return cudaMPIDataSpace; } + DataSpace getHipMPIDataSpace() const { return hipMPIDataSpace; } + DataSpace getSyclMPIDataSpace() const { return syclMPIDataSpace; } + DataSpace getKokkosMPIDataSpace() const { return kokkosMPIDataSpace; } double getPFTolerance() const { return pf_tol; } @@ -156,10 +247,12 @@ class RunParams { #if defined(RAJA_PERFSUITE_USE_CALIPER) const std::string& getAddToSpotConfig() const { return add_to_spot_config; } + 
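+  // (The valid*() helpers above share one contract: an empty list from the
+  // command line means "no restriction", otherwise membership is a linear
+  // scan. Every tuning-registration site therefore tests, e.g.,
+  // numValidGPUBlockSize() == 0u || validGPUBlockSize(block_size).)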
const std::string& getAddToCaliperConfig() const { return add_to_cali_config; } #endif bool getDisableWarmup() const { return disable_warmup; } + const std::set<KernelID>& getWarmupKernelIDsToRun() const { return run_warmup_kernels; } const std::set<KernelID>& getKernelIDsToRun() const { return run_kernels; } const std::set<VariantID>& getVariantIDsToRun() const { return run_variants; } VariantID getReferenceVariantID() const { return reference_vid; } @@ -208,10 +301,28 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multiplier (input option) */ - size_t data_alignment; + Size_type data_alignment; + + Index_type multi_reduce_num_bins; /*!< number of bins used in multi reduction kernels (input option) */ + BinAssignmentAlgorithm multi_reduce_bin_assignment_algorithm; /*!< algorithm used to assign bins to iterates used in multi reduction kernels (input option) */ + + Index_type ltimes_num_d; /*!< num_d used in ltimes kernels (input option) */ + Index_type ltimes_num_g; /*!< num_g used in ltimes kernels (input option) */ + Index_type ltimes_num_m; /*!< num_m used in ltimes kernels (input option) */ + + Index_type array_of_ptrs_array_size; /*!< number of pointers used in ARRAY_OF_PTRS kernel (input option) */ + + Index_type halo_width; /*!< halo width used in halo kernels (input option) */ + Index_type halo_num_vars; /*!< num vars used in halo kernels (input option) */ int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector<size_t> gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ + std::vector<size_t> atomic_replications; /*!< Atomic replications for gpu tunings to run (input option) */ + std::vector<size_t> items_per_threads; /*!< Items per thread for gpu tunings to run (input option) */ + + int mpi_size; /*!< Number of MPI ranks */ + int mpi_rank; /*!< Rank of this MPI process */ + std::array<int, 3> mpi_3d_division; /*!< Number of MPI ranks in each dimension of a 3D grid */ double pf_tol; /*!< pct RAJA variant run time can exceed base for each PM case to pass/fail acceptance */ @@ -228,11 +339,30 @@ class RunParams { DataSpace cudaDataSpace = DataSpace::CudaDevice; DataSpace hipDataSpace = DataSpace::HipDevice; DataSpace kokkosDataSpace = DataSpace::Host; + DataSpace syclDataSpace = DataSpace::SyclDevice; + + DataSpace seqReductionDataSpace = DataSpace::Host; + DataSpace ompReductionDataSpace = DataSpace::Omp; + DataSpace ompTargetReductionDataSpace = DataSpace::OmpTarget; + DataSpace cudaReductionDataSpace = DataSpace::CudaManagedDevicePreferredHostAccessed; + DataSpace hipReductionDataSpace = DataSpace::HipDevice; + DataSpace syclReductionDataSpace = DataSpace::SyclDevice; + DataSpace kokkosReductionDataSpace = DataSpace::Host; + + DataSpace seqMPIDataSpace = DataSpace::Host; + DataSpace ompMPIDataSpace = DataSpace::Omp; + DataSpace ompTargetMPIDataSpace = DataSpace::Copy; + DataSpace cudaMPIDataSpace = DataSpace::CudaPinned; + DataSpace hipMPIDataSpace = DataSpace::HipPinned; + DataSpace syclMPIDataSpace = DataSpace::SyclPinned; + DataSpace kokkosMPIDataSpace = DataSpace::Copy; // // Arrays to hold input strings for valid/invalid input. Helpful for // debugging command line args. 
// + std::vector<std::string> warmup_kernel_input; + std::vector<std::string> invalid_warmup_kernel_input; std::vector<std::string> kernel_input; std::vector<std::string> invalid_kernel_input; std::vector<std::string> exclude_kernel_input; @@ -258,10 +388,12 @@ class RunParams { #if defined(RAJA_PERFSUITE_USE_CALIPER) std::string add_to_spot_config; + std::string add_to_cali_config; #endif bool disable_warmup; + std::set<KernelID> run_warmup_kernels; std::set<KernelID> run_kernels; std::set<VariantID> run_variants; diff --git a/src/common/SyclDataUtils.hpp b/src/common/SyclDataUtils.hpp new file mode 100644 index 000000000..e426476c4 --- /dev/null +++ b/src/common/SyclDataUtils.hpp @@ -0,0 +1,157 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Methods for SYCL kernel data allocation, initialization, and deallocation. +/// + + +#ifndef RAJAPerf_SyclDataUtils_HPP +#define RAJAPerf_SyclDataUtils_HPP + +#include "RPTypes.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/GPUUtils.hpp" + +#include <sycl/sycl.hpp> + + +namespace rajaperf +{ + +/*! + * \brief Copy given hptr (host) data to SYCL device (dptr). + * + * Method assumes both host and device data arrays are allocated + * and of proper size for copy operation to succeed. + */ +template <typename T> +void initSyclDeviceData(T& dptr, const T hptr, int len, sycl::queue* qu) +{ + auto e = qu->memcpy( dptr, hptr, + len * sizeof(typename std::remove_pointer<T>::type)); + e.wait(); + + detail::incDataInitCount(); +} + +/*! + * \brief Allocate SYCL device data array (dptr) and copy given hptr (host) + * data to device array. + */ +template <typename T> +void allocAndInitSyclDeviceData(T& dptr, const T hptr, int len, sycl::queue *qu) +{ + dptr = sycl::malloc_device<typename std::remove_pointer<T>::type>(len, *qu); + + initSyclDeviceData(dptr, hptr, len, qu); +} + +/*! + * \brief Copy given dptr (SYCL device) data to host (hptr). + * + * Method assumes both host and device data arrays are allocated + * and of proper size for copy operation to succeed. + */ +template <typename T> +void getSyclDeviceData(T& hptr, const T dptr, int len, sycl::queue *qu) +{ + auto e = qu->memcpy( hptr, dptr, + len * sizeof(typename std::remove_pointer<T>::type)); + e.wait(); +} + +/*! + * \brief Free device data array. + */ +template <typename T> +void deallocSyclDeviceData(T& dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +namespace detail +{ +/* + * Copy memory len bytes from src to dst. + */ +inline void copySyclData(void* dst_ptr, const void* src_ptr, Size_type len, sycl::queue *qu) +{ + auto e = qu->memcpy( dst_ptr, src_ptr, len); + e.wait(); +} + +/*! + * \brief Allocate SYCL device data array (dptr). + */ +inline void* allocSyclDeviceData(Size_type len, sycl::queue *qu) +{ + void* dptr = nullptr; + dptr = sycl::malloc_device(len, *qu); + return dptr; +} + +/*! + * \brief Allocate SYCL managed data array (dptr). + */ +inline void* allocSyclManagedData(Size_type len, sycl::queue *qu) +{ + void* mptr = nullptr; + mptr = sycl::malloc_shared(len, *qu); + return mptr; +} + +/*! + * \brief Allocate SYCL pinned data array (pptr). + */ +inline void* allocSyclPinnedData(Size_type len, sycl::queue *qu) +{ + void* pptr = nullptr; + pptr = sycl::malloc_host(len, *qu); + return pptr; +} + + +/*! + * \brief Free device data array. 
+ */ +inline void deallocSyclDeviceData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +/*! + * \brief Free managed data array. + */ +inline void deallocSyclManagedData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +/*! + * \brief Free pinned data array. + */ +inline void deallocSyclPinnedData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +} // closing brace for detail namespace + +} // closing brace for rajaperf namespace + +#endif // RAJA_ENABLE_SYCL + +#endif // closing endif for header file include guard + diff --git a/src/lcals-kokkos/CMakeLists.txt b/src/lcals-kokkos/CMakeLists.txt index 47e5b48c8..7cb6706e8 100644 --- a/src/lcals-kokkos/CMakeLists.txt +++ b/src/lcals-kokkos/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp index 4c7dd6b39..b8d8311ba 100644 --- a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/EOS-Kokkos.cpp b/src/lcals-kokkos/EOS-Kokkos.cpp index be30c0b60..2046b540d 100644 --- a/src/lcals-kokkos/EOS-Kokkos.cpp +++ b/src/lcals-kokkos/EOS-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp index 02ae5097e..071e2687c 100644 --- a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp index cd2957436..ebc31ddff 100644 --- a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
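The SyclDataUtils.hpp helpers above inline to a short host/device round trip. A minimal standalone sketch of that round trip, assuming a SYCL build (queue setup and kernel omitted for brevity):

#include <sycl/sycl.hpp>
#include <vector>

int main()
{
  sycl::queue q;                       // default device
  const int len = 1024;
  std::vector<double> h(len, 1.0);     // host data

  // what allocAndInitSyclDeviceData does: device alloc + host-to-device copy
  double* d = sycl::malloc_device<double>(len, q);
  q.memcpy(d, h.data(), len * sizeof(double)).wait();

  // ... a kernel would run on d here ...

  // what getSyclDeviceData / deallocSyclDeviceData do: copy back + free
  q.memcpy(h.data(), d, len * sizeof(double)).wait();
  sycl::free(d, q);
  return 0;
}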
// diff --git a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp index b7da76fd0..37b2d0c41 100644 --- a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp index 00960c3aa..8dce97c22 100644 --- a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp +++ b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp index 20e05fde4..a2fdcfd02 100644 --- a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp index 45761b11e..e9b388105 100644 --- a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp index 451e6fe77..7609b3f3c 100644 --- a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp index b2c582790..e5263cf07 100644 --- a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp +++ b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp index ac0943dd8..f0ec388e7 100644 --- a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp +++ b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt index f767bbd0b..6fc819b2b 100644 --- a/src/lcals/CMakeLists.txt +++ b/src/lcals/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -14,65 +14,76 @@ blt_add_library( DIFF_PREDICT-Cuda.cpp DIFF_PREDICT-OMP.cpp DIFF_PREDICT-OMPTarget.cpp + DIFF_PREDICT-Sycl.cpp EOS.cpp EOS-Seq.cpp EOS-Hip.cpp EOS-Cuda.cpp EOS-OMP.cpp EOS-OMPTarget.cpp + EOS-Sycl.cpp FIRST_DIFF.cpp FIRST_DIFF-Seq.cpp FIRST_DIFF-Hip.cpp FIRST_DIFF-Cuda.cpp FIRST_DIFF-OMP.cpp FIRST_DIFF-OMPTarget.cpp + FIRST_DIFF-Sycl.cpp FIRST_MIN.cpp FIRST_MIN-Seq.cpp FIRST_MIN-Hip.cpp FIRST_MIN-Cuda.cpp FIRST_MIN-OMP.cpp FIRST_MIN-OMPTarget.cpp + FIRST_MIN-Sycl.cpp FIRST_SUM.cpp FIRST_SUM-Seq.cpp FIRST_SUM-Hip.cpp FIRST_SUM-Cuda.cpp FIRST_SUM-OMP.cpp FIRST_SUM-OMPTarget.cpp + FIRST_SUM-Sycl.cpp GEN_LIN_RECUR.cpp GEN_LIN_RECUR-Seq.cpp GEN_LIN_RECUR-Hip.cpp GEN_LIN_RECUR-Cuda.cpp GEN_LIN_RECUR-OMP.cpp GEN_LIN_RECUR-OMPTarget.cpp + GEN_LIN_RECUR-Sycl.cpp HYDRO_1D.cpp HYDRO_1D-Seq.cpp HYDRO_1D-Hip.cpp HYDRO_1D-Cuda.cpp HYDRO_1D-OMP.cpp HYDRO_1D-OMPTarget.cpp + HYDRO_1D-Sycl.cpp HYDRO_2D.cpp HYDRO_2D-Seq.cpp HYDRO_2D-Hip.cpp HYDRO_2D-Cuda.cpp HYDRO_2D-OMP.cpp HYDRO_2D-OMPTarget.cpp + HYDRO_2D-Sycl.cpp INT_PREDICT.cpp INT_PREDICT-Seq.cpp INT_PREDICT-Hip.cpp INT_PREDICT-Cuda.cpp INT_PREDICT-OMP.cpp INT_PREDICT-OMPTarget.cpp + INT_PREDICT-Sycl.cpp PLANCKIAN.cpp PLANCKIAN-Seq.cpp PLANCKIAN-Hip.cpp PLANCKIAN-Cuda.cpp PLANCKIAN-OMP.cpp PLANCKIAN-OMPTarget.cpp + PLANCKIAN-Sycl.cpp TRIDIAG_ELIM.cpp TRIDIAG_ELIM-Seq.cpp TRIDIAG_ELIM-Hip.cpp TRIDIAG_ELIM-Cuda.cpp TRIDIAG_ELIM-OMP.cpp TRIDIAG_ELIM-OMPTarget.cpp + TRIDIAG_ELIM-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index c66ca2598..a33f1aecf 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -52,10 +52,11 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - diff_predict<<>>( px, cx, - offset, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (diff_predict), + grid_size, block_size, + shmem, res.get_stream(), + px, cx, offset, iend ); } stopTimer(); diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 7bd49a994..6d25a6f42 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -53,10 +53,11 @@ void DIFF_PREDICT::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((diff_predict), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), px, cx, - offset, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (diff_predict), + grid_size, block_size, + shmem, res.get_stream(), + px, cx, offset, iend ); } stopTimer(); diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp index 09da23262..6e2110edb 100644 --- a/src/lcals/DIFF_PREDICT-OMP.cpp +++ b/src/lcals/DIFF_PREDICT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index e04b1e07d..3509b6aaa 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp index eae7cda8f..9dcd9a035 100644 --- a/src/lcals/DIFF_PREDICT-Seq.cpp +++ b/src/lcals/DIFF_PREDICT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp new file mode 100644 index 000000000..5ac815671 --- /dev/null +++ b/src/lcals/DIFF_PREDICT-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFF_PREDICT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include <sycl/sycl.hpp> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + +template <size_t work_group_size> +void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DIFF_PREDICT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) + { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + DIFF_PREDICT_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + DIFF_PREDICT_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n DIFF_PREDICT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DIFF_PREDICT, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index b5ddc90e4..40ff30713 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,7 +28,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (10*sizeof(Real_type) + 10*sizeof(Real_type)) * getActualProblemSize()); + setBytesReadPerRep( 10*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 10*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(9 * getActualProblemSize()); setUsesFeature(Forall); @@ -50,6 +52,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 3a583381b..7bd77eade 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
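The Base_SYCL variants above all size their nd_range the same way: the loop bound is rounded up to a whole number of work-groups, and the in-kernel `if (i < iend)` guard masks the padded iterations. A standalone restatement of that arithmetic (function name assumed):

#include <cstddef>

// Round iend up to the next multiple of work_group_size, as
// RAJA_DIVIDE_CEILING_INT does in the Base_SYCL variants above.
std::size_t sycl_global_size(std::size_t iend, std::size_t work_group_size)
{
  const std::size_t work_groups = (iend + work_group_size - 1) / work_group_size;
  return work_groups * work_group_size;
}
// e.g. iend = 1000, work_group_size = 256 -> 4 groups -> global size 1024;
// items 1000..1023 fail the bounds check and do nothing.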
// @@ -93,18 +93,24 @@ class DIFF_PREDICT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_px; Real_ptr m_cx; diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index a3583ca53..fafbdef56 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,10 +52,13 @@ void EOS::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - eos<<>>( x, y, z, u, - q, r, t, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (eos), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, u, + q, r, t, + iend ); } stopTimer(); diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 2cbd78891..35c80e320 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,10 +52,13 @@ void EOS::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((eos), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, z, u, - q, r, t, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (eos), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, u, + q, r, t, + iend ); } stopTimer(); diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp index 7ac9cdb8f..88e8e9da1 100644 --- a/src/lcals/EOS-OMP.cpp +++ b/src/lcals/EOS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
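The EOS-Cuda and EOS-Hip hunks above repeat the conversion seen for DIFF_PREDICT: raw chevron/hipLaunchKernelGGL launches plus per-call error checks become a single RPlaunchCudaKernel / RPlaunchHipKernel call. The real helpers live in RAJAPerf's common GPU utilities; the sketch below is only a guess at their shape, inferred from the call sites (kernel, grid, block, shared memory, stream, then kernel arguments):

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical thin wrapper in the spirit of RPlaunchCudaKernel (assumed
// signature, for illustration only).
template < typename Kernel, typename... Args >
void launch_cuda_kernel(Kernel kernel, size_t grid_size, size_t block_size,
                        size_t shmem, cudaStream_t stream, Args... args)
{
  kernel<<< dim3(grid_size), dim3(block_size), shmem, stream >>>(args...);
  if (cudaGetLastError() != cudaSuccess) {
    // centralizes the error check each call site used to do via cudaErrchk
  }
}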
// diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp index 16a6b841b..b9bf454eb 100644 --- a/src/lcals/EOS-OMPTarget.cpp +++ b/src/lcals/EOS-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index 083fc343e..384a9d260 100644 --- a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp new file mode 100644 index 000000000..898d25bc8 --- /dev/null +++ b/src/lcals/EOS-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EOS.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include <sycl/sycl.hpp> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + +template <size_t work_group_size> +void EOS::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + EOS_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + EOS_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + EOS_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n EOS : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(EOS, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 517d144f8..a9076c144 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -31,8 +31,10 @@ EOS::EOS(const RunParams& params) setItsPerRep( getActualProblemSize() ); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_array_length ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() + + 1*sizeof(Real_type) * m_array_length ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(16 * getActualProblemSize()); checksum_scale_factor = 0.0001 * @@ -58,6 +60,9 @@ EOS::EOS(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 9cc202a02..fed56916d 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -62,18 +62,24 @@ class EOS : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index 05d73d38f..2101da14f 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -51,9 +51,12 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - first_diff<<>>( x, y, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (first_diff), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); } stopTimer(); diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 651590776..666b9783d 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -51,9 +51,12 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((first_diff), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (first_diff), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); } stopTimer(); diff --git a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp index a3b814124..b664bfbf7 100644 --- a/src/lcals/FIRST_DIFF-OMP.cpp +++ b/src/lcals/FIRST_DIFF-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp index 341ef57f4..bf3a40ad9 100644 --- a/src/lcals/FIRST_DIFF-OMPTarget.cpp +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index 54d2a8ce1..2382015e0 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp new file mode 100644 index 000000000..41bacafe3 --- /dev/null +++ b/src/lcals/FIRST_DIFF-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_DIFF.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include <sycl/sycl.hpp> + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + +template <size_t work_group_size> +void FIRST_DIFF::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + FIRST_DIFF_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + FIRST_DIFF_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + FIRST_DIFF_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n FIRST_DIFF : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_DIFF, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 3e8e42ec6..aa5aaa31a 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -28,11 +28,11 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params) m_N = getActualProblemSize()+1; - setItsPerRep( getActualProblemSize() ); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type) * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); @@ -54,6 +54,9 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index f3f6424f0..c01907f9b 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
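The setBytesReadPerRep / setBytesWrittenPerRep / setBytesAtomicModifyWrittenPerRep split above replaces the old single setBytesPerRep sum. Using FIRST_DIFF as the worked example (x[i] = y[i+1] - y[i], so N+1 reads of y, N writes of x, no atomics), the terms can be checked in isolation; the problem size below is illustrative:

#include <cstdio>
#include <cstddef>

int main()
{
  using Real_type = double;
  const std::size_t N = 1000000;   // illustrative problem size
  const std::size_t m_N = N + 1;   // FIRST_DIFF reads one extra element of y

  // the same expressions passed to the three setters above
  const std::size_t bytes_read    = 1 * sizeof(Real_type) * m_N;
  const std::size_t bytes_written = 1 * sizeof(Real_type) * N;
  const std::size_t bytes_atomic  = 0;

  std::printf("read=%zu written=%zu atomic=%zu\n",
              bytes_read, bytes_written, bytes_atomic);
  return 0;
}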
// @@ -52,18 +52,24 @@ class FIRST_DIFF : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index e7d860877..08f2ab240 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -15,6 +15,10 @@ #include "common/CudaDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -56,11 +60,10 @@ __global__ void first_min(Real_ptr x, } -template < size_t block_size > -void FIRST_MIN::runCudaVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -69,50 +72,71 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value + constexpr size_t shmem = sizeof(MyMinLoc)*block_size; + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (first_min), block_size, shmem); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - MyMinLoc* dminloc; - cudaErrchk( cudaMalloc( (void**)&dminloc, - grid_size * sizeof(MyMinLoc) ) ); + RAJAPERF_CUDA_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; + RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size, 1); - constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - first_min<<>>(x, dminloc, mymin, iend); - cudaErrchk( cudaGetLastError() ); - - cudaErrchk( cudaMemcpyAsync( mymin_block, dminloc, - grid_size * sizeof(MyMinLoc), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + RPlaunchCudaKernel( (first_min), + grid_size, block_size, + shmem, res.get_stream(), + x, dminloc, mymin, + iend ); + RAJAPERF_CUDA_REDUCER_COPY_BACK(dminloc, mymin_block, grid_size, 1); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; } } - m_minloc = RAJA_MAX(m_minloc, mymin.loc); + 
m_minloc = mymin.loc; } stopTimer(); - cudaErrchk( cudaFree( dminloc ) ); - delete[] mymin_block; + RAJAPERF_CUDA_REDUCER_TEARDOWN(dminloc, mymin_block); - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runCudaVariantRAJA(VariantID vid) +{ + using reduction_policy = RAJA::cuda_reduce; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceMinLoc loc( + RAJA::ReduceMinLoc loc( m_xmin_init, m_initloc); - RAJA::forall< RAJA::cuda_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { FIRST_MIN_BODY_RAJA; }); @@ -127,7 +151,155 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Cuda) +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=] __device__ (Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJA(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_host_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + 
RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } else if ( vid == RAJA_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + }); + + } + + }); + + } + +} } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index bb106ce0e..3c6fd7b35 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -15,6 +15,10 @@ #include "common/HipDataUtils.hpp" #include +#include +#include +#include + namespace rajaperf { @@ -56,11 +60,10 @@ __global__ void first_min(Real_ptr x, } -template < size_t block_size > -void FIRST_MIN::runHipVariantImpl(VariantID vid) +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -69,31 +72,28 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value + constexpr size_t shmem = sizeof(MyMinLoc)*block_size; + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (first_min), block_size, shmem); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); - MyMinLoc* dminloc; - hipErrchk( hipMalloc( (void**)&dminloc, - grid_size * sizeof(MyMinLoc) ) ); + RAJAPERF_HIP_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; + RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size, 1); - constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - hipLaunchKernelGGL( (first_min), grid_size, block_size, - shmem, res.get_stream(), x, - dminloc, - mymin, - iend ); - hipErrchk( hipGetLastError() ); - - hipErrchk( hipMemcpyAsync( mymin_block, dminloc, - grid_size * sizeof(MyMinLoc), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + RPlaunchHipKernel( (first_min), + grid_size, block_size, + shmem, res.get_stream(), + x, dminloc, mymin, + iend ); + RAJAPERF_HIP_REDUCER_COPY_BACK(dminloc, mymin_block, grid_size, 1); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; @@ -104,18 +104,39 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) } stopTimer(); - hipErrchk( hipFree( dminloc ) ); - delete[] mymin_block; + RAJAPERF_HIP_REDUCER_TEARDOWN(dminloc, mymin_block); - } else if ( vid == RAJA_HIP ) { + } else { + getCout() << "\n FIRST_MIN : Unknown Hip 
variant id = " << vid << std::endl; + } +} + +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runHipVariantRAJA(VariantID vid) +{ + using reduction_policy = RAJA::hip_reduce; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceMinLoc loc( + RAJA::ReduceMinLoc loc( m_xmin_init, m_initloc); - RAJA::forall< RAJA::hip_exec >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { FIRST_MIN_BODY_RAJA; }); @@ -130,7 +151,153 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Hip) +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=] __device__ (Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + } +} + +void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBase(vid); + + } + + t += 1; + + } else if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJA(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void FIRST_MIN::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_host_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + addVariantTuningName(vid, 
decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 1a7722570..a9a7f1ba1 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -87,21 +87,49 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceMinLoc loc( + m_xmin_init, m_initloc); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + FIRST_MIN_BODY_RAJA; + }); + + m_minloc = loc.getLoc(); + + } + stopTimer(); + + } else if (tune_idx == 1) { + + using VL_TYPE = RAJA::expt::ValLoc; - RAJA::ReduceMinLoc loc( - m_xmin_init, m_initloc); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - FIRST_MIN_BODY_RAJA; - }); + VL_TYPE tloc(m_xmin_init, m_initloc); - m_minloc = loc.getLoc(); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=](Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown OpenMP tuning index = " << tune_idx << std::endl; } - stopTimer(); break; } @@ -114,8 +142,17 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void FIRST_MIN::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index 5a4dccc69..578bdfe63 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
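The FIRST_MIN OpenMP hunk above shows the pattern repeated for every back-end in this patch: each RAJA variant now carries two tunings, the classic ReduceMinLoc reduction object (tune_idx == 0, registered as "default") and the newer RAJA::expt::Reduce argument interface (tune_idx == 1, registered as "new"). A condensed, self-contained comparison of the two call shapes on a sequential policy (illustrative types and initial values; only the interfaces are taken from the patch):

  #include "RAJA/RAJA.hpp"

  void minloc_two_ways(const double* x, int n, double big)
  {
    // Classic interface: the reduction object is captured by the lambda
    // and queried after the loop completes.
    RAJA::ReduceMinLoc<RAJA::seq_reduce, double, int> loc(big, -1);
    RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n), [=](int i) {
      loc.minloc(x[i], i);
    });
    int minloc_classic = loc.getLoc();

    // New interface: the value/location pair is passed to forall as an
    // extra argument and surfaced as a reference parameter of the lambda.
    RAJA::expt::ValLoc<double, int> tloc(big, -1);
    RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
      RAJA::expt::Reduce<RAJA::operators::minimum>(&tloc),
      [=](int i, RAJA::expt::ValLoc<double, int>& l) {
        l.min(x[i], i);
      });
    int minloc_new = tloc.getLoc();

    (void)minloc_classic; (void)minloc_new;
  }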
//
@@ -27,7 +27,7 @@ namespace lcals

 const size_t threads_per_team = 256;

-void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -60,21 +60,49 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG

   } else if ( vid == RAJA_OpenMPTarget ) {

-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+    if (tune_idx == 0) {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::ReduceMinLoc<RAJA::omp_target_reduce, Real_type, Index_type> loc(
+                                                     m_xmin_init, m_initloc);
+
+        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+          FIRST_MIN_BODY_RAJA;
+        });
+
+        m_minloc = loc.getLoc();
+
+      }
+      stopTimer();
+
+    } else if (tune_idx == 1) {

-      RAJA::ReduceMinLoc<RAJA::omp_target_reduce, Real_type, Index_type> loc(
-                                                   m_xmin_init, m_initloc);
+      using VL_TYPE = RAJA::expt::ValLoc<Real_type, Index_type>;

-      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-        FIRST_MIN_BODY_RAJA;
-      });
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      m_minloc = loc.getLoc();
+        VL_TYPE tloc(m_xmin_init, m_initloc);

+        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+          RAJA::RangeSegment(ibegin, iend),
+          RAJA::expt::Reduce<RAJA::operators::minimum>(&tloc),
+          [=](Index_type i, VL_TYPE& loc) {
+            loc.min(x[i], i);
+          }
+        );
+
+        m_minloc = static_cast<Index_type>(tloc.getLoc());
+
+      }
+      stopTimer();
+
+    } else {
+      getCout() << "\n  FIRST_MIN : Unknown OMP Target tuning index = " << tune_idx << std::endl;
     }
-    stopTimer();

   } else {
     getCout() << "\n  FIRST_MIN : Unknown OMP Target variant id = " << vid << std::endl;
@@ -82,6 +110,14 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG

 }

+void FIRST_MIN::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_OpenMPTarget) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace lcals
 } // end namespace rajaperf
diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp
index 6e5de0437..a32ed4962 100644
--- a/src/lcals/FIRST_MIN-Seq.cpp
+++ b/src/lcals/FIRST_MIN-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
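The setOpenMPTargetTuningDefinitions method added above encodes a contract that recurs across these files: tuning names must be registered in exactly the order the run method consumes tune_idx values, since names and indices are matched positionally. Schematically, for a hypothetical kernel FOO:

  void FOO::setOpenMPTargetTuningDefinitions(VariantID vid)
  {
    addVariantTuningName(vid, "default");   // consumed as tune_idx == 0
    if (vid == RAJA_OpenMPTarget) {
      addVariantTuningName(vid, "new");     // consumed as tune_idx == 1
    }
  }

Base variants register only "default" because they have a single implementation, which is why the "new" name is guarded by the RAJA variant check.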
//
@@ -18,8 +18,11 @@ namespace lcals
 {

-void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void FIRST_MIN::runSeqVariant(VariantID vid, size_t tune_idx)
 {
+#if !defined(RUN_RAJA_SEQ)
+  RAJA_UNUSED_VAR(tune_idx);
+#endif
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
@@ -76,21 +79,49 @@ void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx

     case RAJA_Seq : {

-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          RAJA::ReduceMinLoc<RAJA::seq_reduce, Real_type, Index_type> loc(
+                                                   m_xmin_init, m_initloc);
+
+          RAJA::forall<RAJA::seq_exec>(
+            RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+            FIRST_MIN_BODY_RAJA;
+          });
+
+          m_minloc = loc.getLoc();
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
+
+        using VL_TYPE = RAJA::expt::ValLoc<Real_type, Index_type>;

-        RAJA::ReduceMinLoc<RAJA::seq_reduce, Real_type, Index_type> loc(
-                                                 m_xmin_init, m_initloc);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        RAJA::forall<RAJA::seq_exec>(
-          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-          FIRST_MIN_BODY_RAJA;
-        });
+          VL_TYPE tloc(m_xmin_init, m_initloc);

-        m_minloc = loc.getLoc();
+          RAJA::forall<RAJA::seq_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::minimum>(&tloc),
+            [=](Index_type i, VL_TYPE& loc) {
+              loc.min(x[i], i);
+            }
+          );
+          m_minloc = static_cast<Index_type>(tloc.getLoc());
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n  FIRST_MIN : Unknown Seq tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();

       break;
     }
@@ -104,5 +135,13 @@ void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx

 }

+void FIRST_MIN::setSeqTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_Seq) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace lcals
 } // end namespace rajaperf
diff --git a/src/lcals/FIRST_MIN-Sycl.cpp b/src/lcals/FIRST_MIN-Sycl.cpp
new file mode 100644
index 000000000..616c84dcb
--- /dev/null
+++ b/src/lcals/FIRST_MIN-Sycl.cpp
@@ -0,0 +1,118 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_MIN.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + + +namespace rajaperf +{ +namespace lcals +{ + +template +struct reduce_pair { + bool operator<(const reduce_pair& o) const { + return (val < o.val); + } + VAL_TYPE val; + IDX_TYPE idx; +}; + +template +void FIRST_MIN::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + FIRST_MIN_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + using result_type = reduce_pair; + + auto result = sycl::malloc_shared< result_type >(1, *qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + result_type result_init = { m_xmin_init, m_initloc }; + *result = result_init; + auto reduction_obj = sycl::reduction( result, result_init, sycl::minimum() ); + + qu->submit([&] (sycl::handler& h) { + + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + reduction_obj, + [=] (sycl::nd_item<1> item, auto& loc) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + loc.combine( {x[i], i} ); + } + + }); + + }); + + qu->wait(); + + m_minloc = static_cast(result->idx); + + } + stopTimer(); + + sycl::free(result, *qu); + + } else if ( vid == RAJA_SYCL ) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=] (Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + std::cout << "\n FIRST_MIN : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 875932958..63a3be8df 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
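FIRST_MIN-Sycl.cpp above drives the base variant through sycl::reduction over a custom value/index pair; because SYCL has no known identity for a user-defined type, the identity is passed explicitly and the combiner falls back on the pair's operator<. A self-contained sketch of that pattern (SYCL 2020; the names here are illustrative, not the Suite's):

  #include <sycl/sycl.hpp>
  #include <limits>

  struct MinPair {
    double val;
    long   idx;
    bool operator<(const MinPair& o) const { return val < o.val; }
  };

  long argmin(sycl::queue& q, const double* x, long n)
  {
    MinPair* r = sycl::malloc_shared<MinPair>(1, q);
    *r = { std::numeric_limits<double>::max(), -1 };

    q.submit([&](sycl::handler& h) {
      // explicit identity: sycl::known_identity is undefined for MinPair
      auto red = sycl::reduction(r, *r, sycl::minimum<MinPair>());
      h.parallel_for(sycl::range<1>(n), red,
                     [=](sycl::id<1> id, auto& min_obj) {
        long i = id[0];
        min_obj.combine({ x[i], i });  // pairwise min via operator<
      });
    }).wait();

    long where = r->idx;
    sycl::free(r, q);
    return where;
  }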
// @@ -33,9 +33,12 @@ FIRST_MIN::FIRST_MIN(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) + - (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N ); + setBytesReadPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type ) + + 1*sizeof(Real_type ) * m_N ); + setBytesWrittenPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type ) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); @@ -58,6 +61,9 @@ FIRST_MIN::FIRST_MIN(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index dd00d4392..1660739fb 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -79,18 +79,37 @@ class FIRST_MIN : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_type m_xmin_init; diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 2ac57e5a1..b4a025a20 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
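The constructor hunk above replaces the aggregate setBytesPerRep() call with separate read/write/atomic terms, the accounting scheme every kernel in this patch now uses. For FIRST_MIN the per-rep totals work out to:

  reads   = sizeof(Index_type) + sizeof(Real_type) + m_N * sizeof(Real_type)
            (initial location, initial min value, one sweep over x)
  writes  = sizeof(Index_type) + sizeof(Real_type)
            (final location and final min value)
  atomics = 0

With 8-byte Real_type and a nominal 1,000,000-element problem, that is about 8 MB read per rep, dominated entirely by the x sweep, so the kernel stays bandwidth-bound in this model.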
// @@ -49,11 +49,14 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - first_sum<<>>( x, y, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (first_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); } stopTimer(); diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 5f48abe69..01c2082d5 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -51,9 +51,12 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((first_sum),grid_size, block_size, shmem, res.get_stream(), x, y, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (first_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); } stopTimer(); diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp index e545538fc..223379dbe 100644 --- a/src/lcals/FIRST_SUM-OMP.cpp +++ b/src/lcals/FIRST_SUM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp index 324b26d54..932f32fc4 100644 --- a/src/lcals/FIRST_SUM-OMPTarget.cpp +++ b/src/lcals/FIRST_SUM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index 4d3ef658f..4eba8626e 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-Sycl.cpp b/src/lcals/FIRST_SUM-Sycl.cpp new file mode 100644 index 000000000..3d63fdcbc --- /dev/null +++ b/src/lcals/FIRST_SUM-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
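Here, as in the other launch hunks, raw chevron launches and hipLaunchKernelGGL calls give way to the RPlaunchCudaKernel / RPlaunchHipKernel helpers, which centralize the post-launch error check the old call sites repeated by hand. A minimal sketch of what such a wrapper does (illustrative only; the Suite's actual helpers live in common/CudaDataUtils.hpp and common/HipDataUtils.hpp and may differ in detail):

  template < typename Kernel, typename... Args >
  void launch_cuda_kernel(Kernel kernel,
                          size_t grid_size, size_t block_size,
                          size_t shmem, cudaStream_t stream,
                          Args... args)
  {
    kernel<<<grid_size, block_size, shmem, stream>>>(args...);
    cudaErrchk( cudaGetLastError() );  // one check instead of one per call site
  }

The parentheses around each kernel name at the call sites, e.g. (first_sum), are harmless for a function template and protect macro-based launchers from commas inside template argument lists.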
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "FIRST_SUM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+
+template < size_t work_group_size >
+void FIRST_SUM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 1;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  FIRST_SUM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item ) {
+
+          Index_type i = item.get_global_id(0);
+          if (i > 0 && i < iend) {
+            FIRST_SUM_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        FIRST_SUM_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  FIRST_SUM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_SUM, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp
index 046528e2b..a40bf533a 100644
--- a/src/lcals/FIRST_SUM.cpp
+++ b/src/lcals/FIRST_SUM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -30,8 +30,9 @@ FIRST_SUM::FIRST_SUM(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_N-1) +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N-1) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * (getActualProblemSize()-1));

   setUsesFeature(Forall);
@@ -53,6 +54,9 @@ FIRST_SUM::FIRST_SUM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );

+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp
index 59c1c0bfd..1fc9c48cd 100644
--- a/src/lcals/FIRST_SUM.hpp
+++ b/src/lcals/FIRST_SUM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
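Every Base_SYCL launch in these new files uses the same sizing idiom: sycl::nd_range requires the global size to be a multiple of the work-group size, so the global size is rounded up and a bounds guard masks the padding. Concretely, for iend = 1000 and work_group_size = 256:

  global_size = 256 * RAJA_DIVIDE_CEILING_INT(1000, 256)
              = 256 * 4 = 1024   // 24 padded work-items fail the i < iend guard

FIRST_SUM additionally guards i > 0 because its loop starts at ibegin = 1; an nd_range carries no offset, so the lower bound is recovered in the kernel body rather than in the launch geometry.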
// @@ -55,18 +55,24 @@ class FIRST_SUM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 3790be5f5..17e56a2a0 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -65,16 +65,24 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size); - genlinrecur1<<>>( b5, stb5, sa, sb, - kb5i, - N ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (genlinrecur1), + grid_size1, block_size, + shmem, res.get_stream(), + b5, stb5, + sa, sb, + kb5i, + N ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - genlinrecur2<<>>( b5, stb5, sa, sb, - kb5i, - N ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (genlinrecur2), + grid_size2, block_size, + shmem, res.get_stream(), + b5, stb5, + sa, sb, + kb5i, + N ); } stopTimer(); diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index b4dc1be54..5d428fa87 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
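GEN_LIN_RECUR launches two kernels per rep and sizes their grids independently, since the second loop runs one iteration longer than the first. The ceiling division makes each grid cover its domain exactly; e.g. with N = 1000 and block_size = 256:

  grid_size1 = RAJA_DIVIDE_CEILING_INT(1000, 256) = 4   // covers N work-items
  grid_size2 = RAJA_DIVIDE_CEILING_INT(1001, 256) = 4   // covers N+1 work-items

The two grids happen to coincide here; they diverge whenever N is an exact multiple of the block size (N = 1024 gives 4 and 5).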
// @@ -65,18 +65,24 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((genlinrecur1), grid_size1, block_size, shmem, res.get_stream(), - b5, stb5, sa, sb, - kb5i, - N ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (genlinrecur1), + grid_size1, block_size, + shmem, res.get_stream(), + b5, stb5, + sa, sb, + kb5i, + N ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - hipLaunchKernelGGL((genlinrecur2), grid_size2, block_size, shmem, res.get_stream(), - b5, stb5, sa, sb, - kb5i, - N ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (genlinrecur2), + grid_size2, block_size, + shmem, res.get_stream(), + b5, stb5, + sa, sb, + kb5i, + N ); } stopTimer(); diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp index 660d47273..d4ac65995 100644 --- a/src/lcals/GEN_LIN_RECUR-OMP.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp index 3838a2af0..9932469cb 100644 --- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index 9d728a9f7..b1d6c3be5 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp new file mode 100644 index 000000000..06ca45e7b --- /dev/null +++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "GEN_LIN_RECUR.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + +template +void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + GEN_LIN_RECUR_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size1, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type k = item.get_global_id(0); + if (k < N) { + GEN_LIN_RECUR_BODY1; + } + + }); + }); + + const size_t global_size2 = work_group_size * RAJA_DIVIDE_CEILING_INT(N+1, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size2, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i > 0 && i < N+1) { + GEN_LIN_RECUR_BODY2; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(0, N), [=] (Index_type k) { + GEN_LIN_RECUR_BODY1; + }); + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(1, N+1), [=] (Index_type i) { + GEN_LIN_RECUR_BODY2; + }); + + } + stopTimer(); + + } else { + std::cout << "\n GEN_LIN_RECUR : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(GEN_LIN_RECUR, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 9c132a3db..80b7f9b10 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -30,8 +30,11 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(2); - setBytesPerRep( (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_N + - (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_N ); + setBytesReadPerRep( 3*sizeof(Real_type ) * m_N + + 3*sizeof(Real_type ) * m_N ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N + + 2*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((3 + 3 ) * getActualProblemSize()); @@ -58,6 +61,9 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 9586a69b4..33c0895af 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -76,18 +76,24 @@ class GEN_LIN_RECUR : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_b5; Real_ptr m_sa; diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index 960f80c49..4d89cc5b6 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,10 +52,13 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hydro_1d<<>>( x, y, z, - q, r, t, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (hydro_1d), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, + q, r, t, + iend ); } stopTimer(); diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index c04da1eb2..aa2a12c99 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
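The GEN_LIN_RECUR traffic model above simply sums its two kernels: each reads three and writes two Real_type values per element, giving the two identical 3*N read terms and 2*N write terms, and the (3 + 3) FLOP count likewise adds the two three-flop loop bodies. Keeping the terms per kernel rather than pre-folding them into 6*N and 4*N makes the model easy to audit against the code.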
// See the RAJAPerf/LICENSE file for details. // @@ -52,10 +52,13 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((hydro_1d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, z, - q, r, t, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (hydro_1d), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, + q, r, t, + iend ); } stopTimer(); diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp index f2088205a..d3ac150a4 100644 --- a/src/lcals/HYDRO_1D-OMP.cpp +++ b/src/lcals/HYDRO_1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp index b5fbe0657..b5cf82420 100644 --- a/src/lcals/HYDRO_1D-OMPTarget.cpp +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index 47ca2aedd..22f257e8d 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp new file mode 100644 index 000000000..3ccbad9a7 --- /dev/null +++ b/src/lcals/HYDRO_1D-Sycl.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HYDRO_1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + +template +void HYDRO_1D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + HYDRO_1D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + HYDRO_1D_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + HYDRO_1D_BODY; + }); + + } + stopTimer(); + + } else { + std::cout << "\n HYDRO_1D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HYDRO_1D, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index d92267fc9..c4821788f 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -30,8 +30,10 @@ HYDRO_1D::HYDRO_1D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * getActualProblemSize() + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * (getActualProblemSize()+1) ); + setBytesReadPerRep( 1*sizeof(Real_type ) * getActualProblemSize() + + 1*sizeof(Real_type ) * (getActualProblemSize()+1) ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(5 * getActualProblemSize()); checksum_scale_factor = 0.001 * @@ -57,6 +59,9 @@ HYDRO_1D::HYDRO_1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index dd61f112c..4827fcecd 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
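In the HYDRO_1D model above, the asymmetric (N+1) read term has a concrete source: the LCALS hydro fragment reads z at two adjacent offsets (z[i+10] and z[i+11]), and consecutive iterations overlap on all but one of those loads, so a sweep of N iterations touches N+1 distinct z values. The y array contributes the plain N-element read term and x the N-element write term.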
// @@ -57,18 +57,24 @@ class HYDRO_1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 2f46572b4..ad09dd2a3 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -111,25 +111,31 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_CUDA; HYDRO_2D_NBLOCKS_CUDA; - hydro_2d1 - <<>>(zadat, zbdat, - zpdat, zqdat, zrdat, zmdat, - jn, kn); - cudaErrchk( cudaGetLastError() ); - - hydro_2d2 - <<>>(zudat, zvdat, - zadat, zbdat, zzdat, zrdat, - s, - jn, kn); - cudaErrchk( cudaGetLastError() ); - - hydro_2d3 - <<>>(zroutdat, zzoutdat, - zrdat, zudat, zzdat, zvdat, - t, - jn, kn); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (hydro_2d1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + zadat, zbdat, + zpdat, zqdat, + zrdat, zmdat, + jn, kn); + + RPlaunchCudaKernel( (hydro_2d2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + zudat, zvdat, + zadat, zbdat, + zzdat, zrdat, + s, + jn, kn); + + RPlaunchCudaKernel( (hydro_2d3), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + zroutdat, zzoutdat, + zrdat, zudat, + zzdat, zvdat, + t, + jn, kn); } stopTimer(); diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index 0d65cb260..58b530ba8 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -111,28 +111,31 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_HIP; HYDRO_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((hydro_2d1), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), + RPlaunchHipKernel( (hydro_2d1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), zadat, zbdat, - zpdat, zqdat, zrdat, zmdat, + zpdat, zqdat, + zrdat, zmdat, jn, kn); - hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d2), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), + RPlaunchHipKernel( (hydro_2d2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), zudat, zvdat, - zadat, zbdat, zzdat, zrdat, + zadat, zbdat, + zzdat, zrdat, s, jn, kn); - hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d3), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), + RPlaunchHipKernel( (hydro_2d3), + nblocks, nthreads_per_block, + shmem, res.get_stream(), zroutdat, zzoutdat, - zrdat, zudat, zzdat, zvdat, + zrdat, zudat, + zzdat, zvdat, t, jn, kn); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp index e153dbdca..92f1bb080 100644 --- a/src/lcals/HYDRO_2D-OMP.cpp +++ b/src/lcals/HYDRO_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp index 43304884b..f830feefb 100644 --- a/src/lcals/HYDRO_2D-OMPTarget.cpp +++ b/src/lcals/HYDRO_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp index cf43e885f..522dba679 100644 --- a/src/lcals/HYDRO_2D-Seq.cpp +++ b/src/lcals/HYDRO_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp new file mode 100644 index 000000000..975467bc5 --- /dev/null +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -0,0 +1,155 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HYDRO_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace lcals +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define k_wg_sz (work_group_size / j_wg_sz) + +template +void HYDRO_2D::runSyclVariantImpl(VariantID vid) { + + const Index_type run_reps = getRunReps(); + const Index_type kbeg = 1; + const Index_type kend = m_kn - 1; + const Index_type jbeg = 1; + const Index_type jend = m_jn - 1; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + HYDRO_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(1, + k_wg_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_wg_sz)); + sycl::range<3> wkgroup_dim(1, k_wg_sz, j_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; + + if (j < jn-1 && k < kn-1) { + HYDRO_2D_BODY1 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; + + if (j < jn-1 && k < kn-1) { + HYDRO_2D_BODY2 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; + + if (j < jn-1 && k < kn-1) { + HYDRO_2D_BODY3 + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + HYDRO_2D_VIEWS_RAJA; + + using EXECPOL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + res, + [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY1_RAJA; + }); + + RAJA::kernel_resource( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + res, + [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY2_RAJA; + }); + + RAJA::kernel_resource( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + res, + [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY3_RAJA; + }); + + } + stopTimer(); + + } else { + std::cout << "\n HYDRO_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HYDRO_2D, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index fd1dd9406..d1ae233d0 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
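HYDRO_2D-Sycl.cpp above maps the 2D interior sweep onto a 3D nd_range with a fixed 32-wide extent for j (j_wg_sz) and the remainder of the tuned work-group size for k. Putting j in the last dimension matters: SYCL's final dimension is the fastest-varying one, so the unit-stride j index yields coalesced accesses. The shape arithmetic for the default work_group_size of 256:

  k_wg_sz         = 256 / 32 = 8                             // 32x8 work-groups
  global j extent = 32 * RAJA_DIVIDE_CEILING_INT(jn-2, 32)
  global k extent =  8 * RAJA_DIVIDE_CEILING_INT(kn-2, 8)

The +1 added to each global id re-centers the launch on the interior points, mirroring the RangeSegment(1, n-1) bounds used by the RAJA_SYCL path.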
// See the RAJAPerf/LICENSE file for details. // @@ -33,18 +33,20 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) setDefaultProblemSize(m_kn * m_jn); setDefaultReps(100); - m_jn = m_kn = std::sqrt(getTargetProblemSize()); + m_jn = m_kn = std::sqrt(getTargetProblemSize()) + std::sqrt(2)-1; m_array_length = m_kn * m_jn; setActualProblemSize( getTargetProblemSize() ); setItsPerRep( 3 * getActualProblemSize() ); setKernelsPerRep(3); - setBytesPerRep( (2*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_array_length + - (2*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_array_length + - (2*sizeof(Real_type ) + 4*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) ); + setBytesReadPerRep( 4*sizeof(Real_type ) * m_array_length + + 4*sizeof(Real_type ) * m_array_length + + 4*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + + 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + + 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((14 + 26 + 4 ) * (m_jn-2)*(m_kn-2)); @@ -72,6 +74,9 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index b6ad936ca..1c9cc8d1c 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -151,19 +151,25 @@ class HYDRO_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_za; Real_ptr m_zb; diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 02b22cbb8..3ec139130 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
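The sizing tweak at the top of this HYDRO_2D hunk deserves a note: adding std::sqrt(2) - 1 (about 0.414) before the truncating assignment biases the edge length upward whenever the fractional part of sqrt(target) exceeds 1 - (sqrt(2) - 1), roughly 0.586, apparently so that m_jn * m_kn tracks the requested problem size instead of systematically undershooting it. For example:

  target = 1,000,000 : sqrt = 1000.00 -> m_jn = m_kn = 1000 with or without the bias
  target =   999,500 : sqrt =  999.75 -> 1000 with the bias, 999 without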
diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp
index 02b22cbb8..3ec139130 100644
--- a/src/lcals/INT_PREDICT-Cuda.cpp
+++ b/src/lcals/INT_PREDICT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -55,12 +55,16 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      int_predict<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( px,
-                                 dm22, dm23, dm24, dm25,
-                                 dm26, dm27, dm28, c0,
-                                 offset,
-                                 iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (int_predict<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          px,
+                          dm22, dm23, dm24,
+                          dm25, dm26, dm27,
+                          dm28, c0,
+                          offset,
+                          iend );
 
     }
     stopTimer();
diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp
index cc0c06477..1e2741cd7 100644
--- a/src/lcals/INT_PREDICT-Hip.cpp
+++ b/src/lcals/INT_PREDICT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -55,12 +55,16 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((int_predict<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), px,
-                                 dm22, dm23, dm24, dm25,
-                                 dm26, dm27, dm28, c0,
-                                 offset,
-                                 iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (int_predict<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         px,
+                         dm22, dm23, dm24,
+                         dm25, dm26, dm27,
+                         dm28, c0,
+                         offset,
+                         iend );
 
     }
     stopTimer();
diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp
index 29b167881..b33e5cd2b 100644
--- a/src/lcals/INT_PREDICT-OMP.cpp
+++ b/src/lcals/INT_PREDICT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp
index 4172c1822..a7e257532 100644
--- a/src/lcals/INT_PREDICT-OMPTarget.cpp
+++ b/src/lcals/INT_PREDICT-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp
index de167bc11..1d8e52fda 100644
--- a/src/lcals/INT_PREDICT-Seq.cpp
+++ b/src/lcals/INT_PREDICT-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
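Throughout this patch, raw chevron and hipLaunchKernelGGL launches are replaced
by the suite's RPlaunchCudaKernel / RPlaunchHipKernel helpers, which subsume
the explicit cudaGetLastError()/hipGetLastError() checks the old code carried
at every call site. Based only on the call sites visible here, a wrapper of the
following shape would reproduce the removed behavior (a sketch under
assumptions, not the suite's actual definition; launch_and_check is a made-up
name):

    #include <cuda_runtime.h>
    #include <utility>

    // Sketch: launch a kernel (function pointer) and then check the error
    // state, as the removed cudaErrchk( cudaGetLastError() ) lines did.
    template < typename Kernel, typename... Args >
    void launch_and_check(Kernel kernel, dim3 grid, dim3 block,
                          size_t shmem, cudaStream_t stream, Args&&... args)
    {
      kernel<<< grid, block, shmem, stream >>>( std::forward<Args>(args)... );
      cudaErrchk( cudaGetLastError() );  // suite macro, as in the removed lines
    }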
diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp
new file mode 100644
index 000000000..992dbcba1
--- /dev/null
+++ b/src/lcals/INT_PREDICT-Sycl.cpp
@@ -0,0 +1,81 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "INT_PREDICT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+template < size_t work_group_size >
+void INT_PREDICT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  INT_PREDICT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            INT_PREDICT_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        INT_PREDICT_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+     std::cout << "\n  INT_PREDICT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INT_PREDICT, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp
index eb56b5725..afb4a2ea9 100644
--- a/src/lcals/INT_PREDICT.cpp
+++ b/src/lcals/INT_PREDICT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,7 +28,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type ) + 10*sizeof(Real_type )) * getActualProblemSize() );
+  setBytesReadPerRep( 10*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(17 * getActualProblemSize());
 
   setUsesFeature(Forall);
@@ -50,6 +52,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
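In the Base_SYCL variant above, the 1-D global size is rounded up to a multiple
of work_group_size and the trailing work-items are masked by the i < iend test.
A minimal sketch with made-up numbers:

    // Sketch only: iend = 1000 and work_group_size = 256 are illustrative.
    constexpr size_t wg = 256, iend = 1000;
    constexpr size_t global_size = wg * ((iend + wg - 1) / wg);  // 4 groups: 1024
    static_assert(global_size % wg == 0, "whole number of work-groups");
    // Work-items 1000..1023 are launched but do nothing, per the kernel guard.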
diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp
index a81ae6fb2..5435af4f4 100644
--- a/src/lcals/INT_PREDICT.hpp
+++ b/src/lcals/INT_PREDICT.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -72,18 +72,24 @@ class INT_PREDICT : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+
   void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Index_type m_array_length;
   Index_type m_offset;
diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp
index 76c5082cd..40a8bf7f0 100644
--- a/src/lcals/PLANCKIAN-Cuda.cpp
+++ b/src/lcals/PLANCKIAN-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -53,10 +53,13 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      planckian<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( x, y,
-                                 u, v, w,
-                                 iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (planckian<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          x, y,
+                          u, v, w,
+                          iend );
 
     }
     stopTimer();
diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp
index 7d93b2dca..00323115d 100644
--- a/src/lcals/PLANCKIAN-Hip.cpp
+++ b/src/lcals/PLANCKIAN-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -53,10 +53,13 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid)
      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
      constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((planckian<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y,
-                                 u, v, w,
-                                 iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (planckian<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         x, y,
+                         u, v, w,
+                         iend );
 
     }
     stopTimer();
diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp
index cc90067eb..e82f9eccd 100644
--- a/src/lcals/PLANCKIAN-OMP.cpp
+++ b/src/lcals/PLANCKIAN-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp
index fb0f41cef..c69732531 100644
--- a/src/lcals/PLANCKIAN-OMPTarget.cpp
+++ b/src/lcals/PLANCKIAN-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp
index 25ff3ff2b..56c57971b 100644
--- a/src/lcals/PLANCKIAN-Seq.cpp
+++ b/src/lcals/PLANCKIAN-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp
new file mode 100644
index 000000000..31b43c2f7
--- /dev/null
+++ b/src/lcals/PLANCKIAN-Sycl.cpp
@@ -0,0 +1,84 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "PLANCKIAN.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <cmath>
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+template < size_t work_group_size >
+void PLANCKIAN::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  PLANCKIAN_DATA_SETUP;
+
+  using sycl::exp;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            PLANCKIAN_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        PLANCKIAN_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+     std::cout << "\n  PLANCKIAN : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PLANCKIAN, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
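PLANCKIAN is the one kernel in this batch with a transcendental in its body,
hence the function-scope "using sycl::exp;" above: it makes the unqualified
exp() inside PLANCKIAN_BODY bind to the SYCL device-capable overload rather
than the host ::exp from <cmath>. Sketch (the body's exact shape is assumed,
not taken from this patch):

    using sycl::exp;   // as in the file above
    // Inside the kernel lambda, PLANCKIAN_BODY expands to something of the
    // form below, and exp(...) then resolves to sycl::exp in device code:
    //   y[i] = u[i] / v[i];
    //   w[i] = x[i] / ( exp( y[i] ) - 1.0 );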
diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp
index 2bb8d3f7b..cf15e6a29 100644
--- a/src/lcals/PLANCKIAN.cpp
+++ b/src/lcals/PLANCKIAN.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,7 +28,9 @@ PLANCKIAN::PLANCKIAN(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * getActualProblemSize() );
+  setBytesReadPerRep( 3*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(4 * getActualProblemSize());  // 1 exp
 
   setUsesFeature(Forall);
@@ -50,6 +52,9 @@ PLANCKIAN::PLANCKIAN(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp
index 92b55fc95..a999d2178 100644
--- a/src/lcals/PLANCKIAN.hpp
+++ b/src/lcals/PLANCKIAN.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -57,18 +57,24 @@ class PLANCKIAN : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+
   void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Real_ptr m_y;
diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp
index 8b6643d2b..18cc284ea 100644
--- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -23,8 +23,9 @@ namespace lcals
 
 template < size_t block_size >
 __launch_bounds__(block_size)
-__global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z,
-                    Index_type N)
+__global__ void tridiag_elim(Real_ptr xout, Real_ptr xin,
+                             Real_ptr y, Real_ptr z,
+                             Index_type N)
 {
    Index_type i = blockIdx.x * block_size + threadIdx.x;
    if (i > 0 && i < N) {
@@ -51,10 +52,13 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      eos<block_size>
-        <<<grid_size, block_size, shmem, res.get_stream()>>>( xout, xin, y, z,
-                                                              iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (tridiag_elim<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          xout, xin,
+                          y, z,
+                          iend );
 
     }
     stopTimer();
diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp
index f6c4c9ebe..1b0db7e7f 100644
--- a/src/lcals/TRIDIAG_ELIM-Hip.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -23,8 +23,9 @@ namespace lcals
 
 template < size_t block_size >
 __launch_bounds__(block_size)
-__global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z,
-                    Index_type N)
+__global__ void tridiag_elim(Real_ptr xout, Real_ptr xin,
+                             Real_ptr y, Real_ptr z,
+                             Index_type N)
 {
    Index_type i = blockIdx.x * block_size + threadIdx.x;
    if (i > 0 && i < N) {
@@ -51,9 +52,13 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
 
-      hipLaunchKernelGGL((eos<block_size>), grid_size, block_size, shmem, res.get_stream(), xout, xin, y, z,
-                         iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (tridiag_elim<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         xout, xin,
+                         y, z,
+                         iend );
 
     }
     stopTimer();
diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp
index 8f31c9493..22673b5f7 100644
--- a/src/lcals/TRIDIAG_ELIM-OMP.cpp
+++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
index 59a8a323c..5433879a5 100644
--- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
+++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp
index 5c0003d93..0b23c9143 100644
--- a/src/lcals/TRIDIAG_ELIM-Seq.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp
new file mode 100644
index 000000000..74e23665f
--- /dev/null
+++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp
@@ -0,0 +1,81 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "TRIDIAG_ELIM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+template < size_t work_group_size >
+void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 1;
+  const Index_type iend = m_N;
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  TRIDIAG_ELIM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i > 0 && i < iend) {
+            TRIDIAG_ELIM_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        TRIDIAG_ELIM_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+     std::cout << "\n  TRIDIAG_ELIM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRIDIAG_ELIM, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
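TRIDIAG_ELIM starts at ibegin = 1 because its body reads the i-1 neighbor, so
the Base_SYCL guard (i > 0 && i < iend) and the RAJA range are equivalent:

    // Sketch: both variants execute exactly this index set.
    // Base_SYCL: ids 0..global_size-1 exist; id 0 and ids >= iend are masked.
    // RAJA_SYCL: RangeSegment(1, iend) generates only i = 1 .. iend-1.
    for (Index_type i = 1; i < iend; ++i) {
      // TRIDIAG_ELIM_BODY reads xin[i-1], so i == 0 must be excluded.
    }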
diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp
index 710927c3e..9955bee66 100644
--- a/src/lcals/TRIDIAG_ELIM.cpp
+++ b/src/lcals/TRIDIAG_ELIM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -26,11 +26,13 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params)
   setActualProblemSize( getTargetProblemSize() );
 
-  m_N = getActualProblemSize();
+  m_N = getActualProblemSize() + 1;
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type ) + 3*sizeof(Real_type )) * (m_N-1) );
+  setBytesReadPerRep( 3*sizeof(Real_type ) * (m_N-1) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * (getActualProblemSize()-1));
 
   setUsesFeature(Forall);
@@ -52,6 +54,9 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp
index c95685de9..69c1a2d9c 100644
--- a/src/lcals/TRIDIAG_ELIM.hpp
+++ b/src/lcals/TRIDIAG_ELIM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -57,18 +57,24 @@ class TRIDIAG_ELIM : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+
   void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_xout;
   Real_ptr m_xin;
diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt
index f9cd2c1c2..2722a1fac 100644
--- a/src/polybench/CMakeLists.txt
+++ b/src/polybench/CMakeLists.txt
@@ -1,5 +1,5 @@
###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
# and RAJA Performance Suite project contributors.
# See the RAJAPerf/LICENSE file for details.
# @@ -14,77 +14,90 @@ blt_add_library( POLYBENCH_2MM-Cuda.cpp POLYBENCH_2MM-OMP.cpp POLYBENCH_2MM-OMPTarget.cpp + POLYBENCH_2MM-Sycl.cpp POLYBENCH_3MM.cpp POLYBENCH_3MM-Seq.cpp POLYBENCH_3MM-Hip.cpp POLYBENCH_3MM-Cuda.cpp POLYBENCH_3MM-OMP.cpp POLYBENCH_3MM-OMPTarget.cpp + POLYBENCH_3MM-Sycl.cpp POLYBENCH_ADI.cpp POLYBENCH_ADI-Seq.cpp POLYBENCH_ADI-Hip.cpp POLYBENCH_ADI-Cuda.cpp POLYBENCH_ADI-OMP.cpp POLYBENCH_ADI-OMPTarget.cpp + POLYBENCH_ADI-Sycl.cpp POLYBENCH_ATAX.cpp POLYBENCH_ATAX-Seq.cpp POLYBENCH_ATAX-Hip.cpp POLYBENCH_ATAX-Cuda.cpp POLYBENCH_ATAX-OMP.cpp POLYBENCH_ATAX-OMPTarget.cpp + POLYBENCH_ATAX-Sycl.cpp POLYBENCH_FDTD_2D.cpp POLYBENCH_FDTD_2D-Seq.cpp POLYBENCH_FDTD_2D-Hip.cpp POLYBENCH_FDTD_2D-Cuda.cpp POLYBENCH_FDTD_2D-OMP.cpp POLYBENCH_FDTD_2D-OMPTarget.cpp + POLYBENCH_FDTD_2D-Sycl.cpp POLYBENCH_FLOYD_WARSHALL.cpp POLYBENCH_FLOYD_WARSHALL-Seq.cpp POLYBENCH_FLOYD_WARSHALL-Hip.cpp POLYBENCH_FLOYD_WARSHALL-Cuda.cpp POLYBENCH_FLOYD_WARSHALL-OMP.cpp POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp + POLYBENCH_FLOYD_WARSHALL-Sycl.cpp POLYBENCH_GEMM.cpp POLYBENCH_GEMM-Seq.cpp POLYBENCH_GEMM-Hip.cpp POLYBENCH_GEMM-Cuda.cpp POLYBENCH_GEMM-OMP.cpp POLYBENCH_GEMM-OMPTarget.cpp + POLYBENCH_GEMM-Sycl.cpp POLYBENCH_GEMVER.cpp POLYBENCH_GEMVER-Seq.cpp POLYBENCH_GEMVER-Hip.cpp POLYBENCH_GEMVER-Cuda.cpp POLYBENCH_GEMVER-OMP.cpp POLYBENCH_GEMVER-OMPTarget.cpp + POLYBENCH_GEMVER-Sycl.cpp POLYBENCH_GESUMMV.cpp POLYBENCH_GESUMMV-Seq.cpp POLYBENCH_GESUMMV-Hip.cpp POLYBENCH_GESUMMV-Cuda.cpp POLYBENCH_GESUMMV-OMP.cpp POLYBENCH_GESUMMV-OMPTarget.cpp + POLYBENCH_GESUMMV-Sycl.cpp POLYBENCH_HEAT_3D.cpp POLYBENCH_HEAT_3D-Seq.cpp POLYBENCH_HEAT_3D-Hip.cpp POLYBENCH_HEAT_3D-Cuda.cpp POLYBENCH_HEAT_3D-OMP.cpp POLYBENCH_HEAT_3D-OMPTarget.cpp + POLYBENCH_HEAT_3D-Sycl.cpp POLYBENCH_JACOBI_1D.cpp POLYBENCH_JACOBI_1D-Seq.cpp POLYBENCH_JACOBI_1D-Hip.cpp POLYBENCH_JACOBI_1D-Cuda.cpp POLYBENCH_JACOBI_1D-OMP.cpp POLYBENCH_JACOBI_1D-OMPTarget.cpp + POLYBENCH_JACOBI_1D-Sycl.cpp POLYBENCH_JACOBI_2D.cpp POLYBENCH_JACOBI_2D-Seq.cpp POLYBENCH_JACOBI_2D-Hip.cpp POLYBENCH_JACOBI_2D-Cuda.cpp POLYBENCH_JACOBI_2D-OMP.cpp POLYBENCH_JACOBI_2D-OMPTarget.cpp + POLYBENCH_JACOBI_2D-Sycl.cpp POLYBENCH_MVT.cpp POLYBENCH_MVT-Seq.cpp POLYBENCH_MVT-Hip.cpp POLYBENCH_MVT-Cuda.cpp POLYBENCH_MVT-OMP.cpp POLYBENCH_MVT-OMPTarget.cpp + POLYBENCH_MVT-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index 7a8f43e58..28b49c779 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
//
@@ -125,16 +125,24 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_2MM_1_NBLOCKS_CUDA;
-      poly_2mm_1<block_size>
-                <<<nblocks1, nthreads_per_block, shmem, res.get_stream()>>>(tmp, A, B, alpha,
-                                                                            ni, nj, nk);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_2mm_1<block_size>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        tmp, A, B,
+        alpha,
+        ni, nj, nk );
 
       POLY_2MM_2_NBLOCKS_CUDA;
-      poly_2mm_2<block_size>
-                <<<nblocks2, nthreads_per_block, shmem, res.get_stream()>>>(tmp, C, D, beta,
-                                                                            ni, nl, nj);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_2mm_2<block_size>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        tmp, C, D,
+        beta,
+        ni, nl, nj );
 
     }
     stopTimer();
@@ -148,30 +156,38 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_2MM_1_NBLOCKS_CUDA;
-      poly_2mm_1_lam<block_size>
-                    <<<nblocks1, nthreads_per_block, shmem, res.get_stream()>>>(ni, nj,
-        [=] __device__ (Index_type i, Index_type j) {
-          POLYBENCH_2MM_BODY1;
-          for (Index_type k=0; k < nk; ++k) {
-            POLYBENCH_2MM_BODY2;
-          }
-          POLYBENCH_2MM_BODY3;
+
+      auto poly_2mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
+        POLYBENCH_2MM_BODY1;
+        for (Index_type k=0; k < nk; ++k) {
+          POLYBENCH_2MM_BODY2;
         }
-      );
-      cudaErrchk( cudaGetLastError() );
+        POLYBENCH_2MM_BODY3;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_2mm_1_lam<block_size, decltype(poly_2mm_1_lambda)>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_2mm_1_lambda );
 
       POLY_2MM_2_NBLOCKS_CUDA;
-      poly_2mm_2_lam<block_size>
-                    <<<nblocks2, nthreads_per_block, shmem, res.get_stream()>>>(ni, nl,
-        [=] __device__ (Index_type i, Index_type l) {
-          POLYBENCH_2MM_BODY4;
-          for (Index_type j=0; j < nj; ++j) {
-            POLYBENCH_2MM_BODY5;
-          }
-          POLYBENCH_2MM_BODY6;
+
+      auto poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) {
+        POLYBENCH_2MM_BODY4;
+        for (Index_type j=0; j < nj; ++j) {
+          POLYBENCH_2MM_BODY5;
         }
-      );
-      cudaErrchk( cudaGetLastError() );
+        POLYBENCH_2MM_BODY6;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_2mm_2_lam<block_size, decltype(poly_2mm_2_lambda)>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_2mm_2_lambda );
 
     }
     stopTimer();
diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp
index 1a0f26ecd..28308ef32 100644
--- a/src/polybench/POLYBENCH_2MM-Hip.cpp
+++ b/src/polybench/POLYBENCH_2MM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -125,18 +125,24 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_2MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_2mm_1<block_size>),
-                         dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         tmp, A, B, alpha,
-                         ni, nj, nk);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_2mm_1<block_size>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        tmp, A, B,
+        alpha,
+        ni, nj, nk );
 
       POLY_2MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_2mm_2<block_size>),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         tmp, C, D, beta,
-                         ni, nl, nj);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_2mm_2<block_size>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        tmp, C, D,
+        beta,
+        ni, nl, nj );
 
     }
     stopTimer();
@@ -149,6 +155,8 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid)
       POLY_2MM_THREADS_PER_BLOCK_HIP;
       constexpr size_t shmem = 0;
 
+      POLY_2MM_1_NBLOCKS_HIP;
+
       auto poly_2mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
         POLYBENCH_2MM_BODY1;
         for (Index_type k=0; k < nk; ++k) {
@@ -157,11 +165,14 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_2MM_BODY3;
       };
 
-      POLY_2MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_2mm_1_lam<block_size, decltype(poly_2mm_1_lambda)>),
-                         dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nj, poly_2mm_1_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_2mm_1_lam<block_size, decltype(poly_2mm_1_lambda)>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_2mm_1_lambda );
+
+      POLY_2MM_2_NBLOCKS_HIP;
 
       auto poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) {
         POLYBENCH_2MM_BODY4;
@@ -171,11 +182,12 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_2MM_BODY6;
       };
 
-      POLY_2MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_2mm_2_lam<block_size, decltype(poly_2mm_2_lambda)>),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nl, poly_2mm_2_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_2mm_2_lam<block_size, decltype(poly_2mm_2_lambda)>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_2mm_2_lambda );
 
     }
     stopTimer();
diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp
index 8b6cdb290..b73813df8 100644
--- a/src/polybench/POLYBENCH_2MM-OMP.cpp
+++ b/src/polybench/POLYBENCH_2MM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
index 79d6f96c0..781139422 100644
--- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp
index 1a3120246..5cffa3207 100644
--- a/src/polybench/POLYBENCH_2MM-Seq.cpp
+++ b/src/polybench/POLYBENCH_2MM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp
new file mode 100644
index 000000000..867ad780e
--- /dev/null
+++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp
@@ -0,0 +1,172 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_2MM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+#include <cmath>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define in_wg_sz (32)
+#define out_wg_sz (work_group_size / in_wg_sz)
+
+
+template < size_t work_group_size >
+void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_2MM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    sycl::range<3> global_dim1(1,
+                               out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                               in_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, in_wg_sz));
+
+    sycl::range<3> global_dim2(1,
+                               out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                               in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
+
+    sycl::range<3> wkgroup_dim(1, out_wg_sz, in_wg_sz);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type i = item.get_global_id(1);
+          Index_type j = item.get_global_id(2);
+
+          if (i < ni && j < nj) {
+            POLYBENCH_2MM_BODY1;
+            for (Index_type k=0; k < nk; ++k) {
+              POLYBENCH_2MM_BODY2;
+            }
+            POLYBENCH_2MM_BODY3;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim2, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type i = item.get_global_id(1);
+          Index_type l = item.get_global_id(2);
+
+          if (i < ni && l < nl) {
+            POLYBENCH_2MM_BODY4;
+            for (Index_type j=0; j < nj; ++j) {
+              POLYBENCH_2MM_BODY5;
+            }
+            POLYBENCH_2MM_BODY6;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_2MM_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_1<out_wg_sz>,
+            RAJA::statement::For<1, RAJA::sycl_global_2<in_wg_sz>,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,
+              RAJA::statement::For<2, RAJA::seq_exec,
+                RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
+              >,
+              RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nj},
+                         RAJA::RangeSegment{0, nk}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_2MM_BODY1_RAJA;
+        },
+        [=] (Index_type i, Index_type j, Index_type k,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY2_RAJA;
+        },
+        [=] (Index_type i, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY3_RAJA;
+        }
+      );
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nl},
+                         RAJA::RangeSegment{0, nj}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_2MM_BODY4_RAJA;
+        },
+        [=] (Index_type i, Index_type l, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY5_RAJA;
+        },
+        [=] (Index_type i, Index_type l,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY6_RAJA;
+        }
+      );
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  POLYBENCH_2MM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_2MM, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp
index 5c0ebe484..55cc577cc 100644
--- a/src/polybench/POLYBENCH_2MM.cpp
+++ b/src/polybench/POLYBENCH_2MM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -31,9 +31,9 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
                                   ni_default*nl_default ) );
   setDefaultReps(2);
 
-  m_ni = std::sqrt( getTargetProblemSize() ) + 1;
+  m_ni = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1;
   m_nj = m_ni;
-  m_nk = nk_default;
+  m_nk = Index_type(double(nk_default)/ni_default*m_ni);
   m_nl = m_ni;
 
   m_alpha = 1.5;
@@ -44,13 +44,15 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
   setItsPerRep( m_ni*m_nj + m_ni*m_nl );
   setKernelsPerRep(2);
-  setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nk +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
+                      1*sizeof(Real_type ) * m_nj * m_nk +
 
-                  (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nl +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nl );
+                      1*sizeof(Real_type ) * m_ni * m_nj +
+                      1*sizeof(Real_type ) * m_nj * m_nl );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
+
+                         1*sizeof(Real_type ) * m_ni * m_nl );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(3 * m_ni*m_nj*m_nk +
                  2 * m_ni*m_nj*m_nl );
@@ -78,6 +80,9 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 POLYBENCH_2MM::~POLYBENCH_2MM()
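The RAJA_SYCL variant above expresses each matrix product with
RAJA::kernel_param_resource: Lambda<0> initializes the Params<0> accumulator,
Lambda<1> runs in the innermost seq_exec loop, and Lambda<2> stores the result.
Its sequential equivalent is the sketch below (the BODY macros use RAJA views;
this flat row-major indexing is an assumption for illustration):

    // Sequential shape of the first 2MM kernel; dot plays the Params<0> role.
    void twomm_first(double* tmp, const double* A, const double* B,
                     double alpha, int ni, int nj, int nk)
    {
      for (int i = 0; i < ni; ++i) {
        for (int j = 0; j < nj; ++j) {
          double dot = 0.0;                              // Lambda<0> (BODY1)
          for (int k = 0; k < nk; ++k) {
            dot += alpha * A[i*nk + k] * B[k*nj + j];    // Lambda<1> (BODY2)
          }
          tmp[i*nj + j] = dot;                           // Lambda<2> (BODY3)
        }
      }
    }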
diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp
index e11d4889b..541682454 100644
--- a/src/polybench/POLYBENCH_2MM.hpp
+++ b/src/polybench/POLYBENCH_2MM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -127,18 +127,23 @@ class POLYBENCH_2MM : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Index_type m_ni;
   Index_type m_nj;
diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp
index 9131a629a..401660aca 100644
--- a/src/polybench/POLYBENCH_3MM-Cuda.cpp
+++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -159,22 +159,31 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_3MM_1_NBLOCKS_CUDA;
-      poly_3mm_1<block_size>
-                <<<nblocks1, nthreads_per_block, shmem, res.get_stream()>>>(E, A, B,
-                                                                            ni, nj, nk);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_3mm_1<block_size>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        E, A, B,
+        ni, nj, nk );
 
       POLY_3MM_2_NBLOCKS_CUDA;
-      poly_3mm_2<block_size>
-                <<<nblocks2, nthreads_per_block, shmem, res.get_stream()>>>(F, C, D,
-                                                                            nj, nl, nm);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_3mm_2<block_size>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        F, C, D,
+        nj, nl, nm );
 
       POLY_3MM_3_NBLOCKS_CUDA;
-      poly_3mm_3<block_size>
-                <<<nblocks3, nthreads_per_block, shmem, res.get_stream()>>>(G, E, F,
-                                                                            ni, nl, nj);
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel(
+        (poly_3mm_3<block_size>),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        G, E, F,
+        ni, nl, nj );
 
     }
     stopTimer();
@@ -188,43 +197,55 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_3MM_1_NBLOCKS_CUDA;
-      poly_3mm_1_lam<block_size>
-                    <<<nblocks1, nthreads_per_block, shmem, res.get_stream()>>>(ni, nj,
-        [=] __device__ (Index_type i, Index_type j) {
-          POLYBENCH_3MM_BODY1;
-          for (Index_type k=0; k < nk; ++k) {
-            POLYBENCH_3MM_BODY2;
-          }
-          POLYBENCH_3MM_BODY3;
+
+      auto poly_3mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
+        POLYBENCH_3MM_BODY1;
+        for (Index_type k=0; k < nk; ++k) {
+          POLYBENCH_3MM_BODY2;
         }
-      );
-      cudaErrchk( cudaGetLastError() );
+        POLYBENCH_3MM_BODY3;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_3mm_1_lam<block_size, decltype(poly_3mm_1_lambda)>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_3mm_1_lambda );
 
       POLY_3MM_2_NBLOCKS_CUDA;
-      poly_3mm_2_lam<block_size>
-                    <<<nblocks2, nthreads_per_block, shmem, res.get_stream()>>>(nj, nl,
-        [=] __device__ (Index_type j, Index_type l) {
-          POLYBENCH_3MM_BODY4;
-          for (Index_type m=0; m < nm; ++m) {
-            POLYBENCH_3MM_BODY5;
-          }
-          POLYBENCH_3MM_BODY6;
-        }
-      );
-      cudaErrchk( cudaGetLastError() );
+
+      auto poly_3mm_2_lambda = [=] __device__ (Index_type j, Index_type l) {
+        POLYBENCH_3MM_BODY4;
+        for (Index_type m=0; m < nm; ++m) {
+          POLYBENCH_3MM_BODY5;
+        }
+        POLYBENCH_3MM_BODY6;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_3mm_2_lam<block_size, decltype(poly_3mm_2_lambda)>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        nj, nl, poly_3mm_2_lambda );
 
       POLY_3MM_3_NBLOCKS_CUDA;
-      poly_3mm_3_lam<block_size>
-                    <<<nblocks3, nthreads_per_block, shmem, res.get_stream()>>>(ni, nl,
-        [=] __device__ (Index_type i, Index_type l) {
-          POLYBENCH_3MM_BODY7;
-          for (Index_type j=0; j < nj; ++j) {
-            POLYBENCH_3MM_BODY8;
-          }
-          POLYBENCH_3MM_BODY9;
+
+      auto poly_3mm_3_lambda = [=] __device__ (Index_type i, Index_type l) {
+        POLYBENCH_3MM_BODY7;
+        for (Index_type j=0; j < nj; ++j) {
+          POLYBENCH_3MM_BODY8;
         }
-      );
-      cudaErrchk( cudaGetLastError() );
+        POLYBENCH_3MM_BODY9;
+      };
+
+      RPlaunchCudaKernel(
+        (poly_3mm_3_lam<block_size, decltype(poly_3mm_3_lambda)>),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_3mm_3_lambda );
 
     }
     stopTimer();
diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp
index 2cdf3df13..53e106ad1 100644
--- a/src/polybench/POLYBENCH_3MM-Hip.cpp
+++ b/src/polybench/POLYBENCH_3MM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -158,25 +158,31 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       POLY_3MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_1<block_size>),
-                         dim3(nblocks1) , dim3(nthreads_per_block), shmem, res.get_stream(),
-                         E, A, B,
-                         ni, nj, nk);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_3mm_1<block_size>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        E, A, B,
+        ni, nj, nk );
 
       POLY_3MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_2<block_size>),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         F, C, D,
-                         nj, nl, nm);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_3mm_2<block_size>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        F, C, D,
+        nj, nl, nm );
 
       POLY_3MM_3_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_3<block_size>),
-                         dim3(nblocks3), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         G, E, F,
-                         ni, nl, nj);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel(
+        (poly_3mm_3<block_size>),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        G, E, F,
+        ni, nl, nj );
 
     }
     stopTimer();
@@ -189,6 +195,8 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
       POLY_3MM_THREADS_PER_BLOCK_HIP;
       constexpr size_t shmem = 0;
 
+      POLY_3MM_1_NBLOCKS_HIP;
+
       auto poly_3mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
         POLYBENCH_3MM_BODY1;
         for (Index_type k=0; k < nk; ++k) {
@@ -197,11 +205,14 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY3;
       };
 
-      POLY_3MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_1_lam<block_size, decltype(poly_3mm_1_lambda)>),
-                         dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nj, poly_3mm_1_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_3mm_1_lam<block_size, decltype(poly_3mm_1_lambda)>),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_3mm_1_lambda );
+
+      POLY_3MM_2_NBLOCKS_HIP;
 
       auto poly_3mm_2_lambda = [=] __device__ (Index_type j, Index_type l) {
         POLYBENCH_3MM_BODY4;
@@ -211,11 +222,14 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY6;
       };
 
-      POLY_3MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_2_lam<block_size, decltype(poly_3mm_2_lambda)>),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         nj, nl, poly_3mm_2_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_3mm_2_lam<block_size, decltype(poly_3mm_2_lambda)>),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        nj, nl, poly_3mm_2_lambda );
+
+      POLY_3MM_3_NBLOCKS_HIP;
 
       auto poly_3mm_3_lambda = [=] __device__ (Index_type i, Index_type l) {
         POLYBENCH_3MM_BODY7;
@@ -225,11 +239,12 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY9;
       };
 
-      POLY_3MM_3_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_3_lam<block_size, decltype(poly_3mm_3_lambda)>),
-                         dim3(nblocks3), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nl, poly_3mm_3_lambda);
-      hipErrchk( hipGetLastError() );
+      RPlaunchHipKernel(
+        (poly_3mm_3_lam<block_size, decltype(poly_3mm_3_lambda)>),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_3mm_3_lambda );
 
     }
     stopTimer();
diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp
index 966853d7d..19b15098a 100644
--- a/src/polybench/POLYBENCH_3MM-OMP.cpp
+++ b/src/polybench/POLYBENCH_3MM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
index c25a49dee..d165426d1 100644
--- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp
index 619b2ff10..24098e109 100644
--- a/src/polybench/POLYBENCH_3MM-Seq.cpp
+++ b/src/polybench/POLYBENCH_3MM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-Sycl.cpp b/src/polybench/POLYBENCH_3MM-Sycl.cpp
new file mode 100644
index 000000000..b6abea7b9
--- /dev/null
+++ b/src/polybench/POLYBENCH_3MM-Sycl.cpp
@@ -0,0 +1,216 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_3MM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define in_wg_sz (32)
+#define out_wg_sz (work_group_size / in_wg_sz)
+
+
+template < size_t work_group_size >
+void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_3MM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      sycl::range<3> global_dim1(1,
+                                 out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, in_wg_sz));
+
+      sycl::range<3> global_dim2(1,
+                                 out_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
+
+      sycl::range<3> global_dim3(1,
+                                 out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
+
+      sycl::range<3> wkgroup_dim(1, out_wg_sz, in_wg_sz);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type i = item.get_global_id(1);
+          Index_type j = item.get_global_id(2);
+
+          if (i < ni && j < nj) {
+            POLYBENCH_3MM_BODY1;
+            for (Index_type k=0; k < nk; ++k) {
+              POLYBENCH_3MM_BODY2;
+            }
+            POLYBENCH_3MM_BODY3;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim2, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type j = item.get_global_id(1);
+          Index_type l = item.get_global_id(2);
+
+          if (j < nj && l < nl) {
+            POLYBENCH_3MM_BODY4;
+            for (Index_type m=0; m < nm; ++m) {
+              POLYBENCH_3MM_BODY5;
+            }
+            POLYBENCH_3MM_BODY6;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<3>( global_dim3, wkgroup_dim),
+                       [=] (sycl::nd_item<3> item) {
+
+          Index_type i = item.get_global_id(1);
+          Index_type l = item.get_global_id(2);
+
+          if (i < ni && l < nl) {
+            POLYBENCH_3MM_BODY7;
+            for (Index_type j=0; j < nj; ++j) {
+              POLYBENCH_3MM_BODY8;
+            }
+            POLYBENCH_3MM_BODY9;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_3MM_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_1<out_wg_sz>,
+            RAJA::statement::For<1, RAJA::sycl_global_2<in_wg_sz>,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,
+              RAJA::statement::For<2, RAJA::seq_exec,
+                RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
+              >,
+              RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nj},
+                         RAJA::RangeSegment{0, nk}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_3MM_BODY1_RAJA;
+        },
+        [=] (Index_type i, Index_type j, Index_type k,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY2_RAJA;
+        },
+        [=] (Index_type i, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY3_RAJA;
+        }
+
+      );
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, nj},
+                         RAJA::RangeSegment{0, nl},
+                         RAJA::RangeSegment{0, nm}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_3MM_BODY4_RAJA;
+        },
+        [=] (Index_type j, Index_type l, Index_type m,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY5_RAJA;
+        },
+        [=] (Index_type j, Index_type l,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY6_RAJA;
+        }
+
+      );
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nl},
+                         RAJA::RangeSegment{0, nj}),
+        RAJA::tuple<Real_type>{0.0},
+        res,
+
+        [=] (Real_type &dot) {
+          POLYBENCH_3MM_BODY7_RAJA;
+        },
+        [=] (Index_type i, Index_type l, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY8_RAJA;
+        },
+        [=] (Index_type i, Index_type l,
+             Real_type &dot) {
+          POLYBENCH_3MM_BODY9_RAJA;
+        }
+
+      );
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  POLYBENCH_3MM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_3MM, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp
index a649e2e89..eb8e63d66 100644
--- a/src/polybench/POLYBENCH_3MM.cpp
+++ b/src/polybench/POLYBENCH_3MM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -35,11 +35,11 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params)
   setDefaultProblemSize( ni_default * nj_default );
   setDefaultReps(2);
 
-  m_ni = std::sqrt( getTargetProblemSize() ) + 1;
+  m_ni = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1;
   m_nj = m_ni;
-  m_nk = nk_default;
+  m_nk = Index_type(double(nk_default)/ni_default*m_ni);
   m_nl = m_ni;
-  m_nm = nm_default;
+  m_nm = Index_type(double(nm_default)/ni_default*m_ni);
 
   setActualProblemSize( std::max( std::max( m_ni*m_nj, m_nj*m_nl ),
@@ -47,17 +47,20 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params)
   setItsPerRep( m_ni*m_nj + m_nj*m_nl + m_ni*m_nl );
   setKernelsPerRep(3);
-  setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nk +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
+                      1*sizeof(Real_type ) * m_nj * m_nk +
 
-                  (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_nj * m_nl +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nm +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nl * m_nm +
+                      1*sizeof(Real_type ) * m_nj * m_nm +
+                      1*sizeof(Real_type ) * m_nl * m_nm +
 
-                  (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nl +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nl );
+                      1*sizeof(Real_type ) * m_ni * m_nj +
+                      1*sizeof(Real_type ) * m_nj * m_nl );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
+
+                         1*sizeof(Real_type ) * m_nj * m_nl +
+
+                         1*sizeof(Real_type ) * m_ni * m_nl );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_ni*m_nj*m_nk +
                  2 * m_nj*m_nl*m_nm +
                  2 * m_ni*m_nj*m_nl );
@@ -86,6 +89,9 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 POLYBENCH_3MM::~POLYBENCH_3MM()
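As in 2MM above, POLYBENCH_3MM now scales m_nk and m_nm with m_ni instead of
pinning them at their compile-time defaults, so the matrices keep the default
aspect ratio as the target problem size changes. Illustrative arithmetic (the
default values here are made up, not taken from this patch):

    // Sketch: a hypothetical nk_default/ni_default ratio preserved at a new size.
    constexpr double nk_default = 1100.0, ni_default = 1000.0;  // made up
    constexpr long   m_ni = 2000;                               // made up
    constexpr long   m_nk = long(nk_default / ni_default * m_ni);  // == 2200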
a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 4331e3930..0d0cf79af 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -153,18 +153,23 @@ class POLYBENCH_3MM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index fc3348fff..ff5216dc9 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
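The sizing change in POLYBENCH_3MM.cpp above replaces the old "+ 1" rounding fudge with "+ sqrt(2) - 1" and, more significantly, scales nk and nm in proportion to ni instead of pinning them at their defaults, so all five extents now grow together with the target problem size. Restated as plain arithmetic (a sketch; the default extent values are parameters here because they are not shown in the hunks above):

    #include <cmath>
    #include <cstdint>

    using Index_type = std::int64_t;   // stand-in for the suite's Index_type
    struct Extents3MM { Index_type ni, nj, nk, nl, nm; };

    Extents3MM size_3mm(Index_type target, Index_type ni_default,
                        Index_type nk_default, Index_type nm_default)
    {
      Index_type ni = Index_type(std::sqrt(double(target)) + std::sqrt(2.0) - 1);
      return { ni,
               ni,                                              // nj tracks ni
               Index_type(double(nk_default)/ni_default * ni),  // was fixed at nk_default
               ni,                                              // nl tracks ni
               Index_type(double(nm_default)/ni_default * ni) };// was fixed at nm_default
    }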
// @@ -23,7 +23,7 @@ namespace polybench template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi1(const Index_type n, +__global__ void poly_adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -43,7 +43,7 @@ __global__ void adi1(const Index_type n, template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi2(const Index_type n, +__global__ void poly_adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -63,7 +63,7 @@ __global__ void adi2(const Index_type n, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void adi_lam(const Index_type n, +__global__ void poly_adi_lam(const Index_type n, Lambda body) { Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; @@ -92,15 +92,21 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - adi1<<>>(n, - a, b, c, d, f, - P, Q, U, V); - cudaErrchk( cudaGetLastError() ); - - adi2<<>>(n, - a, c, d, e, f, - P, Q, U, V); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_adi1), + grid_size, block_size, + shmem, res.get_stream(), + n, + a, b, c, + d, f, + P, Q, U, V ); + + RPlaunchCudaKernel( (poly_adi2), + grid_size, block_size, + shmem, res.get_stream(), + n, + a, c, d, + e, f, + P, Q, U, V ); } // tstep loop @@ -117,33 +123,39 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - adi_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_ADI_BODY2; - for (Index_type j = 1; j < n-1; ++j) { - POLYBENCH_ADI_BODY3; - } - POLYBENCH_ADI_BODY4; - for (Index_type k = n-2; k >= 1; --k) { - POLYBENCH_ADI_BODY5; - } + auto poly_adi1_lambda = [=] __device__ (Index_type i) { + POLYBENCH_ADI_BODY2; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY3; } - ); - cudaErrchk( cudaGetLastError() ); - - adi_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_ADI_BODY6; - for (Index_type j = 1; j < n-1; ++j) { - POLYBENCH_ADI_BODY7; - } - POLYBENCH_ADI_BODY8; - for (Index_type k = n-2; k >= 1; --k) { - POLYBENCH_ADI_BODY9; - } + POLYBENCH_ADI_BODY4; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY5; } - ); - cudaErrchk( cudaGetLastError() ); + }; + + RPlaunchCudaKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi1_lambda ); + + auto poly_adi2_lambda = [=] __device__ (Index_type i) { + POLYBENCH_ADI_BODY6; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY7; + } + POLYBENCH_ADI_BODY8; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY9; + } + }; + + RPlaunchCudaKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi2_lambda ); } // tstep loop diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index f5791ce88..51a69a986 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
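The ADI conversions above replace raw `<<<...>>>` launches, and the `cudaErrchk( cudaGetLastError() )` that every old call site repeated by hand, with the suite's RPlaunchCudaKernel helper (RPlaunchHipKernel on the HIP side). The helper is defined in the suite's common GPU utilities and is not part of this diff; the sketch below shows only the general shape such a wrapper plausibly has, under that assumption, not its actual definition:

    #include <cuda_runtime.h>
    #include <utility>

    // Hypothetical sketch: forward the launch configuration and arguments,
    // then do the error check the old call sites performed inline.
    template < typename Kernel, typename... Args >
    void launch_cuda_kernel_sketch(Kernel kernel,
                                   dim3 grid, dim3 block,
                                   size_t shmem, cudaStream_t stream,
                                   Args&&... args)
    {
      kernel<<<grid, block, shmem, stream>>>(std::forward<Args>(args)...);
      cudaErrchk( cudaGetLastError() );   // suite macro, as used in the old code
    }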
// See the RAJAPerf/LICENSE file for details. // @@ -23,7 +23,7 @@ namespace polybench template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi1(const Index_type n, +__global__ void poly_adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -43,7 +43,7 @@ __global__ void adi1(const Index_type n, template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi2(const Index_type n, +__global__ void poly_adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -63,7 +63,7 @@ __global__ void adi2(const Index_type n, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void adi_lam(const Index_type n, +__global__ void poly_adi_lam(const Index_type n, Lambda body) { Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; @@ -92,19 +92,21 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((adi1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + RPlaunchHipKernel( (poly_adi1), + grid_size, block_size, + shmem, res.get_stream(), n, - a, b, c, d, f, - P, Q, U, V); - hipErrchk( hipGetLastError() ); + a, b, c, + d, f, + P, Q, U, V ); - hipLaunchKernelGGL((adi2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + RPlaunchHipKernel( (poly_adi2), + grid_size, block_size, + shmem, res.get_stream(), n, - a, c, d, e, f, - P, Q, U, V); - hipErrchk( hipGetLastError() ); + a, c, d, + e, f, + P, Q, U, V ); } // tstep loop @@ -121,7 +123,7 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - auto adi1_lamda = [=] __device__ (Index_type i) { + auto poly_adi1_lambda = [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; @@ -132,12 +134,13 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, adi1_lamda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi1_lambda ); - auto adi2_lamda = [=] __device__ (Index_type i) { + auto poly_adi2_lambda = [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY7; @@ -148,10 +151,11 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, adi2_lamda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi2_lambda ); } // tstep loop diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp index 022888a54..08df1e9f7 100644 --- a/src/polybench/POLYBENCH_ADI-OMP.cpp +++ b/src/polybench/POLYBENCH_ADI-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
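In both the CUDA and HIP lambda variants above, the device lambda is now bound to a named variable before the launch, so the launch call itself stays a single readable statement. Written out with its template arguments, the last launch above plausibly reads as in this sketch, where `decltype` supplies the lambda's type to the kernel template (a sketch, not a verbatim quote of the file):

    auto poly_adi2_lambda = [=] __device__ (Index_type i) {
      /* POLYBENCH_ADI_BODY6..BODY9 expand here */
    };

    RPlaunchHipKernel( (poly_adi_lam<block_size, decltype(poly_adi2_lambda)>),
                       grid_size, block_size,
                       shmem, res.get_stream(),
                       n, poly_adi2_lambda );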
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index c67e5a20a..8cee39e2c 100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp index 899a9a57b..3ec703a50 100644 --- a/src/polybench/POLYBENCH_ADI-Seq.cpp +++ b/src/polybench/POLYBENCH_ADI-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-Sycl.cpp b/src/polybench/POLYBENCH_ADI-Sycl.cpp new file mode 100644 index 000000000..0fb3dfb4c --- /dev/null +++ b/src/polybench/POLYBENCH_ADI-Sycl.cpp @@ -0,0 +1,169 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_ADI.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +template < size_t work_group_size > +void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_ADI_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 1; t <= tsteps; ++t) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(n-2, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0) + 1; + + if (i < n-1) { + POLYBENCH_ADI_BODY2; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY3; + } + POLYBENCH_ADI_BODY4; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY5; + } + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0) + 1; + + if (i < n-1) { + POLYBENCH_ADI_BODY6; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY7; + } + POLYBENCH_ADI_BODY8; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY9; + } + } + + }); + }); + + } // tstep loop + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_ADI_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_2, + RAJA::statement::Lambda<0, RAJA::Segs<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + 
RAJA::statement::Lambda<1, RAJA::Segs<0,1>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>>, + RAJA::statement::For<2, RAJA::seq_exec, + RAJA::statement::Lambda<3, RAJA::Segs<0,2>> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 1; t <= tsteps; ++t) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, + RAJA::RangeSegment{1, n-1}, + RAJA::RangeStrideSegment{n-2, 0, -1}), + res, + + [=] (Index_type i) { + POLYBENCH_ADI_BODY2_RAJA; + }, + [=] (Index_type i, Index_type j) { + POLYBENCH_ADI_BODY3_RAJA; + }, + [=] (Index_type i) { + POLYBENCH_ADI_BODY4_RAJA; + }, + [=] (Index_type i, Index_type k) { + POLYBENCH_ADI_BODY5_RAJA; + } + ); + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, + RAJA::RangeSegment{1, n-1}, + RAJA::RangeStrideSegment{n-2, 0, -1}), + res, + + [=] (Index_type i) { + POLYBENCH_ADI_BODY6_RAJA; + }, + [=] (Index_type i, Index_type j) { + POLYBENCH_ADI_BODY7_RAJA; + }, + [=] (Index_type i) { + POLYBENCH_ADI_BODY8_RAJA; + }, + [=] (Index_type i, Index_type k) { + POLYBENCH_ADI_BODY9_RAJA; + } + ); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_ADI : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_ADI, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 7a31468a6..1347975f2 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
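The RAJA_SYCL variant of ADI above drives the backward substitution with `RAJA::RangeStrideSegment{n-2, 0, -1}`, which visits k = n-2, n-3, ..., 1 (the end bound is exclusive), exactly matching the `for (Index_type k = n-2; k >= 1; --k)` loops in the other variants. A minimal self-contained illustration:

    #include "RAJA/RAJA.hpp"
    #include <cstdio>

    int main()
    {
      RAJA::Index_type n = 6;
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeStrideSegment(n-2, 0, -1),
        [=] (RAJA::Index_type k) {
          std::printf("%ld ", static_cast<long>(k));   // prints: 4 3 2 1
        });
      return 0;
    }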
// @@ -20,12 +20,12 @@ namespace polybench POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) : KernelBase(rajaperf::Polybench_ADI, params) { - Index_type n_default = 1000; + Index_type n_default = 1002; setDefaultProblemSize( (n_default-2) * (n_default-2) ); setDefaultReps(4); - m_n = std::sqrt( getTargetProblemSize() ) + 1; + m_n = std::sqrt( getTargetProblemSize() ) + 2 + std::sqrt(2)-1; m_tsteps = 4; setItsPerRep( m_tsteps * ( (m_n-2) + (m_n-2) ) ); @@ -34,8 +34,11 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setActualProblemSize( (m_n-2) * (m_n-2) ); setKernelsPerRep( m_tsteps * 2 ); - setBytesPerRep( m_tsteps * ( (3*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_n * (m_n-2) + - (3*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_n * (m_n-2) ) ); + setBytesReadPerRep((3*sizeof(Real_type ) * m_n * (m_n-2) + + 3*sizeof(Real_type ) * m_n * (m_n-2)) * m_tsteps ); + setBytesWrittenPerRep((3*sizeof(Real_type ) * m_n * (m_n-2) + + 3*sizeof(Real_type ) * m_n * (m_n-2)) * m_tsteps ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( (15 + 2) * (m_n-2)*(m_n-2) + (15 + 2) * (m_n-2)*(m_n-2) ) ); @@ -63,6 +66,9 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_ADI::~POLYBENCH_ADI() diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index 848fb9dc4..613202509 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -195,17 +195,22 @@ class POLYBENCH_ADI : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_n; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index a787276ec..e83dc9590 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
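POLYBENCH_ADI.cpp above also switches from a single combined setBytesPerRep() figure to the split accounting used throughout this change: bytes read, bytes written, and atomically modified bytes are reported separately. Restated as plain arithmetic, with Real_type assumed to be double:

    #include <cstddef>

    struct TrafficPerRep { std::size_t read, written, atomic; };

    TrafficPerRep adi_traffic(std::size_t n, std::size_t tsteps)
    {
      const std::size_t R = sizeof(double);
      // Each of the two kernels sweeps three arrays over an n x (n-2) extent.
      const std::size_t per_kernel = 3 * R * n * (n - 2);
      return { (per_kernel + per_kernel) * tsteps,   // setBytesReadPerRep
               (per_kernel + per_kernel) * tsteps,   // setBytesWrittenPerRep
               0 };                                  // setBytesAtomicModifyWrittenPerRep
    }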
// @@ -83,11 +83,17 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_atax_1<<>>(A, x, y, tmp, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_atax_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, tmp, + N ); - poly_atax_2<<>>(A, tmp, y, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_atax_2), + grid_size, block_size, + shmem, res.get_stream(), + A, tmp, y, + N ); } stopTimer(); @@ -100,27 +106,33 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_atax_lam<<>>(N, - [=] __device__ (Index_type i) { - POLYBENCH_ATAX_BODY1; - for (Index_type j = 0; j < N; ++j ) { - POLYBENCH_ATAX_BODY2; - } - POLYBENCH_ATAX_BODY3; + auto poly_atax1_lambda = [=] __device__ (Index_type i) { + POLYBENCH_ATAX_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_ATAX_BODY2; } - ); - cudaErrchk( cudaGetLastError() ); - - poly_atax_lam<<>>(N, - [=] __device__ (Index_type j) { - POLYBENCH_ATAX_BODY4; - for (Index_type i = 0; i < N; ++i ) { - POLYBENCH_ATAX_BODY5; - } - POLYBENCH_ATAX_BODY6; + POLYBENCH_ATAX_BODY3; + }; + + RPlaunchCudaKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax1_lambda ); + + auto poly_atax2_lambda = [=] __device__ (Index_type j) { + POLYBENCH_ATAX_BODY4; + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_ATAX_BODY5; } - ); - cudaErrchk( cudaGetLastError() ); + POLYBENCH_ATAX_BODY6; + }; + + RPlaunchCudaKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax2_lambda ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 3ac954ab8..bbed90a83 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -83,15 +83,17 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_atax_1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, y, tmp, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_atax_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, tmp, + N ); - hipLaunchKernelGGL((poly_atax_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, tmp, y, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_atax_2), + grid_size, block_size, + shmem, res.get_stream(), + A, tmp, y, + N ); } stopTimer(); @@ -104,7 +106,7 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - auto poly_atax_1_lambda = [=] __device__ (Index_type i) { + auto poly_atax1_lambda = [=] __device__ (Index_type i) { POLYBENCH_ATAX_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_ATAX_BODY2; @@ -112,12 +114,13 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) POLYBENCH_ATAX_BODY3; }; - hipLaunchKernelGGL((poly_atax_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - N, poly_atax_1_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax1_lambda ); - auto poly_atax_2_lambda = [=] __device__ (Index_type j) { + auto poly_atax2_lambda = [=] __device__ (Index_type j) { POLYBENCH_ATAX_BODY4; for (Index_type i = 0; i < N; ++i ) { POLYBENCH_ATAX_BODY5; @@ -125,10 +128,11 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) POLYBENCH_ATAX_BODY6; }; - hipLaunchKernelGGL((poly_atax_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - N, poly_atax_2_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax2_lambda ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp index fda8ab7fd..a3880cc8f 100644 --- a/src/polybench/POLYBENCH_ATAX-OMP.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index e9e13e9cd..ce7ba4843 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp index 05a19093d..4791b7018 100644 --- a/src/polybench/POLYBENCH_ATAX-Seq.cpp +++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-Sycl.cpp b/src/polybench/POLYBENCH_ATAX-Sycl.cpp new file mode 100644 index 000000000..110e58cd0 --- /dev/null +++ b/src/polybench/POLYBENCH_ATAX-Sycl.cpp @@ -0,0 +1,161 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_ATAX.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +template < size_t work_group_size > +void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_ATAX_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + + if (i < N) { + POLYBENCH_ATAX_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_ATAX_BODY2; + } + POLYBENCH_ATAX_BODY3; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type j = item.get_global_id(0); + + if (j < N) { + POLYBENCH_ATAX_BODY4; + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_ATAX_BODY5; + } + POLYBENCH_ATAX_BODY6; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_ATAX_VIEWS_RAJA; + + using EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> + > + > + >; + + using EXEC_POL2 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<1, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<1>, RAJA::Params<0>>, + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<1>, RAJA::Params<0>> + > + > + >; + + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_param_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple{0.0}, + res, + + [=] (Index_type i, Real_type &dot) { + POLYBENCH_ATAX_BODY1_RAJA; + }, 
+ [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY2_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_ATAX_BODY3_RAJA; + } + + ); + + RAJA::kernel_param_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple{0.0}, + res, + + [=] (Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY4_RAJA; + }, + [=] (Index_type i, Index_type j , Real_type &dot) { + POLYBENCH_ATAX_BODY5_RAJA; + }, + [=] (Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY6_RAJA; + } + + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_ATAX : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_ATAX, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index 5a9d15e89..e7cb48875 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,18 +26,21 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(100); - m_N = std::sqrt( getTargetProblemSize() )+1; + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); setItsPerRep( m_N + m_N ); setKernelsPerRep(2); - setBytesPerRep( (2*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N + - - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N + + + 1*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); @@ -65,6 +68,9 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_ATAX::~POLYBENCH_ATAX() diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index f94ade140..e6d43bfbc 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
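The Base_SYCL ATAX kernels above use the same guarded 1D launch recipe as the other new SYCL files: round the global size up to a work-group multiple, then mask the overhang inside the kernel. A self-contained sketch of the recipe (the body is a stand-in, not an ATAX macro):

    #include <sycl/sycl.hpp>
    #include <cstddef>

    void guarded_launch(sycl::queue& q, double* y, std::size_t N,
                        std::size_t wg /* work_group_size, e.g. 256 */)
    {
      const std::size_t global = wg * ((N + wg - 1) / wg);  // RAJA_DIVIDE_CEILING_INT
      q.submit([&](sycl::handler& h) {
        h.parallel_for(sycl::nd_range<1>(global, wg),
          [=](sycl::nd_item<1> item) {
            std::size_t i = item.get_global_id(0);
            if (i < N) {     // overhang work-items fall through
              y[i] = 0.0;    // stand-in body
            }
          });
      });
    }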
// @@ -115,17 +115,22 @@ class POLYBENCH_ATAX : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Real_ptr m_tmp; diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 415e2fc94..b71772a99 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -160,23 +160,32 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1<<>>(ey, fict, ny, t); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (poly_fdtd2d_1), + grid_size1, block_size, + shmem, res.get_stream(), + ey, fict, ny, t ); FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_2 - <<>>(ey, hz, nx, ny); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_fdtd2d_2), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ey, hz, nx, ny ); - poly_fdtd2d_3 - <<>>(ex, hz, nx, ny); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_fdtd2d_3), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ex, hz, nx, ny ); - poly_fdtd2d_4 - <<>>(hz, ex, ey, nx, ny); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_fdtd2d_4), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + hz, ex, ey, nx, ny ); } // tstep loop @@ -193,38 +202,55 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1_lam<<>>(ny, - [=] __device__ (Index_type j) { - POLYBENCH_FDTD_2D_BODY1; - } - ); - FDTD_2D_THREADS_PER_BLOCK_CUDA; - FDTD_2D_NBLOCKS_CUDA; + auto poly_fdtd2d_1_lambda = [=] __device__ (Index_type j) { + POLYBENCH_FDTD_2D_BODY1; + }; - poly_fdtd2d_2_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY2; - } - ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_fdtd2d_1_lam), + grid_size1, block_size, + shmem, res.get_stream(), + ny, poly_fdtd2d_1_lambda ); - poly_fdtd2d_3_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY3; - } - ); - cudaErrchk( cudaGetLastError() ); + FDTD_2D_THREADS_PER_BLOCK_CUDA; + FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_4_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY4; - } - ); - cudaErrchk( cudaGetLastError() ); + 
auto poly_fdtd2d_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_2_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_2_lambda ); + + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY3; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_3_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_3_lambda ); + + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY4; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_4_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_4_lambda ); } // tstep loop diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index ad8bd66d1..ddb62d9a5 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -159,28 +159,32 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1), - dim3(grid_size1), dim3(block_size), shmem, res.get_stream(), - ey, fict, ny, t); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (poly_fdtd2d_1), + grid_size1, block_size, + shmem, res.get_stream(), + ey, fict, ny, t ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_fdtd2d_2), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - ey, hz, nx, ny); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_2), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ey, hz, nx, ny ); - hipLaunchKernelGGL((poly_fdtd2d_3), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - ex, hz, nx, ny); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_3), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ex, hz, nx, ny ); - hipLaunchKernelGGL((poly_fdtd2d_4), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - hz, ex, ey, nx, ny); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_4), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + hz, ex, ey, nx, ny ); } // tstep loop @@ -196,48 +200,56 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; + const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); + auto poly_fdtd2d_1_lambda = [=] __device__ (Index_type j) { POLYBENCH_FDTD_2D_BODY1; }; - const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1_lam), - dim3(grid_size1), dim3(block_size), shmem, res.get_stream(), - ny, poly_fdtd2d_1_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_fdtd2d_1_lam), + grid_size1, block_size, + shmem, res.get_stream(), + ny, poly_fdtd2d_1_lambda ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - auto poly_fdtd2d_2_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY2; - }; + auto poly_fdtd2d_2_lambda = [=] 
__device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY2; + }; - hipLaunchKernelGGL((poly_fdtd2d_2_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_2_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_2_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_2_lambda ); - auto poly_fdtd2d_3_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY3; - }; + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY3; + }; - hipLaunchKernelGGL((poly_fdtd2d_3_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_3_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_3_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_3_lambda ); - auto poly_fdtd2d_4_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY4; - }; + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY4; + }; - hipLaunchKernelGGL((poly_fdtd2d_4_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_4_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_fdtd2d_4_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_4_lambda ); } // tstep loop diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp index 28d06bdc7..6eaf696f3 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index c34d939ad..be8c0491e 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp index cc23d5a18..ed8e43833 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp new file mode 100644 index 000000000..b409b7569 --- /dev/null +++ b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp @@ -0,0 +1,185 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_FDTD_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_FDTD_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (t = 0; t < tsteps; ++t) { + + const size_t global_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(ny, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size1, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type j = item.get_global_id(0); + if (j < ny) { + POLYBENCH_FDTD_2D_BODY1; + } + + }); + }); + + sycl::range<3> global_dim234(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(nx, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(ny, j_wg_sz)); + + sycl::range<3> wkgroup_dim234(1, i_wg_sz, j_wg_sz); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i > 0 && i < nx && j < ny) { + POLYBENCH_FDTD_2D_BODY2; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < nx && j > 0 && j < ny) { + POLYBENCH_FDTD_2D_BODY3; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < nx-1 && j < ny-1) { + POLYBENCH_FDTD_2D_BODY4; + } + + }); + }); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_FDTD_2D_VIEWS_RAJA; + + using EXEC_POL1 = RAJA::sycl_exec; + + using EXEC_POL234 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (t = 0; t < tsteps; ++t) { + + RAJA::forall( res, RAJA::RangeSegment(0, ny), + [=] (Index_type j) { + POLYBENCH_FDTD_2D_BODY1_RAJA; + }); + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, nx}, + RAJA::RangeSegment{0, ny}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY2_RAJA; + } + ); + + 
RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, nx}, + RAJA::RangeSegment{1, ny}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY3_RAJA; + } + ); + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, nx-1}, + RAJA::RangeSegment{0, ny-1}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY4_RAJA; + } + ); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_FDTD_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_FDTD_2D, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index ed2432d87..7f87fd3ef 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -31,7 +31,7 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) nx_default * (ny_default-1) ) ); setDefaultReps(8); - m_nx = std::sqrt( getTargetProblemSize() ) + 1; + m_nx = std::sqrt( getTargetProblemSize() ) + 1 + std::sqrt(2)-1; m_ny = m_nx; m_tsteps = 40; @@ -43,18 +43,25 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) m_nx*(m_ny-1) + (m_nx-1)*(m_ny-1) ) ); setKernelsPerRep(m_tsteps * 4); - setBytesPerRep( m_tsteps * ( (0*sizeof(Real_type ) + 1*sizeof(Real_type )) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ny + + setBytesReadPerRep((1*sizeof(Real_type ) + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * m_ny + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * m_ny + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + 1*sizeof(Real_type ) * m_nx * m_ny + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * (m_ny-1) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * m_ny + + 1*sizeof(Real_type ) * m_nx * (m_ny-1) + + 1*sizeof(Real_type ) * m_nx * m_ny + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * (m_ny-1) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * m_ny + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * (m_ny-1) ) ); + 1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + 1*sizeof(Real_type ) * m_nx * (m_ny-1)) * m_tsteps ); + setBytesWrittenPerRep((1*sizeof(Real_type ) * m_ny + + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + + 1*sizeof(Real_type ) * m_nx * (m_ny-1) + + + 1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1)) * m_tsteps ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 0 * m_ny + 3 * (m_nx-1)*m_ny + 3 * m_nx*(m_ny-1) + @@ -84,6 +91,9 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_FDTD_2D::~POLYBENCH_FDTD_2D() diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index e1d1b67c3..685c4cf40 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -1,5 +1,5 @@ 
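In the RAJA_SYCL variant of FDTD-2D above, the 1D edge update runs through RAJA::forall while the three interior sweeps go through RAJA::kernel_resource. The forall policy is RAJA::sycl_exec, whose template parameters (a work-group size and an async flag) carry the tuning point; spelled out with literal values, the edge update plausibly looks like this sketch, with the macro body replaced by an explicit stand-in:

    #include "RAJA/RAJA.hpp"

    using EXEC_POL1 = RAJA::sycl_exec<256 /*work_group_size*/, true /*async*/>;

    void edge_update(RAJA::resources::Sycl res, double* ey, double fict_t,
                     RAJA::Index_type ny)
    {
      RAJA::forall<EXEC_POL1>(res, RAJA::RangeSegment(0, ny),
        [=] (RAJA::Index_type j) {
          ey[j] = fict_t;   // stand-in for POLYBENCH_FDTD_2D_BODY1_RAJA
        });
    }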
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -113,18 +113,23 @@ class POLYBENCH_FDTD_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_nx; Index_type m_ny; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index 7aff525c2..5edb43b97 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -85,10 +85,11 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_floyd_warshall - <<>>(pout, pin, - k, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_floyd_warshall), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + pout, pin, k, N ); } @@ -106,12 +107,17 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_floyd_warshall_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FLOYD_WARSHALL_BODY; - } - ); + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY; + }; + + RPlaunchCudaKernel( + (poly_floyd_warshall_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_floyd_warshall_lambda ); } @@ -138,10 +144,11 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, [=] __device__ (Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index c3581748c..ec408396a 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -85,11 +85,11 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_floyd_warshall), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - pout, pin, - k, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_floyd_warshall), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + pout, pin, k, N ); } @@ -103,20 +103,21 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) for (Index_type k = 0; k < N; ++k) { - auto poly_floyd_warshall_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FLOYD_WARSHALL_BODY; - }; - POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP; POLY_FLOYD_WARSHALL_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL( - (poly_floyd_warshall_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_floyd_warshall_lambda); - hipErrchk( hipGetLastError() ); + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY; + }; + + RPlaunchHipKernel( + (poly_floyd_warshall_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_floyd_warshall_lambda ); } @@ -143,10 +144,11 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, [=] __device__ (Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp index 1b2f57e4d..8aab52a55 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index e0e5d6d54..f3af4a088 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp index 36ac66a84..8b088db8e 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp new file mode 100644 index 000000000..415470801 --- /dev/null +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp @@ -0,0 +1,115 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_FLOYD_WARSHALL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_FLOYD_WARSHALL::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_FLOYD_WARSHALL_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(N, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(N, j_wg_sz)); + + sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = 0; k < N; ++k) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if ( i < N && j < N ) { + POLYBENCH_FLOYD_WARSHALL_BODY; + } + + }); + }); + + } + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<2, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, + [=] (Index_type k, Index_type i, Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_FLOYD_WARSHALL, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 03c1e65ba..149ae87aa 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
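Both new Floyd-Warshall variants above keep k strictly sequential: Base_SYCL loops over k on the host and submits one 2D kernel per step, and RAJA_SYCL places `For<0, RAJA::seq_exec>` outside SyclKernelAsync. That ordering is required because step k reads the distances produced by step k-1; only the i,j sweep within a step is independent. A plain-C++ statement of the classic in-place algorithm makes the dependency visible (a reference sketch, not the suite's pin/pout macro body):

    #include <algorithm>
    #include <cstddef>

    void floyd_warshall_ref(double* p, std::size_t N)
    {
      for (std::size_t k = 0; k < N; ++k) {     // must stay serial: step k
        for (std::size_t i = 0; i < N; ++i)     //   reads step k-1 results
          for (std::size_t j = 0; j < N; ++j)   // i,j sweep is parallelizable
            p[i*N + j] = std::min(p[i*N + j], p[i*N + k] + p[k*N + j]);
      }
    }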
// @@ -26,14 +26,16 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(8); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); setItsPerRep( m_N*m_N ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * m_N*m_N*m_N ); checksum_scale_factor = 1.0 * @@ -60,6 +62,9 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_FLOYD_WARSHALL::~POLYBENCH_FLOYD_WARSHALL() diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index e8a067377..618f6e0f6 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -76,18 +76,23 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index afaa17185..2307900dd 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
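Two constructor changes in the POLYBENCH_FLOYD_WARSHALL.cpp hunk above recur for every kernel in this diff: the single setBytesPerRep estimate is split into separate read, write, and atomic-modify-write counters, and the edge length becomes sqrt(target) + sqrt(2) - 1 rather than sqrt(target) + 1, which keeps the rounded m_N * m_N closer to the requested problem size. The Floyd-Warshall traffic itself is simple, one N x N matrix read and one written per rep; a standalone illustration of the arithmetic (not Suite code, target value made up):

    #include <cmath>
    #include <cstddef>
    #include <iostream>

    int main()
    {
      using Real_type = double;
      const double target = 1000.0 * 1000.0;  // assumed target problem size
      const std::size_t N =
          static_cast<std::size_t>(std::sqrt(target) + std::sqrt(2.0) - 1.0);

      const std::size_t bytes_read    = sizeof(Real_type) * N * N;  // pin
      const std::size_t bytes_written = sizeof(Real_type) * N * N;  // pout
      std::cout << "N = " << N << ", read/rep = " << bytes_read
                << " B, written/rep = " << bytes_written << " B\n";
      return 0;
    }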
// @@ -90,11 +90,13 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemm - <<>>(C, A, B, - alpha, beta, - ni, nj, nk); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_gemm), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + C, A, B, + alpha, beta, + ni, nj, nk ); } stopTimer(); @@ -108,18 +110,21 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemm_lam - <<>>(ni, nj, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMM_BODY1; - POLYBENCH_GEMM_BODY2; - for (Index_type k = 0; k < nk; ++k ) { - POLYBENCH_GEMM_BODY3; - } - POLYBENCH_GEMM_BODY4; + auto poly_gemm_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMM_BODY1; + POLYBENCH_GEMM_BODY2; + for (Index_type k = 0; k < nk; ++k ) { + POLYBENCH_GEMM_BODY3; } - ); - cudaErrchk( cudaGetLastError() ); + POLYBENCH_GEMM_BODY4; + }; + + RPlaunchCudaKernel( + (poly_gemm_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_gemm_lambda ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 4ee83f375..f92beaed0 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -90,11 +90,13 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gemm), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - C, A, B, alpha, beta, - ni, nj, nk); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_gemm), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + C, A, B, + alpha, beta, + ni, nj, nk ); } stopTimer(); @@ -117,10 +119,12 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) POLYBENCH_GEMM_BODY4; }; - hipLaunchKernelGGL((poly_gemm_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - ni, nj, poly_gemm_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_gemm_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_gemm_lambda ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp index 21f63e7f2..444af4df7 100644 --- a/src/polybench/POLYBENCH_GEMM-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
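The Lambda_CUDA hunk above hoists the formerly inline device lambda into a named poly_gemm_lambda before passing it to RPlaunchCudaKernel together with the templated poly_gemm_lam kernel. The wrapper-kernel shape this relies on, sketched with assumed types (the Suite's actual poly_gemm_lam is defined earlier in the same file):

    // Generic lambda-invoking kernel over a guarded 2D index space.
    template < size_t j_block_size, size_t i_block_size, typename Lambda >
    __launch_bounds__(j_block_size*i_block_size)
    __global__ void lam_kernel_2d(long ni, long nj, Lambda body)
    {
      long i = blockIdx.y * i_block_size + threadIdx.y;
      long j = blockIdx.x * j_block_size + threadIdx.x;
      if (i < ni && j < nj) {
        body(i, j);  // runs POLYBENCH_GEMM_BODY1..4 for this (i, j)
      }
    }

Naming the lambda keeps the launch call flat and makes its type nameable (via decltype) where the kernel template must be instantiated explicitly.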
// diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index a1d618b5b..a660ec35e 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp index 84fa70002..1aaaab119 100644 --- a/src/polybench/POLYBENCH_GEMM-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM-Sycl.cpp b/src/polybench/POLYBENCH_GEMM-Sycl.cpp new file mode 100644 index 000000000..2f4fc09d3 --- /dev/null +++ b/src/polybench/POLYBENCH_GEMM-Sycl.cpp @@ -0,0 +1,135 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GEMM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_GEMM::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GEMM_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, j_wg_sz)); + + sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < ni && j < nj) { + POLYBENCH_GEMM_BODY1; + POLYBENCH_GEMM_BODY2; + for (Index_type k = 0; k < nk; ++k) { + POLYBENCH_GEMM_BODY3; + } + POLYBENCH_GEMM_BODY4; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GEMM_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0, RAJA::Params<0>>, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>>, + RAJA::statement::For<2, RAJA::seq_exec, + RAJA::statement::Lambda<2, RAJA::Segs<0,1,2>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<3, RAJA::Segs<0,1>, RAJA::Params<0>> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 
0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_param_resource<EXEC_POL>(
+
+        RAJA::make_tuple( RAJA::RangeSegment{0, ni},
+                          RAJA::RangeSegment{0, nj},
+                          RAJA::RangeSegment{0, nk} ),
+        RAJA::tuple<Real_type>{0.0},  // variable for dot
+        res,
+
+        [=] (Real_type& dot) {
+          POLYBENCH_GEMM_BODY1_RAJA;
+        },
+        [=] (Index_type i, Index_type j) {
+          POLYBENCH_GEMM_BODY2_RAJA;
+        },
+        [=] (Index_type i, Index_type j, Index_type k,
+             Real_type& dot) {
+          POLYBENCH_GEMM_BODY3_RAJA;
+        },
+        [=] (Index_type i, Index_type j,
+             Real_type& dot) {
+          POLYBENCH_GEMM_BODY4_RAJA;
+        }
+      );
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n POLYBENCH_GEMM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GEMM, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp
index 6094ce908..48769b42f 100644
--- a/src/polybench/POLYBENCH_GEMM.cpp
+++ b/src/polybench/POLYBENCH_GEMM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,9 +28,9 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params)
   setDefaultProblemSize( ni_default * nj_default );
   setDefaultReps(4);

-  m_ni = std::sqrt( getTargetProblemSize() ) + 1;
+  m_ni = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1;
   m_nj = m_ni;
-  m_nk = nk_default;
+  m_nk = Index_type(double(nk_default)/ni_default*m_ni);

   m_alpha = 0.62;
   m_beta = 1.002;
@@ -40,9 +40,10 @@
   setItsPerRep( m_ni * m_nj );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nk +
-                  (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
+                      1*sizeof(Real_type ) * m_nj * m_nk );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj);
+  setBytesAtomicModifyWrittenPerRep( 0 );

   setFLOPsPerRep((1 + 3 * m_nk) * m_ni*m_nj);
@@ -70,6 +71,9 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }

 POLYBENCH_GEMM::~POLYBENCH_GEMM()
diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp
index 33ea77997..14c590596 100644
--- a/src/polybench/POLYBENCH_GEMM.hpp
+++ b/src/polybench/POLYBENCH_GEMM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
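The RAJA_SYCL variant above threads one Real_type parameter, the running dot product, through four lambdas; the Lambda<n, Segs<...>, Params<0>> statements in EXEC_POL decide which lambda fires at which loop level and what arguments it sees. A minimal host-side analogue using sequential policies (illustrative only; the Suite runs this dispatch on the SYCL backend):

    #include "RAJA/RAJA.hpp"

    void init_rows(double* C, RAJA::Index_type ni, RAJA::Index_type nj)
    {
      using POL = RAJA::KernelPolicy<
        RAJA::statement::For<0, RAJA::seq_exec,
          RAJA::statement::For<1, RAJA::seq_exec,
            RAJA::statement::Lambda<0, RAJA::Params<0>>,                  // reset dot
            RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>>  // consume dot
          >
        >
      >;

      RAJA::kernel_param<POL>(
        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
                         RAJA::RangeSegment{0, nj}),
        RAJA::tuple<double>{0.0},                  // the Params<0> slot
        [=](double& dot) { dot = 0.0; },
        [=](RAJA::Index_type i, RAJA::Index_type j, double& dot) {
          C[i*nj + j] = dot;
        });
    }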
// @@ -99,18 +99,23 @@ class POLYBENCH_GEMM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 1360e93b2..d5119ede5 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -41,10 +41,10 @@ namespace polybench template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1(Real_ptr A, - Real_ptr u1, Real_ptr v1, - Real_ptr u2, Real_ptr v2, - Index_type n) +__global__ void poly_gemver_1(Real_ptr A, + Real_ptr u1, Real_ptr v1, + Real_ptr u2, Real_ptr v2, + Index_type n) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -56,7 +56,7 @@ __global__ void poly_gemmver_1(Real_ptr A, template < size_t j_block_size, size_t i_block_size, typename Lambda > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1_lam(Index_type n, Lambda body) +__global__ void poly_gemver_1_lam(Index_type n, Lambda body) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -68,10 +68,10 @@ __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_2(Real_ptr A, - Real_ptr x, Real_ptr y, - Real_type beta, - Index_type n) +__global__ void poly_gemver_2(Real_ptr A, + Real_ptr x, Real_ptr y, + Real_type beta, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -85,8 +85,8 @@ __global__ void poly_gemmver_2(Real_ptr A, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, - Index_type n) +__global__ void poly_gemver_3(Real_ptr x, Real_ptr z, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -96,10 +96,10 @@ __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_4(Real_ptr A, - Real_ptr x, Real_ptr w, - Real_type alpha, - Index_type n) +__global__ void poly_gemver_4(Real_ptr A, + Real_ptr x, Real_ptr w, + Real_type alpha, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -113,7 +113,7 @@ __global__ void 
poly_gemmver_4(Real_ptr A, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void poly_gemmver_234_lam(Index_type n, Lambda body) +__global__ void poly_gemver_234_lam(Index_type n, Lambda body) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -140,26 +140,28 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) GEMVER_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemmver_1 - <<>>(A, u1, v1, u2, v2, - n); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_gemver_1), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + A, u1, v1, u2, v2, n ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_2<<>>(A, x, y, - beta, - n); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_gemver_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, beta, n ); - poly_gemmver_3<<>>(x, z, - n); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_gemver_3), + grid_size, block_size, + shmem, res.get_stream(), + x, z, n ); - poly_gemmver_4<<>>(A, x, w, - alpha, - n); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_gemver_4), + grid_size, block_size, + shmem, res.get_stream(), + A, x, w, alpha, n ); } stopTimer(); @@ -173,44 +175,56 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) GEMVER_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemmver_1_lam - <<>>(n, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMVER_BODY1; - } - ); - cudaErrchk( cudaGetLastError() ); + auto poly_gemver1_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1; + }; + + RPlaunchCudaKernel( + (poly_gemver_1_lam), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + n, poly_gemver1_lambda ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY2; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY3; - } - POLYBENCH_GEMVER_BODY4; + auto poly_gemver2_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; } - ); - cudaErrchk( cudaGetLastError() ); - - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY5; + POLYBENCH_GEMVER_BODY4; + }; + + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver2_lambda ); + + auto poly_gemver3_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY5; + }; + + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver3_lambda ); + + auto poly_gemver4_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY7; } - ); - cudaErrchk( cudaGetLastError() ); + POLYBENCH_GEMVER_BODY8; + }; - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY6; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY7; - } - POLYBENCH_GEMVER_BODY8; - } - ); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver4_lambda ); } stopTimer(); @@ -248,9 +262,10 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, n}, - RAJA::RangeSegment{0, n}), - 
res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1_RAJA; } diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index ab1416bf0..f51e15d42 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -41,10 +41,10 @@ namespace polybench template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1(Real_ptr A, - Real_ptr u1, Real_ptr v1, - Real_ptr u2, Real_ptr v2, - Index_type n) +__global__ void poly_gemver_1(Real_ptr A, + Real_ptr u1, Real_ptr v1, + Real_ptr u2, Real_ptr v2, + Index_type n) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -56,7 +56,7 @@ __global__ void poly_gemmver_1(Real_ptr A, template < size_t j_block_size, size_t i_block_size, typename Lambda > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1_lam(Index_type n, Lambda body) +__global__ void poly_gemver_1_lam(Index_type n, Lambda body) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -68,10 +68,10 @@ __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_2(Real_ptr A, - Real_ptr x, Real_ptr y, - Real_type beta, - Index_type n) +__global__ void poly_gemver_2(Real_ptr A, + Real_ptr x, Real_ptr y, + Real_type beta, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -85,8 +85,8 @@ __global__ void poly_gemmver_2(Real_ptr A, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, - Index_type n) +__global__ void poly_gemver_3(Real_ptr x, Real_ptr z, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -96,10 +96,10 @@ __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_4(Real_ptr A, - Real_ptr x, Real_ptr w, - Real_type alpha, - Index_type n) +__global__ void poly_gemver_4(Real_ptr A, + Real_ptr x, Real_ptr w, + Real_type alpha, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -113,7 +113,7 @@ __global__ void poly_gemmver_4(Real_ptr A, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void poly_gemmver_234_lam(Index_type n, Lambda body) +__global__ void poly_gemver_234_lam(Index_type n, Lambda body) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -140,27 +140,28 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) GEMVER_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gemmver_1), - dim3(nblocks1), dim3(nthreads_per_block1), shmem, res.get_stream(), - A, u1, v1, u2, v2, n); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_gemver_1), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + A, u1, v1, 
u2, v2, n ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n, block_size); - hipLaunchKernelGGL((poly_gemmver_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, y, beta, n); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, beta, n ); - hipLaunchKernelGGL((poly_gemmver_3), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, z, n); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_3), + grid_size, block_size, + shmem, res.get_stream(), + x, z, n ); - hipLaunchKernelGGL((poly_gemmver_4), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, w, alpha, n); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_4), + grid_size, block_size, + shmem, res.get_stream(), + A, x, w, alpha, n ); } stopTimer(); @@ -174,51 +175,56 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) GEMVER_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto poly_gemmver_1_lambda = [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMVER_BODY1; + auto poly_gemver1_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1; }; - hipLaunchKernelGGL((poly_gemmver_1_lam), - dim3(nblocks1), dim3(nthreads_per_block1), shmem, res.get_stream(), - n, poly_gemmver_1_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_gemver_1_lam), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + n, poly_gemver1_lambda ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - auto poly_gemmver_2_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY2; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY3; - } - POLYBENCH_GEMVER_BODY4; + auto poly_gemver2_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; + } + POLYBENCH_GEMVER_BODY4; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_2_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver2_lambda ); - auto poly_gemmver_3_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY5; + auto poly_gemver3_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY5; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_3_lambda); - hipErrchk( hipGetLastError() ); - - auto poly_gemmver_4_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY6; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY7; - } - POLYBENCH_GEMVER_BODY8; + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver3_lambda ); + + auto poly_gemver4_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY7; + } + POLYBENCH_GEMVER_BODY8; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_4_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver4_lambda ); } stopTimer(); @@ -256,9 +262,10 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, n}, - RAJA::RangeSegment{0, n}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1_RAJA; } diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index a20872867..5af84061d 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index b12be578a..29f487d73 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index 5a4f9199a..c1524a2ef 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-Sycl.cpp b/src/polybench/POLYBENCH_GEMVER-Sycl.cpp new file mode 100644 index 000000000..1242de063 --- /dev/null +++ b/src/polybench/POLYBENCH_GEMVER-Sycl.cpp @@ -0,0 +1,210 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GEMVER.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_GEMVER::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GEMVER_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim1(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(n, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(n, j_wg_sz)); + sycl::range<3> wkgroup_dim1(1, i_wg_sz, j_wg_sz); + + const size_t global_size234 = work_group_size * RAJA_DIVIDE_CEILING_INT(n, work_group_size); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim1), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < n && j < n) { + POLYBENCH_GEMVER_BODY1; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; + } + POLYBENCH_GEMVER_BODY4; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY5; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY7; + } + POLYBENCH_GEMVER_BODY8; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GEMVER_VIEWS_RAJA; + + using EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + >; + + using EXEC_POL24 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> + > + > + >; + + using EXEC_POL3 = RAJA::sycl_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1_RAJA; + } + ); + + RAJA::kernel_param_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + RAJA::tuple{0.0}, + res, + + [=] (Index_type /* i */, Real_type &dot) { + POLYBENCH_GEMVER_BODY2_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + 
POLYBENCH_GEMVER_BODY3_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY4_RAJA; + } + ); + + RAJA::forall ( res, RAJA::RangeSegment{0, n}, + [=] (Index_type i) { + POLYBENCH_GEMVER_BODY5_RAJA; + } + ); + + RAJA::kernel_param_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + RAJA::tuple{0.0}, + res, + + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY6_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_GEMVER_BODY7_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY8_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_GEMVER : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GEMVER, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 7223f85fd..e0db7f361 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,7 +26,7 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) setDefaultProblemSize( n_default * n_default ); setDefaultReps(20); - m_n = std::sqrt( getTargetProblemSize() ) + 1; + m_n = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_alpha = 1.5; m_beta = 1.2; @@ -39,16 +39,24 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) m_n + m_n*m_n ); setKernelsPerRep(4); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_n + + setBytesReadPerRep( 1*sizeof(Real_type ) * m_n * m_n + + 4*sizeof(Real_type ) * m_n + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n + + 1*sizeof(Real_type ) * m_n * m_n + + 2*sizeof(Real_type ) * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n + + 2*sizeof(Real_type ) * m_n + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n ); + 1*sizeof(Real_type ) * m_n * m_n + + 2*sizeof(Real_type ) * m_n ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_n * m_n + + + 1*sizeof(Real_type ) * m_n + + + 1*sizeof(Real_type ) * m_n + + + 1*sizeof(Real_type ) * m_n ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(4 * m_n*m_n + 3 * m_n*m_n + 1 * m_n + @@ -79,6 +87,9 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_GEMVER::~POLYBENCH_GEMVER() diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 07ecae962..ef5ad5a8a 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // @@ -152,18 +152,23 @@ class POLYBENCH_GEMVER : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_n; Real_type m_alpha; diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 3e921c2d2..24ed43947 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -56,11 +56,14 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_gesummv<<>>(x, y, - A, B, - alpha, beta, - N); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (poly_gesummv), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + A, B, + alpha, beta, + N ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 7f4468849..5a156b799 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -56,13 +56,14 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gesummv), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + + RPlaunchHipKernel( (poly_gesummv), + grid_size, block_size, + shmem, res.get_stream(), x, y, - A, B, + A, B, alpha, beta, - N); - hipErrchk( hipGetLastError() ); + N ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp index f9efd4d31..bc59ada36 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
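The GESUMMV launches above size a 1D grid with RAJA_DIVIDE_CEILING_INT(N, block_size), so the last block may be only partly full and the kernel guards on i < N. The helper is plain ceiling division; an equivalent, checkable at compile time:

    #include <cstddef>

    constexpr std::size_t divide_ceiling(std::size_t n, std::size_t b)
    {
      return (n + b - 1) / b;  // assumes n + b - 1 does not overflow
    }

    static_assert(divide_ceiling(1000, 256) == 4, "partial last block");
    static_assert(divide_ceiling(1024, 256) == 4, "exact fit");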
// diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index 86e73b293..8f572c16a 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp index d7ba3fc70..34b70708f 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp b/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp new file mode 100644 index 000000000..83197e995 --- /dev/null +++ b/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GESUMMV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + +template < size_t work_group_size > +void POLYBENCH_GESUMMV::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GESUMMV_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + + if (i < N) { + POLYBENCH_GESUMMV_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_GESUMMV_BODY2; + } + POLYBENCH_GESUMMV_BODY3; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GESUMMV_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Params<0,1>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0,1>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0,1>> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_param_resource( + RAJA::make_tuple( RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N} ), + RAJA::make_tuple(static_cast(0.0), + static_cast(0.0)), + res, + + [=] (Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY1_RAJA; + }, + [=] (Index_type i, Index_type j, 
Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY2_RAJA; + }, + [=] (Index_type i, Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY3_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_GESUMMV : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GESUMMV, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index ea8e2224f..7764c4036 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -26,7 +26,7 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(120); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_alpha = 0.62; m_beta = 1.002; @@ -36,8 +36,10 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setItsPerRep( m_N ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + + 2*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((4 * m_N + 3 ) * m_N ); @@ -59,6 +61,9 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_GESUMMV::~POLYBENCH_GESUMMV() diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 32a1b0eae..3d80155ed 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
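The counter split in POLYBENCH_GESUMMV.cpp above records 2*N*N + N Real_type reads (both matrices plus x) against N writes (y), with (4*N + 3)*N flops. In double precision that puts the kernel near 0.25 flops per byte, firmly bandwidth-bound; a quick check of the numbers (illustrative, N arbitrary):

    #include <iostream>

    int main()
    {
      const double N = 1000.0;
      const double flops = (4.0 * N + 3.0) * N;
      const double bytes = sizeof(double) * (2.0 * N * N + 2.0 * N);
      std::cout << "flops/byte = " << flops / bytes << "\n";  // ~0.25
      return 0;
    }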
// @@ -98,17 +98,22 @@ class POLYBENCH_GESUMMV : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 1b63ee758..70fa00a76 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -100,13 +100,17 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_heat_3D_1 - <<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_heat_3D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); - poly_heat_3D_2 - <<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_heat_3D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); } @@ -124,21 +128,31 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_heat_3D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - POLYBENCH_HEAT_3D_BODY1; - } - ); - cudaErrchk( cudaGetLastError() ); - - poly_heat_3D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - POLYBENCH_HEAT_3D_BODY2; - } - ); - cudaErrchk( cudaGetLastError() ); + auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + POLYBENCH_HEAT_3D_BODY1; + }; + + RPlaunchCudaKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_1_lambda ); + + auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + POLYBENCH_HEAT_3D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_2_lambda ); } @@ -168,19 +182,21 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1_RAJA; } ); - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + 
RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 3a7d7f28e..6a2fb3329 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -100,15 +100,17 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_heat_3D_1), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_heat_3D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); - hipLaunchKernelGGL((poly_heat_3D_2), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_heat_3D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); } @@ -126,26 +128,31 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, Index_type j, + auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, + Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1; }; - auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, Index_type j, Index_type k) { + RPlaunchHipKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_1_lambda ); + + auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { POLYBENCH_HEAT_3D_BODY2; }; - hipLaunchKernelGGL((poly_heat_3D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_heat_3D_1_lambda); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((poly_heat_3D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_heat_3D_2_lambda); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_2_lambda ); } @@ -174,19 +181,21 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1_RAJA; } ); - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp 
index 1b9380a15..ba80f5022 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index 7a70c3f87..1c3999279 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index 25af09240..115661cc7 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp new file mode 100644 index 000000000..27341d447 --- /dev/null +++ b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp @@ -0,0 +1,148 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_HEAT_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define k_wg_sz (32)
+#define j_wg_sz (work_group_size / k_wg_sz)
+#define i_wg_sz (1)
+
+
+template < size_t work_group_size >
+void POLYBENCH_HEAT_3D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_HEAT_3D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        sycl::range<3> global_dim(i_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, i_wg_sz),
+                                  j_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, j_wg_sz),
+                                  k_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, k_wg_sz));
+
+        sycl::range<3> wkgroup_dim(i_wg_sz, j_wg_sz, k_wg_sz);
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = 1 + item.get_global_id(0);
+            Index_type j = 1 + item.get_global_id(1);
+            Index_type k = 1 + item.get_global_id(2);
+
+            if (i < N-1 && j < N-1 && k < N-1) {
+              POLYBENCH_HEAT_3D_BODY1;
+            }
+
+          });
+        });
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = 1 + item.get_global_id(0);
+            Index_type j = 1 + item.get_global_id(1);
+            Index_type k = 1 + item.get_global_id(2);
+
+            if (i < N-1 && j < N-1 && k < N-1) {
+              POLYBENCH_HEAT_3D_BODY2;
+            }
+
+          });
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_HEAT_3D_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_0<i_wg_sz>,
+            RAJA::statement::For<1, RAJA::sycl_global_1<j_wg_sz>,
+              RAJA::statement::For<2, RAJA::sycl_global_2<k_wg_sz>,
+                RAJA::statement::Lambda<0>
+              >
+            >
+          >
+        >
+      >;
+
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        RAJA::kernel_resource<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1}),
+          res,
+          [=] (Index_type i, Index_type j, Index_type k) {
+            POLYBENCH_HEAT_3D_BODY1_RAJA;
+          }
+        );
+
+        RAJA::kernel_resource<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1}),
+          res,
+          [=] (Index_type i, Index_type j, Index_type k) {
+            POLYBENCH_HEAT_3D_BODY2_RAJA;
+          }
+        );
+
+      }
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n POLYBENCH_HEAT_3D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_HEAT_3D, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp
index 4f14b54f9..1e4272534 100644
--- a/src/polybench/POLYBENCH_HEAT_3D.cpp
+++ b/src/polybench/POLYBENCH_HEAT_3D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite
project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -22,12 +22,12 @@ namespace polybench POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) : KernelBase(rajaperf::Polybench_HEAT_3D, params) { - Index_type N_default = 100; + Index_type N_default = 102; setDefaultProblemSize( (N_default-2)*(N_default-2)*(N_default-2) ); setDefaultReps(20); - m_N = std::cbrt( getTargetProblemSize() ) + 1; + m_N = std::cbrt( getTargetProblemSize() ) + 2 + std::cbrt(3)-1; m_tsteps = 20; @@ -35,14 +35,13 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep( m_tsteps * 2 ); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N * m_N - 12*(m_N-2) - 8) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N * m_N - 12*(m_N-2) - 8) ) ); + setBytesReadPerRep((1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8) + + + 1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8)) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 15 * (m_N-2) * (m_N-2) * (m_N-2) + 15 * (m_N-2) * (m_N-2) * (m_N-2) ) ); @@ -70,6 +69,9 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_HEAT_3D::~POLYBENCH_HEAT_3D() diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 03150a267..590d7b326 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
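// Note on the HEAT_3D traffic model above: per time step, each of the two
// kernels writes the (N-2)^3 interior cells and reads every cell of its
// source array except the 12 cube edges and 8 corners, which a 7-point
// stencil over the interior never touches. A self-contained sanity check of
// that count; the helper name is invented for this note and is not Suite code:
constexpr long long heat3dCellsReadPerKernel(long long N)
{
  return N*N*N - 12*(N-2) - 8;   // all cells minus untouched edges/corners
}
static_assert(heat3dCellsReadPerKernel(4) == 64 - 24 - 8, "4^3 cube example");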
// @@ -122,18 +122,23 @@ class POLYBENCH_HEAT_3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index e2c728090..570792bbd 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -63,11 +63,15 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_jacobi_1D_1<<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_jacobi_1D_1), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); - poly_jacobi_1D_2<<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_jacobi_1D_2), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index b0f60255d..d77497459 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
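// The RPlaunchCudaKernel calls above fold the launch-config syntax and the
// error check that previously followed every kernel launch into one helper.
// A minimal sketch of what such a variadic wrapper can look like, assuming
// this shape only for illustration (the Suite's actual definition is not
// shown in this diff):
template < typename Kernel, typename... Args >
void sketch_launch_cuda_kernel(Kernel kernel, dim3 grid, dim3 block,
                               size_t shmem, cudaStream_t stream, Args... args)
{
  kernel<<<grid, block, shmem, stream>>>(args...);
  cudaErrchk( cudaGetLastError() );  // check formerly repeated at each call site
}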
// @@ -63,13 +63,15 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_jacobi_1D_1), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_jacobi_1D_1), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); - hipLaunchKernelGGL((poly_jacobi_1D_2), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_jacobi_1D_2), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index 42ae4a0d5..0c7cbae57 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index 39a2423df..35089be71 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index 20c8c9b73..5f3549b41 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -25,12 +25,14 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR POLYBENCH_JACOBI_1D_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto poly_jacobi1d_lam1 = [=] (Index_type i) { POLYBENCH_JACOBI_1D_BODY1; }; auto poly_jacobi1d_lam2 = [=] (Index_type i) { POLYBENCH_JACOBI_1D_BODY2; }; +#endif switch ( vid ) { diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp new file mode 100644 index 000000000..8a13f6567 --- /dev/null +++ b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_JACOBI_1D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+template < size_t work_group_size >
+void POLYBENCH_JACOBI_1D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_JACOBI_1D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size);
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                         [=] (sycl::nd_item<1> item) {
+
+            Index_type i = item.get_global_id(0);
+            if (i > 0 && i < N-1) {
+              POLYBENCH_JACOBI_1D_BODY1;
+            }
+
+          });
+        });
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                         [=] (sycl::nd_item<1> item) {
+
+            Index_type i = item.get_global_id(0);
+            if (i > 0 && i < N-1) {
+              POLYBENCH_JACOBI_1D_BODY2;
+            }
+
+          });
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    using EXEC_POL = RAJA::sycl_exec<work_group_size, true /*async*/>;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        RAJA::forall<EXEC_POL>( res, RAJA::RangeSegment{1, N-1},
+          [=] (Index_type i) {
+            POLYBENCH_JACOBI_1D_BODY1;
+        });
+
+        RAJA::forall<EXEC_POL>( res, RAJA::RangeSegment{1, N-1},
+          [=] (Index_type i) {
+            POLYBENCH_JACOBI_1D_BODY2;
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else {
+     getCout() << "\n  POLYBENCH_JACOBI_1D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_JACOBI_1D, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp
index b2beb0dfd..59a2520b5 100644
--- a/src/polybench/POLYBENCH_JACOBI_1D.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
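// Both SYCL variants above round the launch up to a whole number of
// work-groups and rely on the in-kernel (i > 0 && i < N-1) guard to mask
// the padding plus the two boundary cells. The rounding itself, as a
// standalone sketch with names local to this note:
constexpr unsigned long long round_up_to_work_groups(unsigned long long n,
                                                     unsigned long long wg_size)
{
  return wg_size * ((n + wg_size - 1) / wg_size);  // same result as RAJA_DIVIDE_CEILING_INT
}
static_assert(round_up_to_work_groups(1000002, 256) == 1000192,
              "padded global size for the default JACOBI_1D extent");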
// @@ -21,12 +21,12 @@ namespace polybench POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_1D, params) { - Index_type N_default = 1000000; + Index_type N_default = 1000002; setDefaultProblemSize( N_default-2 ); setDefaultReps(100); - m_N = getTargetProblemSize(); + m_N = getTargetProblemSize() + 2; m_tsteps = 16; @@ -34,14 +34,13 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep(m_tsteps * 2); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - m_N + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - m_N ) ); + setBytesReadPerRep((1*sizeof(Real_type ) * m_N + + + 1*sizeof(Real_type ) * m_N) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 3 * (m_N-2) + 3 * (m_N-2) ) ); @@ -67,6 +66,9 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_JACOBI_1D::~POLYBENCH_JACOBI_1D() diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 5c84e0682..f128e5947 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -68,17 +68,23 @@ class POLYBENCH_JACOBI_1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 1e8a824bb..2620d0654 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
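// Worked instance of the JACOBI_1D byte counts above, illustration only and
// assuming Real_type is the default 8-byte double: per time step each of the
// two kernels reads all m_N entries of its source array and writes the
// m_N-2 interior entries.
constexpr long long jacobi1dBytesPerRep(long long n, long long tsteps)
{
  return tsteps * (2*n + 2*(n-2)) * 8;  // two kernels: reads + interior writes
}
static_assert(jacobi1dBytesPerRep(10, 1) == 288, "(20 reads + 16 writes) * 8B");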
// @@ -96,13 +96,17 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_jacobi_2D_1 - <<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_jacobi_2D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); - poly_jacobi_2D_2 - <<>>(A, B, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( + (poly_jacobi_2D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); } @@ -120,21 +124,29 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_jacobi_2D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY1; - } - ); - cudaErrchk( cudaGetLastError() ); - - poly_jacobi_2D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY2; - } - ); - cudaErrchk( cudaGetLastError() ); + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY1; + }; + + RPlaunchCudaKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_1_lambda ); + + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_2_lambda ); } @@ -161,17 +173,19 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1_RAJA; } ); - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index 6590a8173..8aac79440 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -96,15 +96,17 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_jacobi_2D_1), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_jacobi_2D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); - hipLaunchKernelGGL((poly_jacobi_2D_2), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( + (poly_jacobi_2D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); } @@ -122,25 +124,29 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto poly_jacobi_2D_1_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY1; - }; - - hipLaunchKernelGGL((poly_jacobi_2D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_jacobi_2D_1_lambda); - hipErrchk( hipGetLastError() ); - - auto poly_jacobi_2D_2_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY2; - }; - - hipLaunchKernelGGL((poly_jacobi_2D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_jacobi_2D_2_lambda); - hipErrchk( hipGetLastError() ); + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY1; + }; + + RPlaunchHipKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_1_lambda ); + + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY2; + }; + + RPlaunchHipKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_2_lambda ); } @@ -167,17 +173,19 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1_RAJA; } ); - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index 51f3cb146..d3d7b0471 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp
index 97806cfac..e711660cc 100644
--- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp
index 107dd4ec4..18cc343cd 100644
--- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp
new file mode 100644
index 000000000..ff6dab08b
--- /dev/null
+++ b/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp
@@ -0,0 +1,141 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_JACOBI_2D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define j_wg_sz (32)
+#define i_wg_sz (work_group_size / j_wg_sz)
+
+
+template < size_t work_group_size >
+void POLYBENCH_JACOBI_2D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_JACOBI_2D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        sycl::range<3> global_dim(1,
+                                  i_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, i_wg_sz),
+                                  j_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, j_wg_sz));
+
+        sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz);
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = item.get_global_id(1) + 1;
+            Index_type j = item.get_global_id(2) + 1;
+
+            if ( i < N-1 && j < N-1 ) {
+              POLYBENCH_JACOBI_2D_BODY1;
+            }
+
+          });
+        });
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = item.get_global_id(1) + 1;
+            Index_type j = item.get_global_id(2) + 1;
+
+            if ( i < N-1 && j < N-1 ) {
+              POLYBENCH_JACOBI_2D_BODY2;
+            }
+
+          });
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_JACOBI_2D_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_1<i_wg_sz>,
+            RAJA::statement::For<1, RAJA::sycl_global_2<j_wg_sz>,
+ RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY1_RAJA; + } + ); + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY2_RAJA; + } + ); + + } + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_JACOBI_2D, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 9fe51e5c1..a3b077a1f 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -21,12 +21,12 @@ namespace polybench POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_2D, params) { - Index_type N_default = 1000; + Index_type N_default = 1002; - setDefaultProblemSize( N_default * N_default ); + setDefaultProblemSize( (N_default-2)*(N_default-2) ); setDefaultReps(50); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + 2 + std::sqrt(2)-1; m_tsteps = 40; @@ -34,14 +34,13 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setItsPerRep( m_tsteps * (2 * (m_N-2) * (m_N-2)) ); setKernelsPerRep(2); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N - 4) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N - 4) ) ); + setBytesReadPerRep((1*sizeof(Real_type ) * (m_N * m_N - 4) + + + 1*sizeof(Real_type ) * (m_N * m_N - 4)) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 5 * (m_N-2)*(m_N-2) + 5 * (m_N -2)*(m_N-2) ) ); @@ -69,6 +68,9 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_JACOBI_2D::~POLYBENCH_JACOBI_2D() diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index fe77836cb..df170306e 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
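// The JACOBI_2D policy above mirrors its Base_SYCL launch: SYCL range<3>
// dimensions run slowest to fastest, so dimension 0 is pinned to extent 1,
// the outer i loop (For<0>) maps to global dimension 1, and the inner j
// loop (For<1>) to the fastest dimension 2. The same mapping written
// directly against an nd_item, illustration only (assumes a SYCL toolchain):
inline void sketch_jacobi2d_index_map(sycl::nd_item<3> item,
                                      long& i, long& j)
{
  i = item.get_global_id(1) + 1;  // For<0> <-> RAJA::sycl_global_1<i_wg_sz>
  j = item.get_global_id(2) + 1;  // For<1> <-> RAJA::sycl_global_2<j_wg_sz>
}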
// See the RAJAPerf/LICENSE file for details. // @@ -87,18 +87,23 @@ class POLYBENCH_JACOBI_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 871fef013..83ea50512 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -69,13 +69,17 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - constexpr size_t shmem = 0; + constexpr size_t shmem = 0; - poly_mvt_1<<>>(A, x1, y1, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_mvt_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x1, y1, N ); - poly_mvt_2<<>>(A, x2, y2, N); - cudaErrchk( cudaGetLastError() ); + RPlaunchCudaKernel( (poly_mvt_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x2, y2, N ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 32b1b5161..636ad234c 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -69,17 +69,17 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - constexpr size_t shmem = 0; + constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_mvt_1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x1, y1, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_mvt_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x1, y1, N ); - hipLaunchKernelGGL((poly_mvt_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x2, y2, N); - hipErrchk( hipGetLastError() ); + RPlaunchHipKernel( (poly_mvt_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x2, y2, N ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 159a86274..bb9c8f221 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index c9ff17751..5b278628d 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index 9d63fd997..efa2ec452 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Sycl.cpp b/src/polybench/POLYBENCH_MVT-Sycl.cpp new file mode 100644 index 000000000..c0a3879ad --- /dev/null +++ b/src/polybench/POLYBENCH_MVT-Sycl.cpp @@ -0,0 +1,153 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_MVT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+
+template < size_t work_group_size >
+void POLYBENCH_MVT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_MVT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+
+          if (i < N) {
+            POLYBENCH_MVT_BODY1;
+            for (Index_type j = 0; j < N; ++j ) {
+              POLYBENCH_MVT_BODY2;
+            }
+            POLYBENCH_MVT_BODY3;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+
+          if (i < N) {
+            POLYBENCH_MVT_BODY4;
+            for (Index_type j = 0; j < N; ++j ) {
+              POLYBENCH_MVT_BODY5;
+            }
+            POLYBENCH_MVT_BODY6;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_MVT_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_0<work_group_size>,
+            RAJA::statement::Lambda<0, RAJA::Params<0>>,
+            RAJA::statement::For<1, RAJA::seq_exec,
+              RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >,
+            RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>>
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::region<RAJA::seq_region>( [=]() {
+
+        RAJA::kernel_param_resource<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, N},
+                           RAJA::RangeSegment{0, N}),
+          RAJA::tuple<Real_type>{0.0},
+          res,
+
+          [=] (Real_type &dot) {
+            POLYBENCH_MVT_BODY1_RAJA;
+          },
+          [=] (Index_type i, Index_type j, Real_type &dot) {
+            POLYBENCH_MVT_BODY2_RAJA;
+          },
+          [=] (Index_type i, Real_type &dot) {
+            POLYBENCH_MVT_BODY3_RAJA;
+          }
+
+        );
+
+        RAJA::kernel_param_resource<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, N},
+                           RAJA::RangeSegment{0, N}),
+          RAJA::tuple<Real_type>{0.0},
+          res,
+
+          [=] (Real_type &dot) {
+            POLYBENCH_MVT_BODY4_RAJA;
+          },
+          [=] (Index_type i, Index_type j, Real_type &dot) {
+            POLYBENCH_MVT_BODY5_RAJA;
+          },
+          [=] (Index_type i, Real_type &dot) {
+            POLYBENCH_MVT_BODY6_RAJA;
+          }
+
+        );
+
+      }); // end sequential region (for single-source code)
+
+    }
+    stopTimer();
+
+  } else {
+     getCout() << "\n  POLYBENCH_MVT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_MVT, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
+
diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp
index c0a5b8bb9..e8da53a0c 100644
--- a/src/polybench/POLYBENCH_MVT.cpp
+++ b/src/polybench/POLYBENCH_MVT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
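// The RAJA_SYCL variant above splits each matrix-vector product into three
// lambdas that share the kernel parameter dot: Lambda<0> initializes it,
// Lambda<1> accumulates inside the sequential j loop, and Lambda<2> writes
// the result back. The same structure in plain loop form, a sketch assuming
// the usual PolyBench MVT definitions rather than quoting the BODY macros:
inline void sketch_mvt_row(const double* A, const double* y1, double* x1,
                           long N, long i)
{
  double dot = 0.0;                 // role of Lambda<0>
  for (long j = 0; j < N; ++j) {
    dot += A[i*N + j] * y1[j];      // role of Lambda<1>
  }
  x1[i] += dot;                     // role of Lambda<2>
}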
// @@ -26,17 +26,22 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(100); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); setItsPerRep( 2 * m_N ); setKernelsPerRep(2); - setBytesPerRep( (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N + + + 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N + + + 1*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); @@ -62,6 +67,9 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_MVT::~POLYBENCH_MVT() diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index 518d75dd8..a54181833 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -112,17 +112,22 @@ class POLYBENCH_MVT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Real_ptr m_x1; diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index d545c0b93..3d2588378 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -9,7 +9,7 @@ */ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
@@ -22,13 +22,20 @@
 
 #include "RAJA/config.hpp"
 
 #include "camp/number.hpp"
+#include "camp/list.hpp"
+#include <type_traits>
 
 #include <string>
 
 #cmakedefine RAJA_PERFSUITE_ENABLE_MPI
 #cmakedefine RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN
 
+#if defined(RAJA_ENABLE_CUDA)
+#define RAJA_PERFSUITE_TUNING_CUDA_ARCH @RAJA_PERFSUITE_TUNING_CUDA_ARCH@
+#endif
+
 #if defined(RAJA_ENABLE_HIP)
+#define RAJA_PERFSUITE_TUNING_HIP_ARCH @RAJA_PERFSUITE_TUNING_HIP_ARCH@
 #include <hip/hip_version.h>
 #if (HIP_VERSION_MAJOR > 5) || \
     (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2)
@@ -42,8 +49,23 @@
 #include <adiak.hpp>
 #endif
 
+// Squash compiler warnings about unused variables
+template < typename ... Ts >
+inline void RAJAPERF_UNUSED_VAR(Ts&&...) { }
+
+// Squash compiler warnings about unused arguments
+#define RAJAPERF_UNUSED_ARG(...)
+
 namespace rajaperf {
 
+namespace integer {
+
+// helper alias to convert comma separated integer literals into list
+template < size_t... Is >
+using list_type = camp::list< camp::integral_constant<size_t, Is>... >;
+
+} // closing brace for integer namespace
+
 struct configuration {
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
@@ -85,18 +107,25 @@
 const adiak::version adiak_compiler_version = std::string("@CMAKE_CXX_COMPILER_VERSION@");
 const adiak::version adiak_cuda_compiler_version = std::string("@CMAKE_CUDA_COMPILER_VERSION@");
 constexpr static const char* adiak_gpu_targets = "@GPU_TARGETS@";
 constexpr static const char* adiak_cmake_hip_architectures = "@CMAKE_HIP_ARCHIECTURES@";
-const std::vector<int> adiak_gpu_targets_block_sizes = {@RAJA_PERFSUITE_GPU_BLOCKSIZES@};
+constexpr static const char* adiak_tuning_cuda_arch = "@RAJA_PERFSUITE_TUNING_CUDA_ARCH@";
+constexpr static const char* adiak_tuning_hip_arch = "@RAJA_PERFSUITE_TUNING_HIP_ARCH@";
+const std::vector<int> adiak_gpu_block_sizes = {@RAJA_PERFSUITE_GPU_BLOCKSIZES@};
+const std::vector<int> adiak_atomic_replications = {@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@};
+const std::vector<int> adiak_gpu_items_per_thread = {@RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD@};
 const std::vector<std::string> adiak_raja_hipcc_flags = str_to_list(std::string("@RAJA_HIPCC_FLAGS@"));
 const adiak::catstring adiak_mpi_cxx_compiler = std::string("@MPI_CXX_COMPILER@");
 const adiak::catstring adiak_systype_build = std::string("@RAJAPERF_BUILD_SYSTYPE@");
 const adiak::catstring adiak_machine_build = std::string("@RAJAPERF_BUILD_HOST@");
 #endif
 
-// helper alias to void trailing comma in no-arg case
-template < size_t... Is >
-using i_seq = camp::int_seq<size_t, Is...>;
 // List of GPU block sizes
-using gpu_block_sizes = i_seq<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>;
+using gpu_block_sizes = integer::list_type<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>;
+
+// List of GPU atomic replications
+using atomic_replications = integer::list_type<@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@>;
+
+// List of GPU items per thread
+using gpu_items_per_thread = integer::list_type<@RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD@>;
 
 // Name of user who ran code
 std::string user_run;
@@ -110,13 +139,27 @@
 std::string machine_run;
 
 };
 
-} // closing brace for rajaperf namespace
+#if __cplusplus < 201703L
+// Implement std::conjunction from https://en.cppreference.com/w/cpp/types/conjunction
+template <class...> struct conjunction : std::true_type {};
+template <class B1> struct conjunction<B1> : B1 {};
+template <class B1, class... Bn>
+struct conjunction<B1, Bn...>
+    : std::conditional_t<bool(B1::value), conjunction<Bn...>, B1> {};
+#else
+using std::conjunction;
+#endif
 
-// Squash compiler warnings about unused variables
-template < typename ... Ts >
-inline void RAJAPERF_UNUSED_VAR(Ts&&...) { }
+//compile time loop over an integer sequence
+//this allows for creating a loop over a compile time constant variable
+template < typename Func, typename... Ts >
+inline void seq_for(camp::list<Ts...> const&, Func&& func)
+{
+  // braced init lists are evaluated in order
+  int seq_unused_array[] = {0, (func(Ts{}), 0)...};
+  RAJAPERF_UNUSED_VAR(seq_unused_array);
+}
 
-// Squash compiler warnings about unused arguments
-#define RAJAPERF_UNUSED_ARG(...)
+} // closing brace for rajaperf namespace
 
 #endif // closing endif for header file include guard
diff --git a/src/stream-kokkos/ADD-Kokkos.cpp b/src/stream-kokkos/ADD-Kokkos.cpp
index 51e5bdf81..1a4280cc9 100644
--- a/src/stream-kokkos/ADD-Kokkos.cpp
+++ b/src/stream-kokkos/ADD-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream-kokkos/CMakeLists.txt b/src/stream-kokkos/CMakeLists.txt
index 4cd38bdf5..6ba8dbbb6 100644
--- a/src/stream-kokkos/CMakeLists.txt
+++ b/src/stream-kokkos/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
diff --git a/src/stream-kokkos/COPY-Kokkos.cpp b/src/stream-kokkos/COPY-Kokkos.cpp
index d363cd944..3312a57fa 100644
--- a/src/stream-kokkos/COPY-Kokkos.cpp
+++ b/src/stream-kokkos/COPY-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream-kokkos/DOT-Kokkos.cpp b/src/stream-kokkos/DOT-Kokkos.cpp
index ca6b0e304..ff1124068 100644
--- a/src/stream-kokkos/DOT-Kokkos.cpp
+++ b/src/stream-kokkos/DOT-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream-kokkos/MUL-Kokkos.cpp b/src/stream-kokkos/MUL-Kokkos.cpp
index aa53b0d66..e1f17be92 100644
--- a/src/stream-kokkos/MUL-Kokkos.cpp
+++ b/src/stream-kokkos/MUL-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
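// The seq_for helper added above expands a functor over every
// integral_constant in a camp::list, which is how the Suite can loop over
// its compile-time tuning lists (block sizes, work-group sizes).
// Hypothetical usage, names local to this note:
inline void sketch_seq_for_usage()
{
  using sizes = rajaperf::integer::list_type<128, 256, 512>;
  rajaperf::seq_for(sizes{}, [&](auto block_size) {
    // block_size.value is a compile-time constant: 128, then 256, then 512
    static_assert(decltype(block_size)::value > 0, "sizes are positive");
  });
}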
// diff --git a/src/stream-kokkos/TRIAD-Kokkos.cpp b/src/stream-kokkos/TRIAD-Kokkos.cpp index 3b897a46a..2d5465939 100644 --- a/src/stream-kokkos/TRIAD-Kokkos.cpp +++ b/src/stream-kokkos/TRIAD-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index e8e095665..7b79f1b10 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -52,9 +52,11 @@ void ADD::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - add<<>>( c, a, b, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (add), + grid_size, block_size, + shmem, res.get_stream(), + c, a, b, iend ); } stopTimer(); @@ -64,13 +66,18 @@ void ADD::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto add_lambda = [=] __device__ (Index_type i) { + ADD_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - ADD_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, add_lambda ); } stopTimer(); diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 50ab42466..fe470d391 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -51,9 +51,11 @@ void ADD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((add), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), c, a, b, - iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (add), + grid_size, block_size, + shmem, res.get_stream(), + c, a, b, iend ); } stopTimer(); @@ -69,9 +71,12 @@ void ADD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, add_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, add_lambda ); } stopTimer(); diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index ddd24eb30..22f850da3 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index 6e4302446..c1a1480cf 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index a07fe24d6..516fe61a6 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Sycl.cpp b/src/stream/ADD-Sycl.cpp new file mode 100644 index 000000000..483672cb1 --- /dev/null +++ b/src/stream/ADD-Sycl.cpp @@ -0,0 +1,80 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ADD.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template < size_t work_group_size >
+void ADD::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  ADD_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            ADD_BODY
+          }
+
+        });
+      });
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        ADD_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  ADD : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ADD, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp
index 02cf25107..510f39bb8 100644
--- a/src/stream/ADD.cpp
+++ b/src/stream/ADD.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
   setUsesFeature(Forall);
@@ -53,6 +54,9 @@
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
 
diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp
index 49e09a602..7b96dbf9e 100644
--- a/src/stream/ADD.hpp
+++ b/src/stream/ADD.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
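// With reads and writes now reported separately, ADD's traffic is the
// classic STREAM count: two reads plus one write per element. Turning that
// into achieved bandwidth, a sketch with invented names (assumes Real_type
// is 8-byte double):
inline double sketch_add_bandwidth_GBs(long long problem_size, double seconds)
{
  const double bytes = 3.0 * 8.0 * static_cast<double>(problem_size);
  return bytes / seconds / 1.0e9;
}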
// @@ -52,18 +52,24 @@ class ADD : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index 03351ff5d..bb4de4ce5 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -14,29 +14,34 @@ blt_add_library( ADD-Cuda.cpp ADD-OMP.cpp ADD-OMPTarget.cpp + ADD-Sycl.cpp COPY.cpp COPY-Seq.cpp COPY-Hip.cpp COPY-Cuda.cpp COPY-OMP.cpp COPY-OMPTarget.cpp + COPY-Sycl.cpp DOT.cpp DOT-Seq.cpp DOT-Hip.cpp DOT-Cuda.cpp DOT-OMP.cpp DOT-OMPTarget.cpp + DOT-Sycl.cpp MUL.cpp MUL-Seq.cpp MUL-Hip.cpp MUL-Cuda.cpp MUL-OMP.cpp MUL-OMPTarget.cpp + MUL-Sycl.cpp TRIAD.cpp TRIAD-Seq.cpp TRIAD-Hip.cpp TRIAD-Cuda.cpp TRIAD-OMPTarget.cpp TRIAD-OMP.cpp + TRIAD-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 3bea59764..a45d45a16 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// @@ -51,9 +51,11 @@ void COPY::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - copy<<>>( c, a, - iend ); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (copy), + grid_size, block_size, + shmem, res.get_stream(), + c, a, iend ); } stopTimer(); @@ -63,13 +65,18 @@ void COPY::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto copy_lambda = [=] __device__ (Index_type i) { + COPY_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - COPY_BODY; - }); - cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy_lambda ); } stopTimer(); diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 305892fdb..f5e19fac0 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -51,9 +51,11 @@ void COPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((copy), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - c, a, iend ); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (copy), + grid_size, block_size, + shmem, res.get_stream(), + c, a, iend ); } stopTimer(); @@ -69,9 +71,12 @@ void COPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy_lambda ); } stopTimer(); diff --git a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index d9a0aa2a9..1718ff5ac 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index a9250c4cd..f1dd5017d 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp
index 311f9754d..25b897707 100644
--- a/src/stream/COPY-Seq.cpp
+++ b/src/stream/COPY-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/COPY-Sycl.cpp b/src/stream/COPY-Sycl.cpp
new file mode 100644
index 000000000..4f1049a6e
--- /dev/null
+++ b/src/stream/COPY-Sycl.cpp
@@ -0,0 +1,81 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "COPY.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template <size_t work_group_size >
+void COPY::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  COPY_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            COPY_BODY
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false /*async*/> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        COPY_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  COPY : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(COPY, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
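Note: the Base_SYCL variant above sizes its nd_range by rounding the loop bound up to a whole number of work-groups, which is why the kernel body guards with i < iend. A plain-C++ sketch of that index mapping (the names and values here are illustrative only, not part of the suite):

    #include <cassert>
    #include <cstddef>

    int main()
    {
      const std::size_t iend = 1000;            // loop bound
      const std::size_t work_group_size = 256;  // SYCL work-group size
      // Round up so global_size is a whole number of work-groups.
      const std::size_t global_size =
          work_group_size * ((iend + work_group_size - 1) / work_group_size);
      assert(global_size == 1024);

      std::size_t executed = 0;
      for (std::size_t i = 0; i < global_size; ++i) { // each i is one work-item
        if (i < iend) {  // the guard in the kernel masks the padding items
          ++executed;
        }
      }
      assert(executed == iend);
      return 0;
    }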
diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp
index c92018c63..9cfce257a 100644
--- a/src/stream/COPY.cpp
+++ b/src/stream/COPY.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@ COPY::COPY(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
   setUsesFeature( Forall );
@@ -53,6 +54,9 @@ COPY::COPY(const RunParams& params)
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp
index 0544e0d2f..991406624 100644
--- a/src/stream/COPY.hpp
+++ b/src/stream/COPY.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,18 +51,24 @@ class COPY : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_a;
   Real_ptr m_c;
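Note: the COPY.cpp hunk above splits the old combined setBytesPerRep figure into separate read/write/atomic-modify counts; for COPY each repetition streams one read and one write of sizeof(Real_type) per element. A quick check that the split figures sum to the old combined figure (assuming the suite's default 8-byte Real_type):

    #include <cassert>
    #include <cstddef>

    int main()
    {
      using Real_type = double;        // assumed 8-byte suite default
      const std::size_t n = 1000000;   // stands in for getActualProblemSize()
      const std::size_t bytes_read    = 1 * sizeof(Real_type) * n; // c[i] = a[i] reads a
      const std::size_t bytes_written = 1 * sizeof(Real_type) * n; // ...and writes c
      assert(bytes_read == 8000000 && bytes_written == 8000000);
      // Old combined figure equals the new split figures summed.
      assert(bytes_read + bytes_written == (1 + 1) * sizeof(Real_type) * n);
      return 0;
    }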
diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp
index fbecc979f..031355a3e 100644
--- a/src/stream/DOT-Cuda.cpp
+++ b/src/stream/DOT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -15,6 +15,9 @@
 #include "common/CudaDataUtils.hpp"
 
 #include <iostream>
+#include <algorithm>
+#include <type_traits>
+#include <utility>
 
 
 namespace rajaperf
@@ -45,24 +48,16 @@ __global__ void dot(Real_ptr a, Real_ptr b,
     __syncthreads();
   }
 
-#if 1 // serialized access to shared data;
   if ( threadIdx.x == 0 ) {
     RAJA::atomicAdd<RAJA::cuda_atomic>( dprod, pdot[ 0 ] );
   }
-#else // this doesn't work due to data races
-  if ( threadIdx.x == 0 ) {
-    *dprod += pdot[ 0 ];
-  }
-#endif
-
 }
 
-template < size_t block_size >
-void DOT::runCudaVariantImpl(VariantID vid)
+template < size_t block_size, typename MappingHelper >
+void DOT::runCudaVariantBase(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
-  const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
 
   auto res{getCudaResource()};
@@ -71,40 +66,65 @@ void DOT::runCudaVariantImpl(VariantID vid)
 
   if ( vid == Base_CUDA ) {
 
-    Real_ptr dprod;
-    allocData(DataSpace::CudaDevice, dprod, 1);
+    RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1, 1);
+
+    constexpr size_t shmem = sizeof(Real_type)*block_size;
+    const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS(
+        MappingHelper, (dot<block_size>), block_size, shmem);
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      cudaErrchk( cudaMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type),
-                                   cudaMemcpyHostToDevice, res.get_stream() ) );
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1, 1);
+
+      const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = std::min(normal_grid_size, max_grid_size);
 
-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = sizeof(Real_type)*block_size;
-      dot<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(
-          a, b, dprod, m_dot_init, iend );
-      cudaErrchk( cudaGetLastError() );
+      RPlaunchCudaKernel( (dot<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          a, b, dprod, m_dot_init, iend );
 
-      Real_type lprod;
-      cudaErrchk( cudaMemcpyAsync( &lprod, dprod, sizeof(Real_type),
-                                   cudaMemcpyDeviceToHost, res.get_stream() ) );
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_dot += lprod;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(dprod, hdprod, 1, 1);
+      m_dot += hdprod[0];
 
     }
     stopTimer();
 
-    deallocData(DataSpace::CudaDevice, dprod);
+    RAJAPERF_CUDA_REDUCER_TEARDOWN(dprod, hdprod);
+
+  } else {
+    getCout() << "\n  DOT : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+void DOT::runCudaVariantRAJA(VariantID vid)
+{
+  using reduction_policy = std::conditional_t<AlgorithmHelper::atomic, RAJA::cuda_reduce_atomic, RAJA::cuda_reduce>;
+
+  using exec_policy = std::conditional_t<MappingHelper::direct, RAJA::cuda_exec<block_size, true /*async*/>,
+      RAJA::cuda_exec_occ_calc<block_size, true /*async*/>>;
 
-  } else if ( vid == RAJA_CUDA ) {
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  DOT_DATA_SETUP;
+
+  if ( vid == RAJA_CUDA ) {
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::ReduceSum<RAJA::cuda_reduce, Real_type> dot(m_dot_init);
+      RAJA::ReduceSum<reduction_policy, Real_type> dot(m_dot_init);
 
-      RAJA::forall< RAJA::cuda_exec<block_size, true /*async*/> >( res,
+      RAJA::forall<exec_policy>( res,
         RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
         DOT_BODY;
       });
@@ -119,7 +139,162 @@ void DOT::runCudaVariantImpl(VariantID vid)
   }
 }
 
-RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Cuda)
+template < size_t block_size, typename MappingHelper >
+void DOT::runCudaVariantRAJANewReduce(VariantID vid)
+{
+  using exec_policy = std::conditional_t<MappingHelper::direct, RAJA::cuda_exec<block_size, true /*async*/>,
+      RAJA::cuda_exec_occ_calc<block_size, true /*async*/>>;
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  DOT_DATA_SETUP;
+
+  if ( vid == RAJA_CUDA ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tdot = m_dot_init;
+
+      RAJA::forall<exec_policy>( res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+        [=] __device__ (Index_type i, Real_type& dot) {
+          DOT_BODY;
+        }
+      );
+
+      m_dot += static_cast<Real_type>(tdot);
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  DOT : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+void DOT::runCudaVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+      if (run_params.numValidGPUBlockSize() == 0u ||
+          run_params.validGPUBlockSize(block_size)) {
+
+        seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) {
+
+          if ( vid == Base_CUDA ) {
+
+            if (tune_idx == t) {
+
+              setBlockSize(block_size);
+              runCudaVariantBase<block_size, decltype(mapping_helper)>(vid);
+
+            }
+
+            t += 1;
+
+          } else if ( vid == RAJA_CUDA ) {
+
+            seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {
+
+              if (tune_idx == t) {
+
+                setBlockSize(block_size);
+                runCudaVariantRAJA<block_size, decltype(algorithm_helper), decltype(mapping_helper)>(vid);
+
+              }
+
+              t += 1;
+
+            });
+
+            if (tune_idx == t) {
+
+              setBlockSize(block_size);
+              runCudaVariantRAJANewReduce<block_size, decltype(mapping_helper)>(vid);
+
+            }
+
+            t += 1;
+
+          }
+
+        });
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  DOT : Unknown Cuda variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void DOT::setCudaTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+      if (run_params.numValidGPUBlockSize() == 0u ||
+          run_params.validGPUBlockSize(block_size)) {
+
+        seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) {
+
+          if ( vid == Base_CUDA ) {
+
+            auto algorithm_helper = gpu_algorithm::block_atomic_helper{};
+
+            addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                      decltype(mapping_helper)::get_name()+"_"+
+                                      std::to_string(block_size));
+            RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning
+
+          } else if ( vid == RAJA_CUDA ) {
+
+            seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {
+
+              addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                        decltype(mapping_helper)::get_name()+"_"+
+                                        std::to_string(block_size));
+
+            });
+
+            auto algorithm_helper = gpu_algorithm::block_device_helper{};
+
+            addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                      decltype(mapping_helper)::get_name()+"_"+
+                                      "new_"+std::to_string(block_size));
+            RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning
+
+          }
+
+        });
+
+      }
+
+    });
+
+  }
+
+}
 
 } // end namespace stream
 } // end namespace rajaperf
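Note: runCudaVariant above linearizes the tuning space: for each valid block size it visits each mapping helper, and for RAJA_CUDA each algorithm helper plus the trailing "new" reducer, bumping a counter t and running only the combination where t equals tune_idx; setCudaTuningDefinitions must walk the identical order so tuning names and indices agree. A stripped-down sketch of that enumeration scheme (plain C++ with illustrative names, not the suite's helpers):

    #include <cstddef>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main()
    {
      const std::vector<int> block_sizes = {128, 256, 512};
      const std::vector<std::string> mappings = {"direct", "occgs"};
      const std::vector<std::string> algorithms = {"atomic", "device"};
      const std::size_t tune_idx = 5; // runtime selection

      std::size_t t = 0;
      for (int bs : block_sizes) {
        for (const auto& map : mappings) {
          for (const auto& alg : algorithms) {
            if (t == tune_idx) {
              std::printf("run %s_%s_%d\n", alg.c_str(), map.c_str(), bs);
            }
            t += 1; // every combination advances the index, run or not
          }
        }
      }
      return 0;
    }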
diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp
index 7bd1ef277..0c3c914a9 100644
--- a/src/stream/DOT-Hip.cpp
+++ b/src/stream/DOT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -15,6 +15,9 @@
 #include "common/HipDataUtils.hpp"
 
 #include <iostream>
+#include <algorithm>
+#include <type_traits>
+#include <utility>
 
 
 namespace rajaperf
@@ -45,25 +48,16 @@ __global__ void dot(Real_ptr a, Real_ptr b,
     __syncthreads();
   }
 
-#if 1 // serialized access to shared data;
   if ( threadIdx.x == 0 ) {
-    //atomicAdd(dprod, pdot[ 0 ] );
-    RAJA::atomicAdd(RAJA::hip_atomic{}, dprod, pdot[ 0 ] );
+    RAJA::atomicAdd<RAJA::hip_atomic>( dprod, pdot[ 0 ] );
   }
-#else // this doesn't work due to data races
-  if ( threadIdx.x == 0 ) {
-    *dprod += pdot[ 0 ];
-  }
-#endif
-
 }
 
-template < size_t block_size >
-void DOT::runHipVariantImpl(VariantID vid)
+template < size_t block_size, typename MappingHelper >
+void DOT::runHipVariantBase(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
-  const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
 
   auto res{getHipResource()};
@@ -72,41 +66,65 @@ void DOT::runHipVariantImpl(VariantID vid)
 
   if ( vid == Base_HIP ) {
 
-    Real_ptr dprod;
-    allocData(DataSpace::HipDevice, dprod, 1);
+    RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1, 1);
+
+    constexpr size_t shmem = sizeof(Real_type)*block_size;
+    const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS(
+        MappingHelper, (dot<block_size>), block_size, shmem);
 
     startTimer();
    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      hipErrchk( hipMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type),
-                                 hipMemcpyHostToDevice, res.get_stream() ) );
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1, 1);
+
+      const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = std::min(normal_grid_size, max_grid_size);
 
-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = sizeof(Real_type)*block_size;
-      hipLaunchKernelGGL((dot<block_size>), dim3(grid_size), dim3(block_size),
-                         shmem, res.get_stream(),
+      RPlaunchHipKernel( (dot<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
                          a, b, dprod, m_dot_init, iend );
-      hipErrchk( hipGetLastError() );
 
-      Real_type lprod;
-      hipErrchk( hipMemcpyAsync( &lprod, dprod, sizeof(Real_type),
-                                 hipMemcpyDeviceToHost, res.get_stream() ) );
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_dot += lprod;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(dprod, hdprod, 1, 1);
+      m_dot += hdprod[0];
 
     }
     stopTimer();
 
-    deallocData(DataSpace::HipDevice, dprod);
+    RAJAPERF_HIP_REDUCER_TEARDOWN(dprod, hdprod);
 
-  } else if ( vid == RAJA_HIP ) {
+  } else {
+    getCout() << "\n  DOT : Unknown Hip variant id = " << vid << std::endl;
+  }
+}
+
+template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+void DOT::runHipVariantRAJA(VariantID vid)
+{
+  using reduction_policy = std::conditional_t<AlgorithmHelper::atomic, RAJA::hip_reduce_atomic, RAJA::hip_reduce>;
+
+  using exec_policy = std::conditional_t<MappingHelper::direct, RAJA::hip_exec<block_size, true /*async*/>,
+      RAJA::hip_exec_occ_calc<block_size, true /*async*/>>;
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getHipResource()};
+
+  DOT_DATA_SETUP;
+
+  if ( vid == RAJA_HIP ) {
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::ReduceSum<RAJA::hip_reduce, Real_type> dot(m_dot_init);
+      RAJA::ReduceSum<reduction_policy, Real_type> dot(m_dot_init);
 
-      RAJA::forall< RAJA::hip_exec<block_size, true /*async*/> >( res,
+      RAJA::forall<exec_policy>( res,
         RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
         DOT_BODY;
       });
@@ -121,7 +139,160 @@ void DOT::runHipVariantImpl(VariantID vid)
   }
 }
 
-RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Hip)
+template < size_t block_size, typename MappingHelper >
+void DOT::runHipVariantRAJANewReduce(VariantID vid)
+{
+  using exec_policy = std::conditional_t<MappingHelper::direct, RAJA::hip_exec<block_size, true /*async*/>,
+      RAJA::hip_exec_occ_calc<block_size, true /*async*/>>;
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getHipResource()};
+
+  DOT_DATA_SETUP;
+
+  if ( vid == RAJA_HIP ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tdot = m_dot_init;
+
+      RAJA::forall<exec_policy>( res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+        [=] __device__ (Index_type i, Real_type& dot) {
+          DOT_BODY;
+        }
+      );
+
+      m_dot += static_cast<Real_type>(tdot);
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  DOT : Unknown HIP variant id = " << vid << std::endl;
+  }
+}
+
+void DOT::runHipVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+      if (run_params.numValidGPUBlockSize() == 0u ||
+          run_params.validGPUBlockSize(block_size)) {
+
+        seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) {
+
+          if ( vid == Base_HIP ) {
+
+            if (tune_idx == t) {
+
+              setBlockSize(block_size);
+              runHipVariantBase<block_size, decltype(mapping_helper)>(vid);
+
+            }
+
+            t += 1;
+
+          } else if ( vid == RAJA_HIP ) {
+
+            seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {
+
+              if (tune_idx == t) {
+
+                setBlockSize(block_size);
+                runHipVariantRAJA<block_size, decltype(algorithm_helper), decltype(mapping_helper)>(vid);
+
+              }
+
+              t += 1;
+
+            });
+
+            if (tune_idx == t) {
+
+              setBlockSize(block_size);
+              runHipVariantRAJANewReduce<block_size, decltype(mapping_helper)>(vid);
+
+            }
+
+            t += 1;
+
+          }
+
+        });
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  DOT : Unknown Hip variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void DOT::setHipTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+      if (run_params.numValidGPUBlockSize() == 0u ||
+          run_params.validGPUBlockSize(block_size)) {
+
+        seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) {
+
+          if ( vid == Base_HIP ) {
+
+            auto algorithm_helper = gpu_algorithm::block_atomic_helper{};
+
+            addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                      decltype(mapping_helper)::get_name()+"_"+
+                                      std::to_string(block_size));
+
+          } else if ( vid == RAJA_HIP ) {
+
+            seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {
+
+              addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                        decltype(mapping_helper)::get_name()+"_"+
+                                        std::to_string(block_size));
+
+            });
+
+            auto algorithm_helper = gpu_algorithm::block_device_helper{};
+
+            addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                      decltype(mapping_helper)::get_name()+"_"+
+                                      "new_"+std::to_string(block_size));
+
+          }
+
+        });
+
+      }
+
+    });
+
+  }
+
+}
 
 } // end namespace stream
 } // end namespace rajaperf
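Note: the "new" tunings above swap the RAJA::ReduceSum object for the RAJA::expt::Reduce parameter interface: the caller passes a pointer to a plain Real_type, and the lambda receives the running sum by reference as an extra argument. A sequential stand-in showing the same call shape (this is not RAJA's implementation, only an illustration of the pattern):

    #include <cassert>

    using Index_type = long;
    using Real_type  = double;

    // Illustrative sequential stand-in for forall + expt::Reduce: the sum
    // accumulates across iterations and lands back in *target on return.
    template <typename Body>
    void forall_reduce(Index_type begin, Index_type end,
                       Real_type* target, Body body)
    {
      Real_type sum = *target;     // seed with the caller's initial value
      for (Index_type i = begin; i < end; ++i) {
        body(i, sum);              // lambda sees the reduction by reference
      }
      *target = sum;
    }

    int main()
    {
      const Index_type n = 4;
      Real_type a[] = {1.0, 2.0, 3.0, 4.0};
      Real_type b[] = {2.0, 2.0, 2.0, 2.0};

      Real_type tdot = 0.0;        // plays the role of tdot = m_dot_init
      forall_reduce(0, n, &tdot, [=](Index_type i, Real_type& dot) {
        dot += a[i] * b[i];        // DOT_BODY
      });
      assert(tdot == 20.0);
      return 0;
    }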
diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp
index 6b7d67e0e..d7112336a 100644
--- a/src/stream/DOT-OMP.cpp
+++ b/src/stream/DOT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -18,7 +18,7 @@ namespace stream
 {
 
-void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void DOT::runOpenMPVariant(VariantID vid, size_t tune_idx)
 {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
 
@@ -76,20 +76,46 @@ void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
     case RAJA_OpenMP : {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
 
-        RAJA::ReduceSum<RAJA::omp_reduce, Real_type> dot(m_dot_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-        RAJA::forall<RAJA::omp_parallel_for_exec>(
-          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-            DOT_BODY;
-        });
+          RAJA::ReduceSum<RAJA::omp_reduce, Real_type> dot(m_dot_init);
 
-        m_dot += dot;
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+              DOT_BODY;
+          });
+
+          m_dot += dot;
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          Real_type tdot = m_dot_init;
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+            [=] (Index_type i, Real_type& dot) {
+              DOT_BODY;
+            }
+          );
+
+          m_dot += static_cast<Real_type>(tdot);
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n  DOT : Unknown OpenMP tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();
 
       break;
     }
@@ -102,8 +128,17 @@ void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
 #else
   RAJA_UNUSED_VAR(vid);
+  RAJA_UNUSED_VAR(tune_idx);
 #endif
 }
 
+void DOT::setOpenMPTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_OpenMP) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace stream
 } // end namespace rajaperf
diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp
index 7ab5d578e..238f8fbae 100644
--- a/src/stream/DOT-OMPTarget.cpp
+++ b/src/stream/DOT-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -26,7 +26,7 @@ namespace stream
   //
   const size_t threads_per_team = 256;
 
-void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void DOT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -34,44 +34,89 @@ void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
 
   DOT_DATA_SETUP;
 
-  if ( vid == Base_OpenMPTarget ) {
+  switch ( vid ) {
 
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+    case Base_OpenMPTarget : {
 
-      Real_type dot = m_dot_init;
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      #pragma omp target is_device_ptr(a, b) device( did ) map(tofrom:dot)
-      #pragma omp teams distribute parallel for reduction(+:dot) \
-                  thread_limit(threads_per_team) schedule(static, 1)
-      for (Index_type i = ibegin; i < iend; ++i ) {
-        DOT_BODY;
-      }
+        Real_type dot = m_dot_init;
+
+        #pragma omp target is_device_ptr(a, b) device( did ) map(tofrom:dot)
+        #pragma omp teams distribute parallel for reduction(+:dot) \
+                    thread_limit(threads_per_team) schedule(static, 1)
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          DOT_BODY;
+        }
+
+        m_dot += dot;
 
-      m_dot += dot;
+      }
+      stopTimer();
 
+      break;
     }
-    stopTimer();
 
-  } else if ( vid == RAJA_OpenMPTarget ) {
+    case RAJA_OpenMPTarget : {
+
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          RAJA::ReduceSum<RAJA::omp_target_reduce, Real_type> dot(m_dot_init);
+
+          RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+            RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+              DOT_BODY;
+          });
+
+          m_dot += static_cast<Real_type>(dot.get());
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
 
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::ReduceSum<RAJA::omp_target_reduce, Real_type> dot(m_dot_init);
+          Real_type tdot = m_dot_init;
 
-      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-          DOT_BODY;
-      });
+          RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+            [=] (Index_type i, Real_type& dot) {
+              DOT_BODY;
+            }
+          );
 
-      m_dot += static_cast<Real_type>(dot.get());
+          m_dot += static_cast<Real_type>(tdot);
 
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n  DOT : Unknown OMP Target tuning index = " << tune_idx << std::endl;
+      }
+
+      break;
+    }
+
+    default : {
+      getCout() << "\n  DOT : Unknown OMP Target variant id = " << vid << std::endl;
     }
-    stopTimer();
 
-  } else {
-    getCout() << "\n  DOT : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+
+}
+
+void DOT::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_OpenMPTarget) {
+    addVariantTuningName(vid, "new");
   }
 }
diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp
index fe7568191..4d359775f 100644
--- a/src/stream/DOT-Seq.cpp
+++ b/src/stream/DOT-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -18,8 +18,11 @@ namespace stream
 {
 
-void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void DOT::runSeqVariant(VariantID vid, size_t tune_idx)
 {
+#if !defined(RUN_RAJA_SEQ)
+  RAJA_UNUSED_VAR(tune_idx);
+#endif
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();
@@ -73,20 +76,45 @@ void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
     case RAJA_Seq : {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
 
-        RAJA::ReduceSum<RAJA::seq_reduce, Real_type> dot(m_dot_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-        RAJA::forall<RAJA::seq_exec>(
-          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-            DOT_BODY;
-        });
+          RAJA::ReduceSum<RAJA::seq_reduce, Real_type> dot(m_dot_init);
+
+          RAJA::forall<RAJA::seq_exec>(
+            RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+              DOT_BODY;
+          });
+
+          m_dot += static_cast<Real_type>(dot.get());
+
+        }
+        stopTimer();
+
+      } else if (tune_idx == 1) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          Real_type tdot = m_dot_init;
 
-        m_dot += static_cast<Real_type>(dot.get());
+          RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+            [=] (Index_type i, Real_type& dot) {
+              DOT_BODY;
+            }
+          );
 
+          m_dot += static_cast<Real_type>(tdot);
+
+        }
+        stopTimer();
+
+      } else {
+        getCout() << "\n  DOT : Unknown Seq tuning index = " << tune_idx << std::endl;
       }
-      stopTimer();
 
       break;
     }
@@ -100,5 +128,13 @@ void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
 }
 
+void DOT::setSeqTuningDefinitions(VariantID vid)
+{
+  addVariantTuningName(vid, "default");
+  if (vid == RAJA_Seq) {
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace stream
 } // end namespace rajaperf
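Note: in the base GPU variants of DOT above, each block tree-reduces its partial products in shared memory and a single thread then atomically adds the block result to the global accumulator; the DOT-Sycl file below instead hands that job to sycl::reduction. A plain-C++ sketch of the per-block halving loop (block_size must be a power of two, as in the suite's block-size tunings; values are illustrative):

    #include <cassert>
    #include <vector>

    int main()
    {
      const unsigned block_size = 8;  // power of two, as in the suite
      std::vector<double> pdot = {1, 2, 3, 4, 5, 6, 7, 8}; // per-thread partials

      // Mirrors the shared-memory loop in the dot kernels: halve the active
      // range each step, adding the upper half into the lower half.
      for (unsigned s = block_size / 2; s > 0; s /= 2) {
        for (unsigned t = 0; t < s; ++t) {  // "threads" below the stride
          pdot[t] += pdot[t + s];
        }
        // __syncthreads() would go here on a real GPU
      }

      assert(pdot[0] == 36.0);  // the block's contribution to dprod
      return 0;
    }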
diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp
new file mode 100644
index 000000000..250f0b680
--- /dev/null
+++ b/src/stream/DOT-Sycl.cpp
@@ -0,0 +1,103 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "DOT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template <size_t work_group_size >
+void DOT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  DOT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    Real_ptr dot;
+    allocAndInitSyclDeviceData(dot, &m_dot_init, 1, qu);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      initSyclDeviceData(dot, &m_dot_init, 1, qu);
+
+      qu->submit([&] (sycl::handler& h) {
+
+        auto sumReduction = sycl::reduction(dot, sycl::plus<Real_type>());
+
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       sumReduction,
+                       [=] (sycl::nd_item<1> item, auto& dot) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            DOT_BODY;
+          }
+
+        });
+      });
+
+      Real_type ldot;
+      Real_ptr pldot = &ldot;
+      getSyclDeviceData(pldot, dot, 1, qu);
+      m_dot += ldot;
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tdot = m_dot_init;
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false /*async*/> >(
+        res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+        [=] (Index_type i, Real_type& dot) {
+          DOT_BODY;
+        }
+      );
+
+      m_dot += static_cast<Real_type>(tdot);
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  DOT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp
index cc32be5f2..5249c8ebd 100644
--- a/src/stream/DOT.cpp
+++ b/src/stream/DOT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,9 +28,10 @@ DOT::DOT(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) +
-                  (0*sizeof(Real_type) + 2*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) +
+                      2*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 
   setUsesFeature( Forall );
@@ -53,6 +54,9 @@ DOT::DOT(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp
index 5912c120a..2626dbc5e 100644
--- a/src/stream/DOT.hpp
+++ b/src/stream/DOT.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,18 +51,37 @@ class DOT : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
 
+  void setSeqTuningDefinitions(VariantID vid);
+  void setOpenMPTuningDefinitions(VariantID vid);
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
-  template < size_t block_size >
-  void runCudaVariantImpl(VariantID vid);
-  template < size_t block_size >
-  void runHipVariantImpl(VariantID vid);
+  void setOpenMPTargetTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
+  template < size_t block_size, typename MappingHelper >
+  void runCudaVariantBase(VariantID vid);
+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runCudaVariantRAJA(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runCudaVariantRAJANewReduce(VariantID vid);
+
+  template < size_t block_size, typename MappingHelper >
+  void runHipVariantBase(VariantID vid);
+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runHipVariantRAJA(VariantID vid);
+  template < size_t block_size, typename MappingHelper >
+  void runHipVariantRAJANewReduce(VariantID vid);
+
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_a;
   Real_ptr m_b;
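Note: the gpu_block_sizes_type change above (gpu_block_size::make_list_type to integer::make_gpu_block_size_list_type) still yields a compile-time list of block sizes; seq_for then calls a generic lambda once per entry, so a runtime tune index can select a compile-time block-size constant. A minimal sketch of that dispatch idiom (for_each_block_size is a hypothetical reimplementation, not the suite's seq_for):

    #include <cstddef>
    #include <cstdio>
    #include <type_traits>

    // Hypothetical stand-in for the suite's seq_for over a compile-time list.
    template <std::size_t... Ns, typename F>
    void for_each_block_size(F&& f)
    {
      (f(std::integral_constant<std::size_t, Ns>{}), ...); // C++17 fold
    }

    template <std::size_t block_size>
    void run_impl() { std::printf("instantiated for block %zu\n", block_size); }

    int main()
    {
      std::size_t tune_idx = 1, t = 0;  // runtime tuning selection
      for_each_block_size<128, 256, 512>([&](auto block_size) {
        if (t == tune_idx) {
          // ::value recovers the compile-time constant from the tag object
          run_impl<decltype(block_size)::value>();
        }
        t += 1;
      });
      return 0;
    }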
diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp
index adfebfd01..55731255b 100644
--- a/src/stream/MUL-Cuda.cpp
+++ b/src/stream/MUL-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,9 +51,11 @@ void MUL::runCudaVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      mul<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( b, c, alpha,
-                                                                           iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (mul<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          b, c, alpha, iend );
 
     }
     stopTimer();
@@ -63,13 +65,18 @@ void MUL::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      auto mul_lambda = [=] __device__ (Index_type i) {
+        MUL_BODY;
+      };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      lambda_cuda_forall<<<grid_size, block_size, shmem, res.get_stream()>>>(
-        ibegin, iend, [=] __device__ (Index_type i) {
-          MUL_BODY;
-        });
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (lambda_cuda_forall<decltype(mul_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, mul_lambda );
 
     }
     stopTimer();
diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp
index 8a2394612..0990ac09b 100644
--- a/src/stream/MUL-Hip.cpp
+++ b/src/stream/MUL-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,9 +51,11 @@ void MUL::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((mul<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), b, c, alpha,
-                         iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (mul<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         b, c, alpha, iend );
 
     }
     stopTimer();
@@ -69,9 +71,12 @@ void MUL::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((lambda_hip_forall<decltype(mul_lambda)>),
-                         grid_size, block_size, shmem, res.get_stream(), ibegin, iend, mul_lambda);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (lambda_hip_forall<decltype(mul_lambda)>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend, mul_lambda );
 
     }
     stopTimer();
diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp
index 3369d0f3d..e5a17864e 100644
--- a/src/stream/MUL-OMP.cpp
+++ b/src/stream/MUL-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp
index c5f20d6b3..07edb732f 100644
--- a/src/stream/MUL-OMPTarget.cpp
+++ b/src/stream/MUL-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp
index 9107945fd..8bffdb3ca 100644
--- a/src/stream/MUL-Seq.cpp
+++ b/src/stream/MUL-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp
new file mode 100644
index 000000000..01be5d872
--- /dev/null
+++ b/src/stream/MUL-Sycl.cpp
@@ -0,0 +1,80 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "MUL.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template <size_t work_group_size >
+void MUL::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  MUL_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            MUL_BODY
+          }
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false /*async*/> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        MUL_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  MUL : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MUL, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp
index fba825bf6..eedea75c7 100644
--- a/src/stream/MUL.cpp
+++ b/src/stream/MUL.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@ MUL::MUL(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
   setUsesFeature( Forall );
@@ -53,6 +54,9 @@ MUL::MUL(const RunParams& params)
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp
index 3db59092a..6edd6381a 100644
--- a/src/stream/MUL.hpp
+++ b/src/stream/MUL.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -52,18 +52,24 @@ class MUL : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_b;
   Real_ptr m_c;
diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp
index af3af1c63..89f931f6c 100644
--- a/src/stream/TRIAD-Cuda.cpp
+++ b/src/stream/TRIAD-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,9 +51,11 @@ void TRIAD::runCudaVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
      constexpr size_t shmem = 0;
-      triad<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( a, b, c, alpha,
-                                                                             iend );
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (triad<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          a, b, c, alpha, iend );
 
     }
     stopTimer();
@@ -63,13 +65,18 @@ void TRIAD::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      auto triad_lambda = [=] __device__ (Index_type i) {
+        TRIAD_BODY;
+      };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      lambda_cuda_forall<<<grid_size, block_size, shmem, res.get_stream()>>>(
-        ibegin, iend, [=] __device__ (Index_type i) {
-          TRIAD_BODY;
-        });
-      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (lambda_cuda_forall<decltype(triad_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, triad_lambda );
 
     }
     stopTimer();
diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp
index a8a5b9f99..aebaa3ec1 100644
--- a/src/stream/TRIAD-Hip.cpp
+++ b/src/stream/TRIAD-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -51,9 +51,11 @@ void TRIAD::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((triad<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), a, b, c, alpha,
-                         iend );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (triad<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         a, b, c, alpha, iend );
 
     }
     stopTimer();
@@ -69,9 +71,12 @@ void TRIAD::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((lambda_hip_forall<decltype(triad_lambda)>),
-                         grid_size, block_size, shmem, res.get_stream(), ibegin, iend, triad_lambda);
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (lambda_hip_forall<decltype(triad_lambda)>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend, triad_lambda );
 
     }
     stopTimer();
diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp
index 5d9832d95..abbadb240 100644
--- a/src/stream/TRIAD-OMP.cpp
+++ b/src/stream/TRIAD-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp
index dfea3158d..5ec18d155 100644
--- a/src/stream/TRIAD-OMPTarget.cpp
+++ b/src/stream/TRIAD-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp
index 96ab6ccea..132892f76 100644
--- a/src/stream/TRIAD-Seq.cpp
+++ b/src/stream/TRIAD-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp
new file mode 100644
index 000000000..c8ecafdf7
--- /dev/null
+++ b/src/stream/TRIAD-Sycl.cpp
@@ -0,0 +1,82 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "TRIAD.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+template <size_t work_group_size >
+void TRIAD::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  TRIAD_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            TRIAD_BODY
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false /*async*/> >( res,
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        TRIAD_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  TRIAD : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRIAD, Sycl)
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp
index d9897618c..da6386755 100644
--- a/src/stream/TRIAD.cpp
+++ b/src/stream/TRIAD.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -28,8 +28,9 @@ TRIAD::TRIAD(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) *
-                  getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 
   checksum_scale_factor = 0.001 *
@@ -57,6 +58,9 @@ TRIAD::TRIAD(const RunParams& params)
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp
index 3f65bf804..afb06cd3c 100644
--- a/src/stream/TRIAD.hpp
+++ b/src/stream/TRIAD.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -53,18 +53,24 @@ class TRIAD : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_a;
   Real_ptr m_b;
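Note: the TRIAD.cpp hunk above encodes the classic STREAM triad traffic: a[i] = b[i] + alpha * c[i] reads two words, writes one, and performs two flops per element. A quick check of those per-rep totals (8-byte Real_type assumed; the problem size is illustrative):

    #include <cassert>
    #include <cstddef>

    int main()
    {
      using Real_type = double;        // assumed 8-byte suite default
      const std::size_t n = 1000000;   // stands in for getActualProblemSize()
      // a[i] = b[i] + alpha * c[i];  reads b and c, writes a, 1 mul + 1 add
      const std::size_t bytes_read    = 2 * sizeof(Real_type) * n;
      const std::size_t bytes_written = 1 * sizeof(Real_type) * n;
      const std::size_t flops         = 2 * n;
      assert(bytes_read == 16000000);
      assert(bytes_written == 8000000);
      assert(flops == 2000000);
      return 0;
    }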
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 001c81190..88b61bbe5 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
@@ -13,13 +13,23 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS
       lcals
       polybench
       stream
-      algorithm)
+      algorithm
+      comm)
 
 list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS})
-
-raja_add_test(
-  NAME test-raja-perf-suite
-  SOURCES test-raja-perf-suite.cpp
-  DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS}
-  )
+
+if (RAJA_PERFSUITE_ENABLE_MPI)
+  raja_add_test(
+    NAME test-raja-perf-suite
+    SOURCES test-raja-perf-suite.cpp
+    DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS}
+    NUM_MPI_TASKS ${RAJA_PERFSUITE_NUM_MPI_TASKS}
+    )
+else()
+  raja_add_test(
+    NAME test-raja-perf-suite
+    SOURCES test-raja-perf-suite.cpp
+    DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS}
+    )
+endif()
 
 target_include_directories(test-raja-perf-suite.exe PRIVATE ${PROJECT_SOURCE_DIR}/src)
diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp
index 26ebcbda5..6f36958c0 100644
--- a/test/test-raja-perf-suite.cpp
+++ b/test/test-raja-perf-suite.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
@@ -8,6 +8,10 @@
 
 #include "gtest/gtest.h"
 
+#if defined(RUN_KOKKOS)
+#include <Kokkos_Core.hpp>
+#endif
+
 #include "common/Executor.hpp"
 #include "common/KernelBase.hpp"
 
@@ -16,6 +20,33 @@
 #include <string>
 #include <vector>
 
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+#include <mpi.h>
+#endif
+
+int main( int argc, char** argv )
+{
+  testing::InitGoogleTest(&argc, argv);
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+  MPI_Init(&argc, &argv);
+#endif
+#if defined(RUN_KOKKOS)
+  Kokkos::initialize(argc, argv);
+#endif
+
+  int res = RUN_ALL_TESTS();
+
+#if defined(RUN_KOKKOS)
+  Kokkos::finalize();
+#endif
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+  MPI_Finalize();
+#endif
+
+  return res;
+}
+
 TEST(ShortSuiteTest, Basic)
 {
 
@@ -32,7 +63,7 @@ TEST(ShortSuiteTest, Basic)
      (HIP_VERSION_MAJOR < 5 || \
      (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1))
   sargv.emplace_back(std::string("--exclude-kernels"));
-  sargv.emplace_back(std::string("HALOEXCHANGE_FUSED"));
+  sargv.emplace_back(std::string("HALO_PACKING_FUSED"));
 #endif
 
 #if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11)
diff --git a/tpl/RAJA b/tpl/RAJA
index 9b5f61edf..378199aac 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit 9b5f61edf3aa1e6fdbc9a4b30828c81504639963
+Subproject commit 378199aac342ee21c2ddfbcbb48413bd1dfac612