From 7f68aeb6d52ba756e6d1b3cd6857e17012cab32e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 13 Dec 2024 09:23:03 -0700 Subject: [PATCH 1/6] Update Kokkos library in LAMMPS to v4.5.0 --- lib/kokkos/CHANGELOG.md | 101 +- lib/kokkos/CMakeLists.txt | 417 ++- lib/kokkos/CONTRIBUTING.md | 2 + lib/kokkos/HOW_TO_SNAPSHOT | 73 - lib/kokkos/Makefile.kokkos | 280 +- lib/kokkos/Makefile.targets | 16 +- lib/kokkos/README.md | 6 +- lib/kokkos/algorithms/CMakeLists.txt | 12 +- lib/kokkos/algorithms/src/CMakeLists.txt | 33 +- lib/kokkos/algorithms/src/Kokkos_Random.hpp | 11 +- .../src/sorting/Kokkos_BinOpsPublicAPI.hpp | 22 +- .../src/sorting/Kokkos_BinSortPublicAPI.hpp | 3 +- .../src/sorting/Kokkos_SortPublicAPI.hpp | 20 +- .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 22 +- .../src/sorting/impl/Kokkos_SortImpl.hpp | 44 +- .../src/std_algorithms/Kokkos_Reduce.hpp | 24 +- .../std_algorithms/Kokkos_TransformReduce.hpp | 24 +- .../impl/Kokkos_Constraints.hpp | 21 +- .../impl/Kokkos_MoveBackward.hpp | 2 +- .../impl/Kokkos_RandomAccessIterator.hpp | 20 +- .../std_algorithms/impl/Kokkos_Reverse.hpp | 2 +- .../impl/Kokkos_ReverseCopy.hpp | 2 +- .../algorithms/unit_tests/CMakeLists.txt | 400 ++- .../algorithms/unit_tests/TestBinSortA.hpp | 49 +- .../algorithms/unit_tests/TestBinSortB.hpp | 4 + .../algorithms/unit_tests/TestNestedSort.hpp | 10 + .../algorithms/unit_tests/TestRandom.hpp | 11 +- .../unit_tests/TestRandomAccessIterator.cpp | 33 +- lib/kokkos/algorithms/unit_tests/TestSort.hpp | 15 +- .../algorithms/unit_tests/TestSortByKey.hpp | 20 +- .../TestStdAlgorithmsAdjacentDifference.cpp | 2 +- .../TestStdAlgorithmsAdjacentFind.cpp | 4 +- .../unit_tests/TestStdAlgorithmsCommon.hpp | 28 +- .../TestStdAlgorithmsConstraints.cpp | 19 +- .../unit_tests/TestStdAlgorithmsCopyIf.cpp | 8 +- .../TestStdAlgorithmsExclusiveScan.cpp | 2 +- .../unit_tests/TestStdAlgorithmsForEach.cpp | 2 - .../TestStdAlgorithmsHelperFunctors.hpp | 2 +- .../TestStdAlgorithmsInclusiveScan.cpp | 2 +- 
.../unit_tests/TestStdAlgorithmsIsSorted.cpp | 7 +- .../TestStdAlgorithmsIsSortedUntil.cpp | 5 +- .../unit_tests/TestStdAlgorithmsMismatch.cpp | 2 +- .../unit_tests/TestStdAlgorithmsModOps.cpp | 2 +- .../unit_tests/TestStdAlgorithmsModSeqOps.cpp | 2 +- .../TestStdAlgorithmsMoveBackward.cpp | 2 +- .../TestStdAlgorithmsPartitionCopy.cpp | 6 +- .../unit_tests/TestStdAlgorithmsRemove.cpp | 4 +- .../TestStdAlgorithmsRemoveCopy.cpp | 2 +- .../TestStdAlgorithmsRemoveCopyIf.cpp | 2 +- .../unit_tests/TestStdAlgorithmsRemoveIf.cpp | 6 +- .../unit_tests/TestStdAlgorithmsReplace.cpp | 4 +- .../TestStdAlgorithmsReplaceCopy.cpp | 4 +- .../TestStdAlgorithmsReplaceCopyIf.cpp | 4 +- .../unit_tests/TestStdAlgorithmsReplaceIf.cpp | 2 +- .../unit_tests/TestStdAlgorithmsReverse.cpp | 2 +- .../unit_tests/TestStdAlgorithmsRotate.cpp | 2 +- .../TestStdAlgorithmsRotateCopy.cpp | 4 +- .../unit_tests/TestStdAlgorithmsSearch.cpp | 2 +- .../unit_tests/TestStdAlgorithmsSearch_n.cpp | 4 +- .../unit_tests/TestStdAlgorithmsShiftLeft.cpp | 2 +- .../TestStdAlgorithmsShiftRight.cpp | 4 +- ...estStdAlgorithmsTeamAdjacentDifference.cpp | 8 +- .../unit_tests/TestStdAlgorithmsTeamCopy.cpp | 2 +- .../TestStdAlgorithmsTeamCopyIf.cpp | 2 +- .../TestStdAlgorithmsTeamCopy_n.cpp | 2 +- .../unit_tests/TestStdAlgorithmsTeamCount.cpp | 2 +- .../TestStdAlgorithmsTeamExclusiveScan.cpp | 4 +- .../unit_tests/TestStdAlgorithmsTeamFind.cpp | 2 +- .../TestStdAlgorithmsTeamFindEnd.cpp | 8 +- .../TestStdAlgorithmsTeamFindIf.cpp | 2 +- .../TestStdAlgorithmsTeamFindIfNot.cpp | 2 +- .../TestStdAlgorithmsTeamGenerate_n.cpp | 2 +- .../TestStdAlgorithmsTeamIsSorted.cpp | 2 +- .../TestStdAlgorithmsTeamIsSortedUntil.cpp | 10 +- .../TestStdAlgorithmsTeamMaxElement.cpp | 4 +- .../TestStdAlgorithmsTeamMinElement.cpp | 4 +- .../TestStdAlgorithmsTeamMinMaxElement.cpp | 4 +- .../unit_tests/TestStdAlgorithmsTeamMove.cpp | 2 +- .../TestStdAlgorithmsTeamRemove.cpp | 2 +- .../TestStdAlgorithmsTeamRemoveCopy.cpp | 4 +- 
.../TestStdAlgorithmsTeamRemoveCopyIf.cpp | 4 +- .../TestStdAlgorithmsTeamReplaceCopy.cpp | 4 +- .../TestStdAlgorithmsTeamReplaceCopyIf.cpp | 4 +- .../TestStdAlgorithmsTeamRotateCopy.cpp | 2 +- .../TestStdAlgorithmsTeamShiftRight.cpp | 2 +- .../TestStdAlgorithmsTeamSwapRanges.cpp | 2 +- ...tdAlgorithmsTeamTransformInclusiveScan.cpp | 4 +- .../TestStdAlgorithmsTeamUnique.cpp | 4 +- .../TestStdAlgorithmsTeamUniqueCopy.cpp | 10 +- ...estStdAlgorithmsTransformExclusiveScan.cpp | 4 +- ...estStdAlgorithmsTransformInclusiveScan.cpp | 4 +- .../unit_tests/TestStdAlgorithmsUnique.cpp | 2 +- .../TestStdAlgorithmsUniqueCopy.cpp | 4 +- .../algorithms/unit_tests/TestStdReducers.cpp | 6 +- lib/kokkos/appveyor.yml | 10 - lib/kokkos/benchmarks/CMakeLists.txt | 20 +- lib/kokkos/benchmarks/atomic/CMakeLists.txt | 5 +- .../benchmarks/bytes_and_flops/CMakeLists.txt | 9 +- .../bytes_and_flops/bench_unroll_stride.hpp | 6 +- lib/kokkos/benchmarks/gather/CMakeLists.txt | 5 +- lib/kokkos/benchmarks/gups/CMakeLists.txt | 5 +- lib/kokkos/benchmarks/gups/gups.cpp | 2 +- .../benchmarks/launch_latency/CMakeLists.txt | 5 +- .../launch_latency/launch_latency.cpp | 4 +- .../policy_performance/CMakeLists.txt | 5 +- lib/kokkos/benchmarks/stream/CMakeLists.txt | 5 +- .../view_copy_constructor/CMakeLists.txt | 5 +- lib/kokkos/bin/kokkos_launch_compiler | 4 +- lib/kokkos/cmake/Dependencies.cmake | 6 +- lib/kokkos/cmake/KokkosCore_config.h.in | 10 +- .../cmake/KokkosTrilinosConfig.cmake.in | 17 - lib/kokkos/cmake/Modules/CudaToolkit.cmake | 196 +- lib/kokkos/cmake/Modules/FindTPLCUDA.cmake | 68 +- lib/kokkos/cmake/Modules/FindTPLHPX.cmake | 11 +- lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake | 2 +- lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake | 2 +- .../cmake/Modules/FindTPLLIBQUADMATH.cmake | 20 +- lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake | 66 +- lib/kokkos/cmake/Modules/FindTPLROCM.cmake | 22 +- .../cmake/Modules/FindTPLROCTHRUST.cmake | 10 +- lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake | 23 +- 
lib/kokkos/cmake/README.md | 14 - lib/kokkos/cmake/build_env_info.cmake | 103 +- .../compile_tests/amd_apu.cc} | 24 +- lib/kokkos/cmake/cray.cmake | 11 +- lib/kokkos/cmake/deps/CUDA.cmake | 30 +- lib/kokkos/cmake/deps/HWLOC.cmake | 6 +- lib/kokkos/cmake/deps/Pthread.cmake | 38 +- lib/kokkos/cmake/deps/quadmath.cmake | 5 +- lib/kokkos/cmake/fake_tribits.cmake | 465 ++-- lib/kokkos/cmake/gnu.cmake | 38 +- lib/kokkos/cmake/intel.cmake | 29 +- lib/kokkos/cmake/kokkos_arch.cmake | 2261 +++++++++-------- lib/kokkos/cmake/kokkos_check_env.cmake | 27 +- lib/kokkos/cmake/kokkos_compiler_id.cmake | 437 ++-- .../cmake/kokkos_configure_trilinos.cmake | 38 + lib/kokkos/cmake/kokkos_corner_cases.cmake | 12 +- lib/kokkos/cmake/kokkos_enable_devices.cmake | 212 +- lib/kokkos/cmake/kokkos_enable_options.cmake | 358 +-- lib/kokkos/cmake/kokkos_functions.cmake | 1319 +++++----- lib/kokkos/cmake/kokkos_install.cmake | 78 +- lib/kokkos/cmake/kokkos_pick_cxx_std.cmake | 36 +- lib/kokkos/cmake/kokkos_test_cxx_std.cmake | 285 ++- lib/kokkos/cmake/kokkos_tpls.cmake | 202 +- lib/kokkos/cmake/kokkos_tribits.cmake | 732 +++--- lib/kokkos/cmake/msvc.cmake | 20 +- lib/kokkos/cmake/pgi.cmake | 10 +- lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake | 7 +- lib/kokkos/cmake/tpls/FindTPLPthread.cmake | 35 +- lib/kokkos/cmake/tpls/FindTPLquadmath.cmake | 5 +- lib/kokkos/containers/CMakeLists.txt | 14 +- .../performance_tests/CMakeLists.txt | 17 +- .../performance_tests/TestScatterView.hpp | 8 +- lib/kokkos/containers/src/CMakeLists.txt | 34 +- lib/kokkos/containers/src/Kokkos_Bitset.hpp | 2 +- lib/kokkos/containers/src/Kokkos_DualView.hpp | 241 +- .../containers/src/Kokkos_DynRankView.hpp | 1451 ++++------- .../containers/src/Kokkos_DynamicView.hpp | 104 +- .../containers/src/Kokkos_OffsetView.hpp | 961 ++----- .../containers/src/Kokkos_ScatterView.hpp | 78 +- .../containers/src/Kokkos_StaticCrsGraph.hpp | 2 +- .../containers/src/Kokkos_UnorderedMap.hpp | 16 +- lib/kokkos/containers/src/Kokkos_Vector.hpp | 5 
+- .../containers/unit_tests/CMakeLists.txt | 67 +- .../containers/unit_tests/TestBitset.hpp | 2 +- .../containers/unit_tests/TestDualView.hpp | 117 +- .../unit_tests/TestDynRankViewTypedefs.cpp | 260 ++ .../TestDynRankView_TeamScratch.hpp | 72 + .../containers/unit_tests/TestDynViewAPI.hpp | 25 +- .../containers/unit_tests/TestDynamicView.hpp | 33 +- .../unit_tests/TestErrorReporter.hpp | 5 +- .../containers/unit_tests/TestOffsetView.hpp | 210 +- .../containers/unit_tests/TestScatterView.hpp | 18 +- .../unit_tests/TestStaticCrsGraph.hpp | 18 +- .../unit_tests/TestUnorderedMap.hpp | 5 +- .../TestViewCtorPropEmbeddedDim.hpp | 16 +- .../unit_tests/TestWithoutInitializing.hpp | 28 +- lib/kokkos/core/CMakeLists.txt | 30 +- lib/kokkos/core/perf_test/CMakeLists.txt | 254 +- lib/kokkos/core/perf_test/PerfTestHexGrad.cpp | 4 +- .../perf_test/PerfTest_CustomReduction.cpp | 2 - .../PerfTest_ExecSpacePartitioning.cpp | 3 +- .../core/perf_test/PerfTest_ViewCopy_Raw.cpp | 2 - .../core/perf_test/PerfTest_ViewFill_Raw.cpp | 2 - .../perf_test/PerfTest_ViewResize_Raw.cpp | 2 - lib/kokkos/core/perf_test/test_mempool.cpp | 4 +- .../core/perf_test/test_sharedSpace.cpp | 2 +- lib/kokkos/core/perf_test/test_taskdag.cpp | 9 + lib/kokkos/core/src/CMakeLists.txt | 314 ++- lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp | 1 - lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 9 +- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp | 18 +- .../src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp | 41 +- .../core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp | 32 +- .../core/src/Cuda/Kokkos_Cuda_Instance.cpp | 24 +- .../src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 13 +- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 58 +- .../src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 2 +- .../src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 37 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp | 11 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp | 186 +- .../src/Cuda/Kokkos_Cuda_Vectorization.hpp | 18 +- .../core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp | 11 +- 
lib/kokkos/core/src/HIP/Kokkos_HIP.cpp | 60 +- .../HIP/Kokkos_HIP_BlockSize_Deduction.hpp | 5 +- .../src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 43 +- .../core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 37 +- .../core/src/HIP/Kokkos_HIP_Instance.cpp | 40 +- .../core/src/HIP/Kokkos_HIP_Instance.hpp | 10 +- .../core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 44 +- .../HIP/Kokkos_HIP_ParallelFor_MDRange.hpp | 6 +- .../src/HIP/Kokkos_HIP_ParallelFor_Range.hpp | 4 +- .../src/HIP/Kokkos_HIP_ParallelFor_Team.hpp | 20 +- .../HIP/Kokkos_HIP_ParallelReduce_Team.hpp | 79 +- .../HIP/Kokkos_HIP_SharedAllocationRecord.cpp | 2 +- .../HIP/Kokkos_HIP_SharedAllocationRecord.hpp | 2 +- .../src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp | 10 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp | 49 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp | 44 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp | 197 +- .../src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp | 3 +- .../core/src/HIP/Kokkos_HIP_Vectorization.hpp | 22 +- .../core/src/HIP/Kokkos_HIP_ZeroMemset.cpp | 36 + .../core/src/HIP/Kokkos_HIP_ZeroMemset.hpp | 21 +- lib/kokkos/core/src/HPX/Kokkos_HPX.hpp | 117 +- lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp | 11 + .../core/src/KokkosExp_MDRangePolicy.hpp | 94 +- lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp | 8 +- lib/kokkos/core/src/Kokkos_Array.hpp | 40 +- lib/kokkos/core/src/Kokkos_Atomic.hpp | 1 - .../Kokkos_Atomics_Desul_Volatile_Wrapper.hpp | 196 -- .../core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 277 +- lib/kokkos/core/src/Kokkos_Complex.hpp | 30 +- lib/kokkos/core/src/Kokkos_Concepts.hpp | 51 +- lib/kokkos/core/src/Kokkos_CopyViews.hpp | 346 ++- lib/kokkos/core/src/Kokkos_Core.hpp | 10 +- lib/kokkos/core/src/Kokkos_Core_fwd.hpp | 9 +- lib/kokkos/core/src/Kokkos_Crs.hpp | 14 +- lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp | 6 +- lib/kokkos/core/src/Kokkos_ExecPolicy.hpp | 160 +- lib/kokkos/core/src/Kokkos_Extents.hpp | 2 +- lib/kokkos/core/src/Kokkos_Future.hpp | 37 +- lib/kokkos/core/src/Kokkos_Graph.hpp | 69 +- 
lib/kokkos/core/src/Kokkos_GraphNode.hpp | 86 +- lib/kokkos/core/src/Kokkos_HostSpace.hpp | 33 +- lib/kokkos/core/src/Kokkos_Layout.hpp | 36 +- lib/kokkos/core/src/Kokkos_Macros.hpp | 64 +- lib/kokkos/core/src/Kokkos_MemoryPool.hpp | 11 +- lib/kokkos/core/src/Kokkos_NumericTraits.hpp | 2 +- lib/kokkos/core/src/Kokkos_Pair.hpp | 6 +- lib/kokkos/core/src/Kokkos_Parallel.hpp | 24 +- .../core/src/Kokkos_Parallel_Reduce.hpp | 181 +- .../src/Kokkos_Profiling_ProfileSection.hpp | 2 +- .../src/Kokkos_Profiling_ScopedRegion.hpp | 2 +- lib/kokkos/core/src/Kokkos_ScratchSpace.hpp | 2 +- lib/kokkos/core/src/Kokkos_TaskScheduler.hpp | 68 +- .../core/src/Kokkos_TaskScheduler_fwd.hpp | 43 +- lib/kokkos/core/src/Kokkos_Timer.hpp | 2 +- lib/kokkos/core/src/Kokkos_Tuners.hpp | 125 +- lib/kokkos/core/src/Kokkos_TypeInfo.hpp | 103 + lib/kokkos/core/src/Kokkos_View.hpp | 2014 +-------------- .../core/src/Kokkos_WorkGraphPolicy.hpp | 4 +- .../core/src/OpenACC/Kokkos_OpenACC.cpp | 55 + .../core/src/OpenACC/Kokkos_OpenACC.hpp | 5 +- .../core/src/OpenACC/Kokkos_OpenACCSpace.hpp | 10 + .../OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp | 2 +- .../src/OpenACC/Kokkos_OpenACC_Instance.cpp | 15 +- .../src/OpenACC/Kokkos_OpenACC_Instance.hpp | 3 +- .../Kokkos_OpenACC_ParallelFor_MDRange.hpp | 620 +++-- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 560 +++- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 111 +- .../Kokkos_OpenACC_ParallelScan_Range.hpp | 2 +- .../src/OpenACC/Kokkos_OpenACC_Traits.hpp | 5 +- lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp | 14 +- .../core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 11 + .../src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 2 - .../OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp | 81 +- .../Kokkos_OpenMPTarget_DeepCopy.hpp | 101 + .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 130 - .../Kokkos_OpenMPTarget_FunctorAdapter.hpp | 48 + .../Kokkos_OpenMPTarget_Instance.cpp | 88 +- .../Kokkos_OpenMPTarget_Instance.hpp | 21 +- .../Kokkos_OpenMPTarget_Parallel.hpp | 41 +- 
...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 129 +- .../Kokkos_OpenMPTarget_ParallelFor_Range.hpp | 24 +- .../Kokkos_OpenMPTarget_ParallelFor_Team.hpp | 41 +- ...os_OpenMPTarget_ParallelReduce_MDRange.hpp | 336 +-- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 24 +- ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 76 +- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 58 +- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 216 +- .../Kokkos_OpenMPTarget_Reducer.hpp | 160 +- .../OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp | 251 -- .../OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp | 319 --- lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp | 12 +- lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp | 15 +- .../core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp | 36 +- .../src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp | 37 +- .../src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp | 14 +- .../core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp | 64 +- .../core/src/SYCL/Kokkos_SYCL_Instance.cpp | 49 +- .../core/src/SYCL/Kokkos_SYCL_Instance.hpp | 45 +- .../src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp | 11 +- .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 13 +- .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 18 +- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 24 +- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 16 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 19 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 29 +- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 23 +- .../core/src/SYCL/Kokkos_SYCL_Space.cpp | 32 +- .../core/src/SYCL/Kokkos_SYCL_Space.hpp | 106 +- lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp | 163 +- .../core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp | 5 +- .../core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp | 15 +- .../core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp | 11 +- lib/kokkos/core/src/Serial/Kokkos_Serial.hpp | 5 +- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 10 + .../Serial/Kokkos_Serial_Parallel_Range.hpp | 38 +- .../Serial/Kokkos_Serial_Parallel_Team.hpp | 20 +- .../core/src/Serial/Kokkos_Serial_Task.hpp | 15 +- 
.../Serial/Kokkos_Serial_WorkGraphPolicy.hpp | 4 +- .../src/Serial/Kokkos_Serial_ZeroMemset.hpp | 12 +- .../src/Threads/Kokkos_Threads_Instance.cpp | 20 +- .../src/Threads/Kokkos_Threads_Instance.hpp | 8 +- .../Kokkos_Threads_ParallelFor_MDRange.hpp | 4 +- .../Kokkos_Threads_ParallelFor_Range.hpp | 8 +- .../Kokkos_Threads_ParallelFor_Team.hpp | 24 +- .../Kokkos_Threads_ParallelReduce_MDRange.hpp | 10 +- .../Kokkos_Threads_ParallelReduce_Range.hpp | 8 +- .../Kokkos_Threads_ParallelReduce_Team.hpp | 13 +- .../Kokkos_Threads_ParallelScan_Range.hpp | 8 +- .../src/Threads/Kokkos_Threads_Spinwait.cpp | 2 +- .../src/Threads/Kokkos_Threads_Spinwait.hpp | 3 +- .../core/src/Threads/Kokkos_Threads_Team.hpp | 173 +- .../Kokkos_Threads_WorkGraphPolicy.hpp | 4 +- lib/kokkos/core/src/View/Kokkos_BasicView.hpp | 652 +++++ lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp | 308 +-- .../Kokkos_ViewAtomic.hpp} | 10 +- .../src/{impl => View}/Kokkos_ViewCtor.hpp | 87 +- .../Kokkos_ViewDataAnalysis.hpp | 15 +- .../core/src/View/Kokkos_ViewLegacy.hpp | 1604 ++++++++++++ .../src/{impl => View}/Kokkos_ViewMapping.hpp | 453 ++-- .../src/{impl => View}/Kokkos_ViewTracker.hpp | 0 .../core/src/View/Kokkos_ViewTraits.hpp | 457 ++++ .../{impl => View}/Kokkos_ViewUniformType.hpp | 12 +- .../View/MDSpan/Kokkos_MDSpan_Accessor.hpp | 203 +- .../src/View/MDSpan/Kokkos_MDSpan_Layout.hpp | 119 +- .../core/src/decl/Kokkos_Declare_CUDA.hpp | 2 + .../core/src/decl/Kokkos_Declare_SYCL.hpp | 10 + lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp | 2 - .../src/impl/KokkosExp_Host_IterateTile.hpp | 71 +- .../src/impl/KokkosExp_IterateTileGPU.hpp | 8 +- .../core/src/impl/Kokkos_AnalyzePolicy.hpp | 2 +- lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp | 14 +- lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp | 11 +- .../core/src/impl/Kokkos_Combined_Reducer.hpp | 26 +- .../core/src/impl/Kokkos_ConcurrentBitset.hpp | 30 +- lib/kokkos/core/src/impl/Kokkos_Core.cpp | 25 +- .../impl/Kokkos_Default_GraphNodeKernel.hpp | 39 +- 
.../impl/Kokkos_Default_GraphNode_Impl.hpp | 30 +- .../src/impl/Kokkos_Default_Graph_Impl.hpp | 37 +- lib/kokkos/core/src/impl/Kokkos_EBO.hpp | 20 +- .../core/src/impl/Kokkos_ExecPolicy.cpp | 2 +- .../core/src/impl/Kokkos_ExecSpaceManager.hpp | 8 +- .../src/impl/Kokkos_FixedBufferMemoryPool.hpp | 279 -- .../core/src/impl/Kokkos_FunctorAnalysis.hpp | 62 +- lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp | 6 +- .../src/impl/Kokkos_GraphImpl_Utilities.hpp | 6 +- .../impl/Kokkos_GraphNodeCustomization.hpp | 2 +- .../core/src/impl/Kokkos_GraphNodeImpl.hpp | 43 +- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 26 +- .../core/src/impl/Kokkos_HostBarrier.hpp | 6 +- lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp | 8 +- .../src/impl/Kokkos_HostSpace_ZeroMemset.hpp | 9 +- .../src/impl/Kokkos_HostSpace_deepcopy.cpp | 23 +- .../src/impl/Kokkos_HostSpace_deepcopy.hpp | 6 +- .../core/src/impl/Kokkos_HostThreadTeam.cpp | 6 +- .../core/src/impl/Kokkos_HostThreadTeam.hpp | 158 +- lib/kokkos/core/src/impl/Kokkos_LIFO.hpp | 8 +- .../core/src/impl/Kokkos_LinkedListNode.hpp | 2 +- .../src/impl/Kokkos_MemoryPoolAllocator.hpp | 103 - .../src/impl/Kokkos_MultipleTaskQueue.hpp | 57 +- lib/kokkos/core/src/impl/Kokkos_Profiling.cpp | 2 - lib/kokkos/core/src/impl/Kokkos_Profiling.hpp | 30 +- .../src/impl/Kokkos_Profiling_C_Interface.h | 2 +- .../src/impl/Kokkos_Profiling_Interface.hpp | 9 + .../core/src/impl/Kokkos_SharedAlloc.cpp | 29 +- .../core/src/impl/Kokkos_SharedAlloc.hpp | 18 +- .../src/impl/Kokkos_SimpleTaskScheduler.hpp | 9 + .../core/src/impl/Kokkos_SingleTaskQueue.hpp | 12 +- .../core/src/impl/Kokkos_Stacktrace.cpp | 8 +- .../src/impl/Kokkos_StringManipulation.hpp | 10 +- lib/kokkos/core/src/impl/Kokkos_TaskBase.hpp | 28 +- lib/kokkos/core/src/impl/Kokkos_TaskNode.hpp | 22 +- .../core/src/impl/Kokkos_TaskPolicyData.hpp | 16 +- lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp | 22 +- .../core/src/impl/Kokkos_TaskQueueCommon.hpp | 12 +- .../impl/Kokkos_TaskQueueMemoryManager.hpp | 5 + 
.../src/impl/Kokkos_TaskQueueMultiple.hpp | 22 +- .../impl/Kokkos_TaskQueueMultiple_impl.hpp | 5 + .../core/src/impl/Kokkos_TaskQueue_impl.hpp | 5 + .../core/src/impl/Kokkos_TaskResult.hpp | 5 + .../core/src/impl/Kokkos_TaskTeamMember.hpp | 8 +- .../core/src/impl/Kokkos_Tools_Generic.hpp | 304 ++- lib/kokkos/core/src/impl/Kokkos_Traits.hpp | 5 +- .../core/src/impl/Kokkos_ZeroMemset_fwd.hpp | 2 +- lib/kokkos/core/src/impl/Kokkos_hwloc.cpp | 2 +- .../core/src/setup/Kokkos_Setup_Cuda.hpp | 8 + .../core/src/setup/Kokkos_Setup_HIP.hpp | 17 + .../core/src/setup/Kokkos_Setup_SYCL.hpp | 8 + .../core/src/traits/Kokkos_IndexTypeTrait.hpp | 6 +- .../traits/Kokkos_IterationPatternTrait.hpp | 2 +- .../traits/Kokkos_OccupancyControlTrait.hpp | 9 +- .../core/src/traits/Kokkos_WorkTagTrait.hpp | 4 +- lib/kokkos/core/unit_test/CMakeLists.txt | 1419 +++++------ .../core/unit_test/IncrementalTest.cpp.in | 2 - lib/kokkos/core/unit_test/Makefile | 21 +- lib/kokkos/core/unit_test/TestAbort.hpp | 11 +- lib/kokkos/core/unit_test/TestArray.cpp | 96 + lib/kokkos/core/unit_test/TestArrayOps.hpp | 5 + .../core/unit_test/TestAtomicOperations.hpp | 138 +- .../TestAtomicOperations_complexdouble.hpp | 2 +- .../TestAtomicOperations_complexfloat.hpp | 4 + .../unit_test/TestAtomicOperations_double.hpp | 4 + .../unit_test/TestAtomicOperations_float.hpp | 4 + .../unit_test/TestAtomicOperations_int.hpp | 4 + .../TestAtomicOperations_longint.hpp | 4 + .../TestAtomicOperations_longlongint.hpp | 4 + .../unit_test/TestAtomicOperations_shared.hpp | 4 + .../TestAtomicOperations_unsignedint.hpp | 4 + .../TestAtomicOperations_unsignedlongint.hpp | 4 + ...stAtomicOperations_unsignedlonglongint.hpp | 4 + lib/kokkos/core/unit_test/TestAtomicViews.hpp | 291 +-- lib/kokkos/core/unit_test/TestAtomics.hpp | 273 +- .../unit_test/TestBitManipulationBuiltins.hpp | 4 +- ...e_d.cpp => TestCStyleMemoryManagement.cpp} | 25 +- lib/kokkos/core/unit_test/TestCTestDevice.cpp | 76 +- lib/kokkos/core/unit_test/TestCXX11.hpp | 21 
+- .../core/unit_test/TestCompilerMacros.cpp | 6 +- lib/kokkos/core/unit_test/TestComplex.hpp | 48 +- lib/kokkos/core/unit_test/TestConcepts.hpp | 5 +- .../core/unit_test/TestDeepCopyAlignment.hpp | 6 +- .../core/unit_test/TestDetectionIdiom.cpp | 16 +- .../unit_test/TestExecSpacePartitioning.hpp | 10 +- .../unit_test/TestExecSpaceThreadSafety.hpp | 53 +- .../core/unit_test/TestExecutionSpace.hpp | 3 +- .../core/unit_test/TestFunctorAnalysis.hpp | 30 +- lib/kokkos/core/unit_test/TestGraph.hpp | 562 +++- .../core/unit_test/TestHalfConversion.hpp | 4 - .../core/unit_test/TestHalfOperators.hpp | 121 +- .../TestHostSharedPtrAccessOnDevice.hpp | 16 +- lib/kokkos/core/unit_test/TestInit.hpp | 3 - .../unit_test/TestInitializationSettings.cpp | 11 +- lib/kokkos/core/unit_test/TestInterOp.cpp | 75 +- .../core/unit_test/TestIrregularLayout.hpp | 4 +- .../core/unit_test/TestLocalDeepCopy.hpp | 2 - lib/kokkos/core/unit_test/TestMDRange.hpp | 36 +- .../TestMDRangePolicyConstructors.hpp | 59 + .../core/unit_test/TestMDRangeReduce.hpp | 2 - lib/kokkos/core/unit_test/TestMDRange_g.hpp | 2 +- .../core/unit_test/TestMDSpanConversion.hpp | 51 + .../unit_test/TestMathematicalFunctions.hpp | 213 +- .../TestMathematicalSpecialFunctions.hpp | 40 +- lib/kokkos/core/unit_test/TestMemoryPool.hpp | 5 +- .../core/unit_test/TestNestedReducerCTAD.cpp | 8 +- .../core/unit_test/TestNumericTraits.hpp | 26 +- .../TestParseCmdLineArgsAndEnvVars.cpp | 5 +- lib/kokkos/core/unit_test/TestRange.hpp | 25 - .../unit_test/TestRangePolicyConstructors.hpp | 105 +- .../core/unit_test/TestRangePolicyRequire.hpp | 25 - lib/kokkos/core/unit_test/TestReduce.hpp | 49 +- .../unit_test/TestReduceCombinatorical.hpp | 26 +- lib/kokkos/core/unit_test/TestReducers.hpp | 267 +- lib/kokkos/core/unit_test/TestSharedAlloc.hpp | 4 +- lib/kokkos/core/unit_test/TestSharedSpace.cpp | 8 +- .../TestSpaceAwareAccessorAccessViolation.hpp | 2 +- lib/kokkos/core/unit_test/TestStackTrace.hpp | 2 + 
.../core/unit_test/TestTaskScheduler.hpp | 53 +- lib/kokkos/core/unit_test/TestTeam.hpp | 315 ++- lib/kokkos/core/unit_test/TestTeamBasic.hpp | 2 +- .../unit_test/TestTeamCombinedReducers.hpp | 6 - lib/kokkos/core/unit_test/TestTeamMDRange.hpp | 10 +- .../unit_test/TestTeamMDRangePolicyCTAD.cpp | 4 +- .../core/unit_test/TestTeamReductionScan.hpp | 109 +- lib/kokkos/core/unit_test/TestTeamScan.hpp | 21 +- lib/kokkos/core/unit_test/TestTeamScratch.hpp | 2 - lib/kokkos/core/unit_test/TestTeamVector.hpp | 22 +- lib/kokkos/core/unit_test/TestTypeInfo.cpp | 74 + lib/kokkos/core/unit_test/TestTypeList.cpp | 8 +- lib/kokkos/core/unit_test/TestUtilities.hpp | 12 +- lib/kokkos/core/unit_test/TestViewAPI.hpp | 34 +- lib/kokkos/core/unit_test/TestViewAPI_b.hpp | 30 + lib/kokkos/core/unit_test/TestViewAPI_e.hpp | 23 +- .../core/unit_test/TestViewBadAlloc.hpp | 12 +- lib/kokkos/core/unit_test/TestViewCopy_b.hpp | 4 +- .../core/unit_test/TestViewCtorDimMatch.hpp | 46 +- .../core/unit_test/TestViewCtorProp.hpp | 95 + .../unit_test/TestViewCtorPropEmbeddedDim.hpp | 16 +- .../core/unit_test/TestViewIsAssignable.hpp | 40 +- .../core/unit_test/TestViewMapping_a.hpp | 213 +- .../unit_test/TestViewMapping_subview.hpp | 8 +- .../TestViewMemoryAccessViolation.hpp | 2 +- .../unit_test/TestViewOutOfBoundsAccess.hpp | 2 +- lib/kokkos/core/unit_test/TestViewRank.cpp | 4 +- lib/kokkos/core/unit_test/TestViewSubview.hpp | 174 +- .../core/unit_test/TestViewTypedefs.cpp | 274 ++ lib/kokkos/core/unit_test/TestView_64bit.hpp | 2 - .../unit_test/TestWithoutInitializing.hpp | 50 +- .../UnitTest_CMakePassCmdLineArgs.cpp | 9 +- .../UnitTest_CMakeTriBITSCompatibility.cpp | 33 + .../TestCudaHostPinned_Category.hpp | 4 +- .../TestSYCLHostUSM_Category.hpp | 2 +- .../TestSYCLSharedUSM_Category.hpp | 2 +- .../category_files/TestSYCL_Category.hpp | 2 +- .../unit_test/cuda/TestCuda_InterOp_Graph.cpp | 151 ++ .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 2 +- .../default/TestDefaultDeviceTypeViewAPI.cpp | 32 
+- lib/kokkos/core/unit_test/diffconfig.sh | 18 - .../headers_self_contained/CMakeLists.txt | 14 +- .../unit_test/hip/TestHIP_InterOp_Graph.cpp | 127 + .../core/unit_test/hip/TestHIP_Spaces.cpp | 6 +- .../hip/TestHIP_UnifiedMemory_ZeroMemset.cpp | 44 + .../incremental/Test01_execspace.hpp | 4 + .../Test04_ParallelFor_RangePolicy.hpp | 2 +- .../Test05_ParallelReduce_RangePolicy.hpp | 8 +- .../incremental/Test10_HierarchicalBasics.hpp | 4 +- .../Test11a_ParallelFor_TeamThreadRange.hpp | 2 +- .../Test11b_ParallelFor_TeamVectorRange.hpp | 2 +- .../Test11c_ParallelFor_ThreadVectorRange.hpp | 2 +- .../incremental/Test12a_ThreadScratch.hpp | 10 +- .../incremental/Test12b_TeamScratch.hpp | 7 +- .../Test13c_ParallelRed_ThreadVectorRange.hpp | 2 +- .../incremental/Test16_ParallelScan.hpp | 6 +- .../unit_test/sycl/TestSYCL_InterOp_Graph.cpp | 114 + .../unit_test/sycl/TestSYCL_InterOp_Init.cpp | 2 +- .../sycl/TestSYCL_InterOp_Init_Context.cpp | 11 +- .../sycl/TestSYCL_InterOp_Streams.cpp | 2 +- .../core/unit_test/sycl/TestSYCL_Spaces.cpp | 247 +- .../sycl/TestSYCL_TeamScratchStreams.cpp | 34 +- lib/kokkos/core/unit_test/testmake.sh | 18 - .../unit_test/tools/TestEventCorrectness.hpp | 41 +- .../core/unit_test/tools/TestKernelNames.cpp | 219 ++ .../unit_test/tools/TestProfilingSection.cpp | 12 +- .../core/unit_test/tools/TestScopedRegion.cpp | 12 +- .../core/unit_test/tools/TestTuning.cpp | 14 +- .../tools/include/ToolTestingUtilities.hpp | 144 +- .../core/unit_test/view/TestBasicView.hpp | 264 ++ .../view/TestBasicViewMDSpanConversion.cpp | 95 + .../view/TestExtentsDatatypeConversion.cpp | 6 +- .../view/TestReferenceCountedAccessor.hpp | 156 ++ .../view/TestReferenceCountedDataHandle.hpp | 208 ++ lib/kokkos/example/CMakeLists.txt | 11 +- .../example/query_device/CMakeLists.txt | 15 +- .../example/query_device/query_device.cpp | 2 +- .../relocatable_function/CMakeLists.txt | 6 + .../example/relocatable_function/Makefile | 33 + .../relocatable_function/functor.cpp} | 6 +- 
.../example/relocatable_function/main.cpp | 50 + .../tutorial/01_hello_world/CMakeLists.txt | 11 +- .../01_hello_world_lambda/CMakeLists.txt | 11 +- .../hello_world_lambda.cpp | 5 +- .../tutorial/02_simple_reduce/CMakeLists.txt | 10 +- .../02_simple_reduce_lambda/CMakeLists.txt | 11 +- .../simple_reduce_lambda.cpp | 14 +- .../tutorial/03_simple_view/CMakeLists.txt | 10 +- .../tutorial/03_simple_view/simple_view.cpp | 2 +- .../03_simple_view_lambda/CMakeLists.txt | 10 +- .../simple_view_lambda.cpp | 26 +- .../04_simple_memoryspaces/CMakeLists.txt | 10 +- .../simple_memoryspaces.cpp | 2 +- .../tutorial/05_simple_atomics/CMakeLists.txt | 11 +- .../06_simple_mdrangepolicy/CMakeLists.txt | 10 +- .../01_data_layouts/CMakeLists.txt | 10 +- .../02_memory_traits/CMakeLists.txt | 10 +- .../Advanced_Views/03_subviews/CMakeLists.txt | 10 +- .../04_dualviews/CMakeLists.txt | 10 +- .../Advanced_Views/04_dualviews/dual_view.cpp | 6 +- .../05_NVIDIA_UVM/CMakeLists.txt | 16 +- .../tutorial/Advanced_Views/CMakeLists.txt | 15 +- .../01_random_numbers/CMakeLists.txt | 5 + .../tutorial/Algorithms/CMakeLists.txt | 1 + lib/kokkos/example/tutorial/CMakeLists.txt | 26 +- .../01_thread_teams/CMakeLists.txt | 10 +- .../01_thread_teams_lambda/CMakeLists.txt | 11 +- .../thread_teams_lambda.cpp | 5 +- .../02_nested_parallel_for/CMakeLists.txt | 10 +- .../03_vectorization/CMakeLists.txt | 11 +- .../04_team_scan/CMakeLists.txt | 11 +- .../Hierarchical_Parallelism/CMakeLists.txt | 10 +- .../tutorial/launch_bounds/CMakeLists.txt | 10 +- .../launch_bounds/launch_bounds_reduce.cpp | 5 +- lib/kokkos/master_history.txt | 1 + lib/kokkos/simd/CMakeLists.txt | 8 +- lib/kokkos/simd/src/CMakeLists.txt | 25 +- lib/kokkos/simd/src/Kokkos_SIMD.hpp | 2 +- lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp | 72 +- lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp | 98 +- lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp | 80 +- lib/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp | 2 +- lib/kokkos/simd/unit_tests/CMakeLists.txt | 12 +- 
.../unit_tests/include/SIMDTesting_Ops.hpp | 4 + .../unit_tests/include/TestSIMD_MathOps.hpp | 15 +- .../include/TestSIMD_Reductions.hpp | 7 + .../include/TestSIMD_WhereExpressions.hpp | 8 +- lib/kokkos/tpls/.clang-format | 1 - .../include/desul/atomics/Atomic_Ref.hpp | 16 + .../desul/atomics/Compare_Exchange_SYCL.hpp | 8 + .../atomics/Lock_Based_Fetch_Op_SYCL.hpp | 8 + .../experimental/__p0009_bits/layout_left.hpp | 3 + .../__p0009_bits/layout_right.hpp | 3 + .../__p0009_bits/layout_stride.hpp | 36 +- .../experimental/__p0009_bits/utility.hpp | 100 + .../__p2630_bits/submdspan_extents.hpp | 119 +- .../__p2630_bits/submdspan_mapping.hpp | 58 +- .../__p2642_bits/layout_padded.hpp | 26 +- .../__p2642_bits/layout_padded_fwd.hpp | 6 + 617 files changed, 21611 insertions(+), 17367 deletions(-) delete mode 100644 lib/kokkos/HOW_TO_SNAPSHOT delete mode 100644 lib/kokkos/appveyor.yml delete mode 100644 lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in rename lib/kokkos/{core/unit_test/sycl/TestSYCL_Task.cpp => cmake/compile_tests/amd_apu.cc} (57%) create mode 100644 lib/kokkos/cmake/kokkos_configure_trilinos.cmake create mode 100644 lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp create mode 100644 lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp create mode 100644 lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp delete mode 100644 lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp create mode 100644 lib/kokkos/core/src/Kokkos_TypeInfo.hpp create mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp create mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp create mode 100644 lib/kokkos/core/src/View/Kokkos_BasicView.hpp rename 
lib/kokkos/core/src/{impl/Kokkos_Atomic_View.hpp => View/Kokkos_ViewAtomic.hpp} (96%) rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewCtor.hpp (84%) rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewDataAnalysis.hpp (96%) create mode 100644 lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewMapping.hpp (90%) rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewTracker.hpp (100%) create mode 100644 lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewUniformType.hpp (88%) delete mode 100644 lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp rename lib/kokkos/core/unit_test/{default/TestDefaultDeviceType_d.cpp => TestCStyleMemoryManagement.cpp} (73%) create mode 100644 lib/kokkos/core/unit_test/TestTypeInfo.cpp create mode 100644 lib/kokkos/core/unit_test/TestViewCtorProp.hpp create mode 100644 lib/kokkos/core/unit_test/TestViewTypedefs.cpp create mode 100644 lib/kokkos/core/unit_test/UnitTest_CMakeTriBITSCompatibility.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Graph.cpp delete mode 100755 lib/kokkos/core/unit_test/diffconfig.sh create mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Graph.cpp create mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_UnifiedMemory_ZeroMemset.cpp create mode 100644 lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Graph.cpp delete mode 100755 lib/kokkos/core/unit_test/testmake.sh create mode 100644 lib/kokkos/core/unit_test/tools/TestKernelNames.cpp create mode 100644 lib/kokkos/core/unit_test/view/TestBasicView.hpp create mode 100644 lib/kokkos/core/unit_test/view/TestBasicViewMDSpanConversion.cpp create mode 100644 lib/kokkos/core/unit_test/view/TestReferenceCountedAccessor.hpp create mode 100644 lib/kokkos/core/unit_test/view/TestReferenceCountedDataHandle.hpp create mode 100644 
lib/kokkos/example/relocatable_function/CMakeLists.txt create mode 100644 lib/kokkos/example/relocatable_function/Makefile rename lib/kokkos/{core/src/impl/KokkosExp_ViewMapping.hpp => example/relocatable_function/functor.cpp} (81%) create mode 100644 lib/kokkos/example/relocatable_function/main.cpp create mode 100644 lib/kokkos/example/tutorial/Algorithms/01_random_numbers/CMakeLists.txt create mode 100644 lib/kokkos/example/tutorial/Algorithms/CMakeLists.txt diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 7b1d69e5663..6c237ebca86 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,7 +1,101 @@ # CHANGELOG +## 4.5.00 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.01...4.5.00) + +### Features + +* SYCL backend graduated to production ready +* Introduce new `SequentialHostInit` view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) (backported in 4.4.01) +* Support building with Run-Time Type Information (RTTI) disabled +* Add new `KOKKOS_RELOCATABLE_FUNCTION` function annotation macro [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +### Backend and Architecture Enhancements + +#### CUDA + +* Adding occupancy tuning for CUDA architectures [\#6788](https://github.com/kokkos/kokkos/pull/6788) +* By default disable `cudaMallocAsync` (i.e., revert the change made in version 4.2) [\#7353](https://github.com/kokkos/kokkos/pull/7353) + +#### HIP + +* Add support for AMD Phoenix APUs with Radeon 740M/760M/780M/880M/890M [\#7162](https://github.com/kokkos/kokkos/pull/7162) +* Update maximum waves per CU values for consumer card [\#7347](https://github.com/kokkos/kokkos/pull/7347) +* Check that Kokkos is running on the architecture it was compiled for [\#7379](https://github.com/kokkos/kokkos/pull/7379) +* Add opt-in option to use `hipMallocAsync` instead of `hipMalloc` [\#7324](https://github.com/kokkos/kokkos/pull/7324) +* Introduce new architecture option `AMD_GFX942_APU` for MI300A 
[\#7462](https://github.com/kokkos/kokkos/pull/7462) + +#### SYCL + +* Move the `SYCL` backend out of the `Experimental` namespace [\#7171](https://github.com/kokkos/kokkos/pull/7171) +* Introduce `KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE` as CMake option [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +#### OpenACC + +* Add support for building with the Clacc compiler [\#7198](https://github.com/kokkos/kokkos/pull/7198) +* Workaround NVHPC collapse clause bug for `MDRangePolicy` [\#7425](https://github.com/kokkos/kokkos/pull/7425) + +#### HPX + +* Implement `Experimental::partition_space` to produce truly independent execution spaces [\#7287](https://github.com/kokkos/kokkos/pull/7287) + +#### Threads + +* Fix compilation for `parallel_reduce` `MDRange` with `Dynamic` scheduling [\#7478](https://github.com/kokkos/kokkos/pull/7478) +* Fix race conditions on ARM architectures [\#7498](https://github.com/kokkos/kokkos/pull/7498) + +#### OpenMP + +* Fix run time behavior when compiling with `-fvisibility-hidden` [\#7284](https://github.com/kokkos/kokkos/pull/7284) (backported in 4.4.01) +* Fix linking with Cray Clang compiler [\#7341](https://github.com/kokkos/kokkos/pull/7341) + +#### Serial + +* Allow `Kokkos_ENABLE_ATOMICS_BYPASS` to skip mutexes to remediate performance regression in 4.4 [\#7369](https://github.com/kokkos/kokkos/pull/7369) + +### General Enhancements + +* Improve `View` initialization/destruction for non-scalar trivial and trivially-destructible types [\#7219](https://github.com/kokkos/kokkos/pull/7219) [\#7225](https://github.com/kokkos/kokkos/pull/7225) +* Add getters for default tile sizes used in `MDRangePolicy` [\#6839](https://github.com/kokkos/kokkos/pull/6839) +* Improve performance of `Kokkos::sort` when `std::sort` is used [\#7264](https://github.com/kokkos/kokkos/pull/7264) +* Add range-based for loop support for `Array` [\#7293](https://github.com/kokkos/kokkos/pull/7293) +* Allow functors as reducers for nested team parallel 
reduce [\#6921](https://github.com/kokkos/kokkos/pull/6921) +* Avoid making copies of string rvalue reference arguments to `view_alloc()` [\#7364](https://github.com/kokkos/kokkos/pull/7364) +* Add `atomic_{mod,xor,nand,lshift,rshift}` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Allow using `SequentialHostInit` with `Kokkos::DualView` [\#7456](https://github.com/kokkos/kokkos/pull/7456) +* Add `Graph::instantiate()` [\#7240](https://github.com/kokkos/kokkos/pull/7240) +* Allow an arbitrary execution space instance to be used in `Kokkos::Graph::submit()` [\#7249](https://github.com/kokkos/kokkos/pull/7249) +* Enable compile-time diagnostic of illegal reduction target for graphs [\#7460](https://github.com/kokkos/kokkos/pull/7460) + +### Build System Changes + +* Make sure backend-specific options such as `IMPL_CUDA_MALLOC_ASYNC` only show when that backend is actually enabled [\#7228](https://github.com/kokkos/kokkos/pull/7228) +* Major refactoring removing `TriBITS` paths [\#6164](https://github.com/kokkos/kokkos/pull/6164) +* Add support for SpacemiT K60 (RISC-V) [\#7160](https://github.com/kokkos/kokkos/pull/7160) + +### Deprecations + +* Deprecate Tasking interface [\#7393](https://github.com/kokkos/kokkos/pull/7393) +* Deprecate `atomic_query_version`, `atomic_assign`, `atomic_compare_exchange_strong`, `atomic_{inc, dec}rement` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Deprecate `{OpenMP,HPX}::is_asynchronous()` [\#7322](https://github.com/kokkos/kokkos/pull/7322) + +### Bug Fixes + +* Fix undefined behavior in `BinSort` when sorting within bins on host [\#7223](https://github.com/kokkos/kokkos/pull/7223) +* Using CUDA limits to set extents for blocks, grids [\#7235](https://github.com/kokkos/kokkos/pull/7235) +* Fix `deep_copy (serial_exec, dst, src)` with multiple host backends [\#7245](https://github.com/kokkos/kokkos/pull/7245) +* Skip `RangePolicy` bounds conversion checks if roundtrip convertibility is not provided 
[\#7172](https://github.com/kokkos/kokkos/pull/7172) +* Allow extracting host and device views from `DualView` with `const` value type [\#7242](https://github.com/kokkos/kokkos/pull/7242) +* Fix `TeamPolicy` array reduction for CUDA and HIP [\#6296](https://github.com/kokkos/kokkos/pull/6296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix configuring without architecture flags for SYCL [\#7303](https://github.com/kokkos/kokkos/pull/7303) +* Set an initial value index during join of `MinLoc`, `MaxLoc` or `MinMaxLoc` [\#7330](https://github.com/kokkos/kokkos/pull/7330) +* Fix storage lifetime of driver for global launch of graph nodes for CUDA and HIP [\#7365](https://github.com/kokkos/kokkos/pull/7365) +* Make `value_type` for `RandomAccessIterator` non-`const` [\#7485](https://github.com/kokkos/kokkos/pull/7485) + ## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01) -[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.00...4.4.01) ### Features: * Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) @@ -13,7 +107,7 @@ ### Bug Fixes * OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284) -* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) ## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00) @@ -57,6 +151,7 @@ * SIMD: Allow flexible vector width for 32 bit types [\#6802](https://github.com/kokkos/kokkos/pull/6802) * Updates for 
`Kokkos::Array`: add `kokkos_swap(Array)` specialization [\#6943](https://github.com/kokkos/kokkos/pull/6943), add `Kokkos::to_array` [\#6375](https://github.com/kokkos/kokkos/pull/6375), make `Kokkos::Array` equality-comparable [\#7148](https://github.com/kokkos/kokkos/pull/7148) * Structured binding support for `Kokkos::complex` [\#7040](https://github.com/kokkos/kokkos/pull/7040) +* Introduce `KOKKOS_DEDUCTION_GUIDE` macro to allow for portable user-defined deduction guides [\#6954](https://github.com/kokkos/kokkos/pull/6954) ### Build System Changes * Do not require OpenMP support for languages other than CXX [\#6965](https://github.com/kokkos/kokkos/pull/6965) @@ -1388,7 +1483,7 @@ **Closed issues:** - Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097) -- Remove KOKKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) +- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) - Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083) - In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081) - Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 736cbac218c..f0bf8e3634a 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -1,12 +1,11 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) # Disable in-source builds to prevent source tree corruption. -if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) - message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." 
) -endif() - -if (COMMAND TRIBITS_PACKAGE) - TRIBITS_PACKAGE(Kokkos) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + message( + FATAL_ERROR + "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." + ) endif() # We want to determine if options are given with the wrong case @@ -15,143 +14,142 @@ endif() # form a list of all the given variables. If it begins with any # case of KoKkOS, we add it to the list. -GET_CMAKE_PROPERTY(_variableNames VARIABLES) -SET(KOKKOS_GIVEN_VARIABLES) -FOREACH (var ${_variableNames}) - STRING(TOUPPER ${var} UC_VAR) - STRING(FIND ${UC_VAR} KOKKOS IDX) - IF (${IDX} EQUAL 0) - LIST(APPEND KOKKOS_GIVEN_VARIABLES ${var}) - ENDIF() -ENDFOREACH() +get_cmake_property(_variableNames VARIABLES) +set(KOKKOS_GIVEN_VARIABLES) +foreach(var ${_variableNames}) + string(TOUPPER ${var} UC_VAR) + string(FIND ${UC_VAR} KOKKOS IDX) + if(${IDX} EQUAL 0) + list(APPEND KOKKOS_GIVEN_VARIABLES ${var}) + endif() +endforeach() # Basic initialization (Used in KOKKOS_SETTINGS) -SET(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) - -# Is this a build as part of Trilinos? 
-IF(COMMAND TRIBITS_PACKAGE_DECL) - SET(KOKKOS_HAS_TRILINOS ON) -ELSE() - SET(KOKKOS_HAS_TRILINOS OFF) - SET(PACKAGE_NAME Kokkos) - SET(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -ENDIF() -# Is this build a subdirectory of another project -GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY) +set(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(PACKAGE_NAME Kokkos) +set(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) +# Is this build a subdirectory of another project +get_directory_property(HAS_PARENT PARENT_DIRECTORY) -SET(KOKKOS_ENABLED_OPTIONS) #exported in config file -SET(KOKKOS_ENABLED_DEVICES) #exported in config file -SET(KOKKOS_ENABLED_TPLS) #exported in config file -SET(KOKKOS_ENABLED_ARCH_LIST) #exported in config file +include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) + +set(KOKKOS_ENABLED_OPTIONS) #exported in config file +set(KOKKOS_ENABLED_DEVICES) #exported in config file +set(KOKKOS_ENABLED_TPLS) #exported in config file +set(KOKKOS_ENABLED_ARCH_LIST) #exported in config file #These are helper flags used for sanity checks during config #Certain features should depend on other features being configured first -SET(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies -SET(KOKKOS_CFG_DAG_DEVICES_DONE Off) -SET(KOKKOS_CFG_DAG_OPTIONS_DONE Off) -SET(KOKKOS_CFG_DAG_ARCH_DONE Off) -SET(KOKKOS_CFG_DAG_CXX_STD_DONE Off) -SET(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) -FUNCTION(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) - SET(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) - SET(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) - IF (NOT ${PRE_FLAG}) - MESSAGE(FATAL_ERROR "Bad CMake refactor: feature 
${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured") - ENDIF() - GLOBAL_SET(${POST_FLAG} On) -ENDFUNCTION() - - -LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) - -IF(NOT KOKKOS_HAS_TRILINOS) - set(CMAKE_DISABLE_SOURCE_CHANGES ON) - set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) - - # What language are we compiling Kokkos as - # downstream dependencies need to match this! - SET(KOKKOS_COMPILE_LANGUAGE CXX) - # use lower case here since we didn't parse options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) - - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as CUDA only - # because otherwise the C++ features don't work etc. - # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even - # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 - # days. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - - SET(KOKKOS_COMPILE_LANGUAGE CUDA) - ENDIF() - # use lower case here since we haven't parsed options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) - - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as HIP only - # because otherwise the C++ features don't work etc. 
- SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - - SET(KOKKOS_COMPILE_LANGUAGE HIP) - ENDIF() - - IF (Spack_WORKAROUND) - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - MESSAGE(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") - ENDIF() - - #if we are explicitly using Spack for development, - #nuke the Spack compiler - SET(SPACK_CXX $ENV{SPACK_CXX}) - IF(SPACK_CXX) - SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) - SET(ENV{CXX} ${SPACK_CXX}) - ENDIF() - ENDIF() - # Always call the project command to define Kokkos_ variables - # and to make sure that C++ is an enabled language - PROJECT(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) - IF(NOT HAS_PARENT) - IF (NOT CMAKE_BUILD_TYPE) - SET(DEFAULT_BUILD_TYPE "RelWithDebInfo") - MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") - SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING - "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." - FORCE) - ENDIF() - ENDIF() -ELSE() - SET(KOKKOS_COMPILE_LANGUAGE CXX) -ENDIF() - -IF (NOT CMAKE_SIZEOF_VOID_P) - STRING(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) - IF (NOT FIND_IDX STREQUAL -1) - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. 
Please consult the CMake error log shown below for the exact error during compiler validation") - ENDIF() -ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - IF(CMAKE_SIZEOF_VOID_P EQUAL 4) - MESSAGE(WARNING "32-bit builds are experimental and not officially supported.") - SET(KOKKOS_IMPL_32BIT ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;") - ENDIF() -ENDIF() +set(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies +set(KOKKOS_CFG_DAG_DEVICES_DONE Off) +set(KOKKOS_CFG_DAG_OPTIONS_DONE Off) +set(KOKKOS_CFG_DAG_ARCH_DONE Off) +set(KOKKOS_CFG_DAG_CXX_STD_DONE Off) +set(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) +function(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) + set(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) + set(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) + if(NOT ${PRE_FLAG}) + message( + FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured" + ) + endif() + global_set(${POST_FLAG} On) +endfunction() + +list(APPEND CMAKE_MODULE_PATH cmake/Modules) + +set(CMAKE_DISABLE_SOURCE_CHANGES ON) +set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) + +# What language are we compiling Kokkos as +# downstream dependencies need to match this! +set(KOKKOS_COMPILE_LANGUAGE CXX) +# use lower case here since we didn't parse options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) + + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as CUDA only + # because otherwise the C++ features don't work etc. + # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even + # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 + # days. 
+ set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + + set(KOKKOS_COMPILE_LANGUAGE CUDA) +endif() +# use lower case here since we haven't parsed options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) + + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as HIP only + # because otherwise the C++ features don't work etc. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + set(KOKKOS_COMPILE_LANGUAGE HIP) +endif() + +if(Spack_WORKAROUND) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + message(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") + endif() + + #if we are explicitly using Spack for development, + #nuke the Spack compiler + set(SPACK_CXX $ENV{SPACK_CXX}) + if(SPACK_CXX) + set(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) + set(ENV{CXX} ${SPACK_CXX}) + endif() +endif() +# Always call the project command to define Kokkos_ variables +# and to make sure that C++ is an enabled language +project(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) +if(NOT HAS_PARENT) + if(NOT CMAKE_BUILD_TYPE) + set(DEFAULT_BUILD_TYPE "RelWithDebInfo") + message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") + set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" + CACHE STRING "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." FORCE + ) + endif() +endif() + +if(NOT CMAKE_SIZEOF_VOID_P) + string(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) + if(NOT FIND_IDX STREQUAL -1) + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured." + ) + else() + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. 
The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation" + ) + endif() +elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + message(WARNING "32-bit builds are experimental and not officially supported.") + set(KOKKOS_IMPL_32BIT ON) + else() + message( + FATAL_ERROR + "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;" + ) + endif() +endif() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 5) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -164,58 +162,54 @@ math(EXPR KOKKOS_VERSION_PATCH "${KOKKOS_VERSION} % 100") # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) +include(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # If we are building CUDA, we have tricked CMake because we declare a CXX project # If the default C++ standard for a given compiler matches the requested # standard, then CMake just omits the -std flag in later versions of CMake # This breaks CUDA compilation (CUDA compiler can have a different default # -std then the underlying host compiler by itself). 
Setting this variable # forces CMake to always add the -std flag even if it thinks it doesn't need it - GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) -ENDIF() + global_set(CMAKE_CXX_STANDARD_DEFAULT 98) +endif() # These are the variables we will append to as we go # I really wish these were regular variables # but scoping issues can make it difficult -GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS) -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) -GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) -GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) +global_set(KOKKOS_COMPILE_OPTIONS) +global_set(KOKKOS_LINK_OPTIONS) +global_set(KOKKOS_AMDGPU_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDAFE_OPTIONS) +global_set(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos -GLOBAL_SET(KOKKOS_TPL_EXPORTS) +global_set(KOKKOS_TPL_EXPORTS) # KOKKOS_DEPENDENCE is used by kokkos_launch_compiler -GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +global_set(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) # MSVC never goes through kokkos_launch_compiler -IF(NOT MSVC) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) -ENDIF() +if(NOT MSVC) + global_append(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +endif() + +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/kokkos_configure_trilinos.cmake) -IF(Kokkos_ENABLE_TESTS AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_TESTS) find_package(GTest QUIET) -ENDIF() +endif() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS # These are functions like KOKKOS_INCLUDE_DIRECTORIES -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) - +include(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) # Check the environment and set certain variables # to allow platform-specific checks -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) -IF(NOT 
KOKKOS_HAS_TRILINOS) - # This does not work in Trilinos and we simply don't care - # to fix it for Trilinos - # Gather information about the runtime environment - INCLUDE(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) - check_git_setup() -ENDIF() +include(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) +check_git_setup() # The build environment setup goes in the following steps # 1) Check all the enable options. This includes checking Kokkos_DEVICES @@ -223,102 +217,54 @@ ENDIF() # 3) Check the CXX standard and select important CXX flags # 4) Check for any third-party libraries (TPLs) like hwloc # 5) Check if optimizing for a particular architecture and add arch-specific flags -KOKKOS_SETUP_BUILD_ENVIRONMENT() +kokkos_setup_build_environment() # Finish off the build # 6) Recurse into subdirectories and configure individual libraries # 7) Export and install targets -OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) -SET(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) -SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) +set(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) +set_property(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) -IF (KOKKOS_HAS_TRILINOS) - SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) - SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR}) - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSEIF(HAS_PARENT) - SET(KOKKOS_HEADER_DIR "include/kokkos") - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSE() - SET(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") - SET(KOKKOS_IS_SUBDIRECTORY FALSE) -ENDIF() +if(HAS_PARENT) + set(KOKKOS_HEADER_DIR "include/kokkos") + set(KOKKOS_IS_SUBDIRECTORY TRUE) +else() + set(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") + set(KOKKOS_IS_SUBDIRECTORY FALSE) +endif() #------------------------------------------------------------------------------ # 
# A) Forward declare the package so that certain options are also defined for # subpackages -## This restores the old behavior of ProjectCompilerPostConfig.cmake -# We must do this before KOKKOS_PACKAGE_DECL -IF (KOKKOS_HAS_TRILINOS) - # Overwrite the old flags at the top-level - # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags - # we have to match the annoying behavior, also we have to preserve quotes - # which needs another workaround. - SET(KOKKOS_COMPILE_OPTIONS_TMP) - IF (KOKKOS_ENABLE_HIP) - LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS}) - ENDIF() - FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) - STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) - IF(OPTION_HAS_WHITESPACE EQUAL -1) - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}") - ELSE() - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"") - ENDIF() - ENDFOREACH() - STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) - IF (KOKKOS_ENABLE_CUDA) - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) - ENDIF() - FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS}) - SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) - ENDFOREACH() - IF (KOKKOS_ENABLE_CUDA) - STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") - FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) - SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) - ENDFOREACH() - ENDIF() - #These flags get set up in KOKKOS_PACKAGE_DECL, which means they - #must be configured before KOKKOS_PACKAGE_DECL - SET(KOKKOS_ALL_COMPILE_OPTIONS - $<$:${KOKKOS_ALL_COMPILE_OPTIONS}>) -ENDIF() - - #------------------------------------------------------------------------------ # # D) Process the 
subpackages (subdirectories) for Kokkos # -KOKKOS_PROCESS_SUBPACKAGES() - +kokkos_process_subpackages() #------------------------------------------------------------------------------ # # E) If Kokkos itself is enabled, process the Kokkos package # -KOKKOS_PACKAGE_POSTPROCESS() -KOKKOS_CONFIGURE_CORE() +kokkos_configure_core() -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - ADD_LIBRARY(kokkos INTERFACE) +if(NOT Kokkos_INSTALL_TESTING) + add_library(kokkos INTERFACE) #Make sure in-tree projects can reference this as Kokkos:: #to match the installed target names - ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos) + add_library(Kokkos::kokkos ALIAS kokkos) # all_libs target is required for TriBITS-compliance - ADD_LIBRARY(Kokkos::all_libs ALIAS kokkos) - TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos) -ENDIF() -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) + add_library(Kokkos::all_libs ALIAS kokkos) + target_link_libraries(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) + kokkos_internal_add_library_install(kokkos) +endif() +include(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler. # Kokkos needs nvcc_wrapper in order to build. Other libraries and @@ -327,16 +273,15 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # as relative to ${CMAKE_INSTALL_PATH}. 
# KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated # files -KOKKOS_INSTALL_ADDITIONAL_FILES() - +kokkos_install_additional_files() # Finally - if we are a subproject - make sure the enabled devices are visible -IF (HAS_PARENT) - FOREACH(DEV Kokkos_ENABLED_DEVICES) +if(HAS_PARENT) + foreach(DEV Kokkos_ENABLED_DEVICES) #I would much rather not make these cache variables or global properties, but I can't #make any guarantees on whether PARENT_SCOPE is good enough to make #these variables visible where I need them - SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) - SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) - ENDFOREACH() -ENDIF() + set(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) + set_property(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) + endforeach() +endif() diff --git a/lib/kokkos/CONTRIBUTING.md b/lib/kokkos/CONTRIBUTING.md index b4f3057cef2..e97f8c4d89c 100644 --- a/lib/kokkos/CONTRIBUTING.md +++ b/lib/kokkos/CONTRIBUTING.md @@ -7,6 +7,8 @@ We actively welcome pull requests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. +Before sending your patch for review, please try to ensure that it is formatted properly. We use clang-format version 16 for this. + ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. diff --git a/lib/kokkos/HOW_TO_SNAPSHOT b/lib/kokkos/HOW_TO_SNAPSHOT deleted file mode 100644 index ad3f78efb4f..00000000000 --- a/lib/kokkos/HOW_TO_SNAPSHOT +++ /dev/null @@ -1,73 +0,0 @@ - -Developers of Kokkos (those who commit modifications to Kokkos) -must maintain the snapshot of Kokkos in the Trilinos repository. - -This file contains instructions for how to -snapshot Kokkos from github.com/kokkos to Trilinos. - ------------------------------------------------------------------------- -*** EVERYTHING GOES RIGHT WORKFLOW *** - -1) Given a 'git clone' of Kokkos and of Trilinos repositories. 
-1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone. - This path *must* terminate with the directory name 'kokkos'; - e.g., ${HOME}/kokkos . -1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory. - -2) Given that the Kokkos build & test is clean and - changes are committed to the Kokkos clone. - -3) Snapshot the current commit in the Kokkos clone into the Trilinos clone. - This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}: - ${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages - -4) Verify the snapshot commit happened as expected - cd ${TRILINOS}/packages/kokkos - git log -1 --name-only - -5) Modify, build, and test Trilinos with the Kokkos snapshot. - -6) Given that that the Trilinos build & test is clean and - changes are committed to the Trilinos clone. - -7) Attempt push to the Kokkos repository. - If push fails then you must 'remove the Kokkos snapshot' - from your Trilinos clone. - See below. - -8) Attempt to push to the Trilinos repository. - If updating for a failed push requires you to change Kokkos you must - 'remove the Kokkos snapshot' from your Trilinos clone. - See below. - ------------------------------------------------------------------------- -*** WHEN SOMETHING GOES WRONG AND YOU MUST *** -*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE *** - -1) Query the Trilinos clone commit log. - git log --oneline - -2) Note the of the commit to the Trillinos clone - immediately BEFORE the Kokkos snapshot commit. - Copy this for use in the next command. - -3) IF more than one outstanding commit then you can remove just the - Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file. - Remove or comment out the Kokkos snapshot commit entry. - git rebase -i - -4) IF the Kokkos snapshot commit is the one and only - outstanding commit then remove just than commit. 
- git reset --hard HEAD~1 - ------------------------------------------------------------------------- -*** REGARDING 'snapshot.py' TOOL *** - -The 'snapshot.py' tool is developed and maintained by the -Center for Computing Research (CCR) -Software Engineering, Maintenance, and Support (SEMS) team. - -Contact Brent Perschbacher for questions> - ------------------------------------------------------------------------- - diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 6b627dcc369..75dcbb95364 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,8 +11,8 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 5 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -40,16 +40,19 @@ KOKKOS_TRIBITS ?= "no" KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. -# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async -KOKKOS_CUDA_OPTIONS ?= "disable_malloc_async" +# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,enable_malloc_async +KOKKOS_CUDA_OPTIONS ?= "" -# Options: rdc +# Options: rdc,enable_malloc_async KOKKOS_HIP_OPTIONS ?= "" # Default settings specific options. 
# Options: enable_async_dispatch KOKKOS_HPX_OPTIONS ?= "" +#Options : force_host_as_device +KOKKOS_OPENACC_OPTIONS ?= "" + # Helper functions for conversion to upper case uppercase_TABLE:=a,A b,B c,C d,D e,E f,F g,G h,H i,I j,J k,K l,L m,M n,N o,O p,P q,Q r,R s,S t,T u,U v,V w,W x,X y,Y z,Z uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(wordlist 2,$(words $1),$1),$2)),$2) @@ -92,7 +95,7 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS), KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) -KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),disable_malloc_async) +KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_malloc_async) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) # deprecated KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) @@ -103,6 +106,8 @@ KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPT KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings) KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) +KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),enable_malloc_async) +KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE := $(call kokkos_has_string,$(KOKKOS_OPENACC_OPTIONS),force_host_as_device) # Check for Kokkos Host Execution Spaces one of which must be on. 
KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP) @@ -178,7 +183,7 @@ KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_NVHPC := $(strip $(shell $(CXX) --version 2>&1 | grep -c "nvc++")) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++")) +KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -v "error:" | grep -c "clang++")) KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI) KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) @@ -292,6 +297,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) # Set OpenACC flags. 
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) KOKKOS_INTERNAL_OPENACC_FLAG := -acc + else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_OPENACC_FLAG := -fopenacc -fopenacc-fake-async-wait -fopenacc-implicit-worker=vector -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version -Wno-pass-failed else $(error Makefile.kokkos: OpenACC is enabled but the compiler must be NVHPC (got version string $(KOKKOS_CXX_VERSION))) endif @@ -411,8 +418,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH) @@ -466,6 +473,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1103) +KOKKOS_INTERNAL_USE_ARCH_AMD := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103)) # Any AVX? 
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) @@ -561,6 +576,9 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE") + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -733,7 +751,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) + ifeq ($(KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC */") @@ -1024,86 +1042,122 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--offload-arch + endif +endif + # Do not add this flag if its the cray compiler or the nvhpc compiler. 
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 0) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) - # Lets start with adding architecture defines - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + # Lets start with adding architecture defines + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) + tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_PASCAL60") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) - tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_TURING75") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") + tmp 
:= $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 endif endif @@ -1119,6 +1173,9 @@ ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) + endif endif endif @@ -1126,43 +1183,43 @@ endif # Figure out the architecture flag for ROCm. ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX906") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx906\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx906 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX908") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx908\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx908 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX90A") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx90A\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx90a endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940 + tmp 
:= $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx940\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx940 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1030\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1030 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1100") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1100\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1100 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1103") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1103 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1103\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1103 endif @@ -1171,8 +1228,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) - KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) - 
KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) + KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE") @@ -1182,6 +1239,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_CXXFLAGS+=-fno-gpu-rdc KOKKOS_LDFLAGS+=-fno-gpu-rdc endif + + ifeq ($(KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC") + else + tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC */") + endif +endif + +ifneq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 0) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + endif + endif endif # Figure out Intel architecture flags. @@ -1235,6 +1307,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) KOKKOS_LDFLAGS+=-fsycl KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) + + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) @@ -1322,6 +1396,8 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/MDSpan/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) @@ -1374,6 +1450,48 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LIBS += $(KOKKOS_INTERNAL_OPENACC_LIB) + ifeq 
($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + ifneq ($(CUDA_PATH),) + ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1) + CUDA_PATH := $(CUDA_PATH:/compilers=/cuda) + endif + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(CUDA_PATH),) + KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 + endif + KOKKOS_LIBS += -lcudart + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_LIBS += -cuda + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(ROCM_PATH),) + KOKKOS_CPPFLAGS += -I$(ROCM_PATH)/include + KOKKOS_LDFLAGS += -L$(ROCM_PATH)/lib + endif + KOKKOS_LIBS += -lamdhip64 + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=multicore + endif + else + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. 
+ ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=gpu,multicore + endif + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -1484,7 +1602,11 @@ else endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) - tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") + endif else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") endif @@ -1512,6 +1634,12 @@ $(DESUL_CONFIG_HEADER): KOKKOS_CPP_DEPENDS := $(DESUL_CONFIG_HEADER) KokkosCore_config.h $(KOKKOS_HEADERS) +# Tasking is deprecated +ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + TMP_KOKKOS_SRC := $(KOKKOS_SRC) + KOKKOS_SRC = $(patsubst %Task.cpp,, $(TMP_KOKKOS_SRC)) +endif + KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index e8e429e0275..be535eea3e7 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -16,8 +16,6 @@ Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ho $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp -Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp @@ -38,17 +36,21 @@ Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort. ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +endif Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp endif @@ -73,6 +75,8 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp 
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp +Kokkos_HIP_ZeroMemset.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp endif @@ -89,26 +93,26 @@ Kokkos_OpenMP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_Ope $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) -Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp -Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index c8c6f8f7cf5..0ea07f9ea2f 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -30,12 +30,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.3.01](https://github.com/kokkos/kokkos/releases/tag/4.3.01). +The current release is [4.5.00](https://github.com/kokkos/kokkos/releases/tag/4.5.00). 
```bash -curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +wget https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz ``` To clone the latest development version of Kokkos from GitHub: diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt index 368984647e9..73ce9f7ec55 100644 --- a/lib/kokkos/algorithms/CMakeLists.txt +++ b/lib/kokkos/algorithms/CMakeLists.txt @@ -1,7 +1,7 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) - KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -ENDIF() +if(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) + kokkos_add_test_directories(unit_tests) +endif() diff --git a/lib/kokkos/algorithms/src/CMakeLists.txt b/lib/kokkos/algorithms/src/CMakeLists.txt index b490caca628..9f10b85e021 100644 --- a/lib/kokkos/algorithms/src/CMakeLists.txt +++ b/lib/kokkos/algorithms/src/CMakeLists.txt @@ -1,34 +1,29 @@ #I have to leave these here for tribits -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -FILE(GLOB ALGO_HEADERS *.hpp) -FILE(GLOB ALGO_SOURCES *.cpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) +file(GLOB ALGO_HEADERS 
*.hpp) +file(GLOB ALGO_SOURCES *.cpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) #----------------------------------------------------------------------------- # We have to pass the sources in here for Tribits # These will get ignored for standalone CMake and a true interface library made -KOKKOS_ADD_INTERFACE_LIBRARY( - kokkosalgorithms - NOINSTALLHEADERS ${ALGO_HEADERS} - SOURCES ${ALGO_SOURCES} -) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_add_interface_library(kokkosalgorithms NOINSTALLHEADERS ${ALGO_HEADERS} SOURCES ${ALGO_SOURCES}) +kokkos_lib_include_directories( + kokkosalgorithms ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) -KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) +kokkos_link_tpl(kokkoscontainers PUBLIC ROCTHRUST) +kokkos_link_tpl(kokkoscore PUBLIC ONEDPL) diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 7df12b8518e..b28ea4c2ca9 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -615,7 +615,7 @@ template struct Random_UniqueIndex { using locks_view_type = View; KOKKOS_FUNCTION - static int get_state_idx(const locks_view_type) { + static int get_state_idx(const locks_view_type&) { KOKKOS_IF_ON_HOST( (return DeviceType::execution_space::impl_hardware_thread_id();)) @@ -665,17 +665,16 @@ struct Random_UniqueIndex< #ifdef KOKKOS_ENABLE_SYCL template -struct Random_UniqueIndex< - Kokkos::Device> { +struct Random_UniqueIndex> { using locks_view_type = - View>; + View>; 
KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; std::size_t gridDim[3] = { @@ -1121,7 +1120,7 @@ class Random_XorShift1024_Pool { using execution_space = typename device_type::execution_space; using locks_type = View; using int_view_type = View; - using state_data_type = View; + using state_data_type = View; locks_type locks_ = {}; state_data_type state_ = {}; diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp index 73e751f572c..8e7de32a07b 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp @@ -35,11 +35,11 @@ struct BinOp1D { #endif // Construct BinOp with number of bins, minimum value and maximum value - BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + BinOp1D(int max_bins, typename KeyViewType::const_value_type min, typename KeyViewType::const_value_type max) - : max_bins_(max_bins__ + 1), + : max_bins_(max_bins + 1), // Cast to double to avoid possible overflow when using integer - mul_(static_cast(max_bins__) / + mul_(static_cast(max_bins) / (static_cast(max) - static_cast(min))), min_(static_cast(min)) { // For integral types the number of bins may be larger than the range @@ -47,7 +47,7 @@ struct BinOp1D { // and then don't need to sort bins. 
if (std::is_integral::value && (static_cast(max) - static_cast(min)) <= - static_cast(max_bins__)) { + static_cast(max_bins)) { mul_ = 1.; } } @@ -82,16 +82,16 @@ struct BinOp3D { BinOp3D() = delete; #endif - BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + BinOp3D(int max_bins[], typename KeyViewType::const_value_type min[], typename KeyViewType::const_value_type max[]) { - max_bins_[0] = max_bins__[0]; - max_bins_[1] = max_bins__[1]; - max_bins_[2] = max_bins__[2]; - mul_[0] = static_cast(max_bins__[0]) / + max_bins_[0] = max_bins[0]; + max_bins_[1] = max_bins[1]; + max_bins_[2] = max_bins[2]; + mul_[0] = static_cast(max_bins[0]) / (static_cast(max[0]) - static_cast(min[0])); - mul_[1] = static_cast(max_bins__[1]) / + mul_[1] = static_cast(max_bins[1]) / (static_cast(max[1]) - static_cast(min[1])); - mul_[2] = static_cast(max_bins__[2]) / + mul_[2] = static_cast(max_bins[2]) / (static_cast(max[2]) - static_cast(min[2])); min_[0] = static_cast(min[0]); min_[1] = static_cast(min[1]); diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp index c399279fe48..f417b6b13b3 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -388,7 +388,8 @@ class BinSort { // reasonable experimentally. 
if (use_std_sort && bin_size > 10) { KOKKOS_IF_ON_HOST( - (std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + (std::sort(sort_order.data() + lower_bound, + sort_order.data() + upper_bound, [this](int p, int q) { return bin_op(keys_rnd, p, q); });)) } else { for (int k = lower_bound + 1; k < upper_bound; ++k) { diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index 308e9e3a008..20026c77e41 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -53,9 +53,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort without comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size()); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last); + } } else { Impl::sort_device_view_without_comparator(exec, view); } @@ -107,9 +111,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort with comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last, comparator); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size(), comparator); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last, comparator); + } } else { Impl::sort_device_view_with_comparator(exec, view, comparator); } diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp 
b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp index f11f8070484..2a8f761d9b4 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -30,6 +30,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -76,13 +77,10 @@ namespace Kokkos::Impl { template constexpr inline bool is_admissible_to_kokkos_sort_by_key = - ::Kokkos::is_view::value&& T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value); + ::Kokkos::is_view::value && T::rank() == 1 && + (std::is_same_v || + std::is_same_v || + std::is_same_v); template KOKKOS_INLINE_FUNCTION constexpr void @@ -144,7 +142,7 @@ void sort_by_key_rocthrust( #if defined(KOKKOS_ENABLE_ONEDPL) template -inline constexpr bool sort_on_device_v = +inline constexpr bool sort_on_device_v = std::is_same_v || std::is_same_v; @@ -152,7 +150,7 @@ inline constexpr bool sort_on_device_v = template void sort_by_key_onedpl( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, MaybeComparator&&... 
maybeComparator) { @@ -176,7 +174,7 @@ template void applyPermutation(const ExecutionSpace& space, const PermutationView& permutation, const ViewType& view) { - static_assert(std::is_integral::value); + static_assert(std::is_integral_v); auto view_copy = Kokkos::create_mirror( Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, @@ -335,7 +333,7 @@ void sort_by_key_device_view_without_comparator( template void sort_by_key_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values) { #ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY @@ -392,7 +390,7 @@ void sort_by_key_device_view_with_comparator( template void sort_by_key_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, const ComparatorType& comparator) { diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index 08946228919..734ce450f69 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -34,6 +34,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -146,7 +147,7 @@ void sort_via_binsort(const ExecutionSpace& exec, bool sort_in_bins = true; // TODO: figure out better max_bins then this ... 
int64_t max_bins = view.extent(0) / 2; - if (std::is_integral::value) { + if (std::is_integral_v) { // Cast to double to avoid possible overflow when using integer auto const max_val = static_cast(result.max_val); auto const min_val = static_cast(result.min_val); @@ -157,7 +158,7 @@ void sort_via_binsort(const ExecutionSpace& exec, sort_in_bins = false; } } - if (std::is_floating_point::value) { + if (std::is_floating_point_v) { KOKKOS_ASSERT(std::isfinite(static_cast(result.max_val) - static_cast(result.min_val))); } @@ -211,11 +212,11 @@ void sort_rocthrust(const HIP& space, #if defined(KOKKOS_ENABLE_ONEDPL) template -void sort_onedpl(const Kokkos::Experimental::SYCL& space, +void sort_onedpl(const Kokkos::SYCL& space, const Kokkos::View& view, MaybeComparator&&... maybeComparator) { using ViewType = Kokkos::View; - static_assert(SpaceAccessibility::accessible, "SYCL execution space is not able to access the memory space " "of the View argument!"); @@ -268,19 +269,29 @@ void copy_to_host_run_stdsort_copy_back( KE::copy(exec, view, view_dc); // run sort on the mirror of view_dc - auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); - auto first = KE::begin(mv_h); - auto last = KE::end(mv_h); - std::sort(first, last, std::forward(maybeComparator)...); + auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); + if (view.span_is_contiguous()) { + std::sort(mv_h.data(), mv_h.data() + mv_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(mv_h); + auto last = KE::end(mv_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view_dc, mv_h); // copy back to argument view KE::copy(exec, KE::cbegin(view_dc), KE::cend(view_dc), KE::begin(view)); } else { auto view_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view); - auto first = KE::begin(view_h); - auto last = KE::end(view_h); - std::sort(first, last, std::forward(maybeComparator)...); + if 
(view.span_is_contiguous()) { + std::sort(view_h.data(), view_h.data() + view_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(view_h); + auto last = KE::end(view_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view, view_h); } } @@ -310,7 +321,7 @@ void sort_device_view_without_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& view) { using ViewType = Kokkos::View; static_assert( @@ -365,8 +376,7 @@ void sort_device_view_with_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, - const Kokkos::View& view, + const Kokkos::SYCL& exec, const Kokkos::View& view, const ComparatorType& comparator) { using ViewType = Kokkos::View; static_assert( @@ -397,12 +407,12 @@ sort_device_view_with_comparator( // and then copies data back. Potentially, this can later be changed // with a better solution like our own quicksort on device or similar. - using ViewType = Kokkos::View; - using MemSpace = typename ViewType::memory_space; // Note with HIP unified memory this code path is still the right thing to do // if we end up here when RocThrust is not enabled. // The create_mirror_view_and_copy will do the right thing (no copy). 
-#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY + using ViewType = Kokkos::View; + using MemSpace = typename ViewType::memory_space; static_assert(!SpaceAccessibility::accessible, "Impl::sort_device_view_with_comparator: should not be called " "on a view that is already accessible on the host"); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp index b84f00f8bb5..ea7e55ca619 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp @@ -91,7 +91,7 @@ template = 0> ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl( @@ -105,7 +105,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl(label, ex, first, last, @@ -119,7 +119,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -137,7 +137,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -157,7 +157,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -172,7 +172,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return 
Impl::reduce_custom_functors_exespace_impl( @@ -186,7 +186,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -204,7 +204,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -258,7 +258,7 @@ template < KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_team_impl(teamHandle, first, last, @@ -273,7 +273,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -294,7 +294,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_team_impl(teamHandle, first, last, @@ -309,7 +309,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = 
::Kokkos::Experimental; diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp index 101f5113f68..89585ddbea0 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp @@ -117,7 +117,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -136,7 +136,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -157,7 +157,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -182,7 +182,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -208,7 +208,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, 
IteratorType first1, IteratorType last1, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -228,7 +228,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -248,7 +248,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -270,7 +270,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -345,7 +345,7 @@ KOKKOS_FUNCTION ValueType transform_reduce( const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -366,7 +366,7 @@ transform_reduce(const TeamHandleType& 
teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -393,7 +393,7 @@ KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -412,7 +412,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 54bb13e25b9..da16141f5a7 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -33,12 +33,12 @@ struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< T, std::enable_if_t<::Kokkos::is_view::value && T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value)>> + (std::is_same_v || + std::is_same_v || + std::is_same_v)>> : std::true_type {}; template @@ -102,8 +102,8 @@ struct are_random_access_iterators; template struct are_random_access_iterators { static constexpr bool value = - is_iterator_v 
&& std::is_base_of::value; + is_iterator_v && std::is_base_of_v; }; template @@ -165,9 +165,8 @@ struct iterators_have_matching_difference_type { template struct iterators_have_matching_difference_type { - static constexpr bool value = - std::is_same::value; + static constexpr bool value = std::is_same_v; }; template diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp index 9075562d460..dc910861d50 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdMoveBackwardFunctor { using index_type = typename IteratorType1::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdMoveBackwardFunctor requires signed index type"); IteratorType1 m_last; diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5bce89e98f7..e8c638c94c7 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -36,18 +36,18 @@ class RandomAccessIterator< ::Kokkos::View > { using iterator_type = RandomAccessIterator; using iterator_category = std::random_access_iterator_tag; - using value_type = typename view_type::value_type; + using value_type = typename view_type::non_const_value_type; using difference_type = ptrdiff_t; using pointer = typename view_type::pointer_type; using reference = typename view_type::reference_type; static_assert(view_type::rank == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), "RandomAccessIterator only supports 1D Views with LayoutLeft, " 
"LayoutRight, LayoutStride."); @@ -61,9 +61,9 @@ class RandomAccessIterator< ::Kokkos::View > { #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond template - requires(std::is_constructible_v) KOKKOS_FUNCTION - explicit(!std::is_convertible_v) - RandomAccessIterator(const RandomAccessIterator& other) + requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) : m_view(other.m_view), m_current_index(other.m_current_index) {} #else template < diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index b4046c7645b..e6caa072880 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseFunctor requires signed index type"); InputIterator m_first; diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp index dd20d90e399..7aa0e4fc44c 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseCopyFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseCopyFunctor requires signed index type"); InputIterator m_last; diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index db184bc8a99..31247af159b 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ 
b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -1,12 +1,10 @@ - #Leave these here for now - I don't need transitive deps anyway -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) - +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) -SET(ALGORITHM UnitTestMain.cpp) +set(ALGORITHM UnitTestMain.cpp) foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) string(TOUPPER ${Tag} DEVICE) @@ -23,21 +21,11 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # Generate a .cpp file for each one that runs it on the current backend (Tag), # and add this .cpp file to the sources for UnitTest_RandomAndSort. set(ALGO_SORT_SOURCES) - foreach(SOURCE_Input - TestSort - TestSortByKey - TestSortCustomComp - TestBinSortA - TestBinSortB - TestNestedSort - ) + foreach(SOURCE_Input TestSort TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB TestNestedSort) set(file ${dir}/${SOURCE_Input}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
- file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_SORT_SOURCES ${file}) endforeach() @@ -47,14 +35,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # ------------------------------------------ # do as above set(ALGO_RANDOM_SOURCES) - foreach(SOURCE_Input - TestRandom - ) + foreach(SOURCE_Input TestRandom) set(file ${dir}/${SOURCE_Input}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() @@ -65,11 +48,7 @@ endforeach() # std set A # ------------------------------------------ set(STDALGO_SOURCES_A) -foreach(Name - StdReducers - StdAlgorithmsConstraints - RandomAccessIterator - ) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator) list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) endforeach() @@ -77,10 +56,7 @@ endforeach() # std set B # ------------------------------------------ set(STDALGO_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsMinMaxElementOps - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps) list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) endforeach() @@ -88,22 +64,23 @@ endforeach() # std set C # ------------------------------------------ set(STDALGO_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsLexicographicalCompare - StdAlgorithmsForEach - StdAlgorithmsFind - StdAlgorithmsFindFirstOf - StdAlgorithmsFindEnd - StdAlgorithmsCount - StdAlgorithmsEqual - StdAlgorithmsAllAnyNoneOf - StdAlgorithmsAdjacentFind - StdAlgorithmsSearch - StdAlgorithmsSearch_n - StdAlgorithmsMismatch - StdAlgorithmsMoveBackward - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsLexicographicalCompare + 
StdAlgorithmsForEach + StdAlgorithmsFind + StdAlgorithmsFindFirstOf + StdAlgorithmsFindEnd + StdAlgorithmsCount + StdAlgorithmsEqual + StdAlgorithmsAllAnyNoneOf + StdAlgorithmsAdjacentFind + StdAlgorithmsSearch + StdAlgorithmsSearch_n + StdAlgorithmsMismatch + StdAlgorithmsMoveBackward +) list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) endforeach() @@ -111,27 +88,28 @@ endforeach() # std set D # ------------------------------------------ set(STDALGO_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsModOps - StdAlgorithmsModSeqOps - StdAlgorithmsReplace - StdAlgorithmsReplaceIf - StdAlgorithmsReplaceCopy - StdAlgorithmsReplaceCopyIf - StdAlgorithmsCopyIf - StdAlgorithmsUnique - StdAlgorithmsUniqueCopy - StdAlgorithmsRemove - StdAlgorithmsRemoveIf - StdAlgorithmsRemoveCopy - StdAlgorithmsRemoveCopyIf - StdAlgorithmsRotate - StdAlgorithmsRotateCopy - StdAlgorithmsReverse - StdAlgorithmsShiftLeft - StdAlgorithmsShiftRight - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsModOps + StdAlgorithmsModSeqOps + StdAlgorithmsReplace + StdAlgorithmsReplaceIf + StdAlgorithmsReplaceCopy + StdAlgorithmsReplaceCopyIf + StdAlgorithmsCopyIf + StdAlgorithmsUnique + StdAlgorithmsUniqueCopy + StdAlgorithmsRemove + StdAlgorithmsRemoveIf + StdAlgorithmsRemoveCopy + StdAlgorithmsRemoveCopyIf + StdAlgorithmsRotate + StdAlgorithmsRotateCopy + StdAlgorithmsReverse + StdAlgorithmsShiftLeft + StdAlgorithmsShiftRight +) list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) endforeach() @@ -139,20 +117,21 @@ endforeach() # std set E # ------------------------------------------ set(STDALGO_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsIsSorted - StdAlgorithmsIsSortedUntil - StdAlgorithmsPartitioningOps - StdAlgorithmsPartitionCopy - StdAlgorithmsNumerics - StdAlgorithmsAdjacentDifference - StdAlgorithmsExclusiveScan - StdAlgorithmsInclusiveScan - StdAlgorithmsTransformUnaryOp - StdAlgorithmsTransformExclusiveScan - StdAlgorithmsTransformInclusiveScan - ) +foreach( + 
Name + StdAlgorithmsCommon + StdAlgorithmsIsSorted + StdAlgorithmsIsSortedUntil + StdAlgorithmsPartitioningOps + StdAlgorithmsPartitionCopy + StdAlgorithmsNumerics + StdAlgorithmsAdjacentDifference + StdAlgorithmsExclusiveScan + StdAlgorithmsInclusiveScan + StdAlgorithmsTransformUnaryOp + StdAlgorithmsTransformExclusiveScan + StdAlgorithmsTransformInclusiveScan +) list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) endforeach() @@ -160,11 +139,7 @@ endforeach() # std team Q # ------------------------------------------ set(STDALGO_TEAM_SOURCES_Q) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamInclusiveScan - StdAlgorithmsTeamTransformInclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) endforeach() @@ -172,11 +147,7 @@ endforeach() # std team P # ------------------------------------------ set(STDALGO_TEAM_SOURCES_P) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamExclusiveScan - StdAlgorithmsTeamTransformExclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) endforeach() @@ -184,14 +155,9 @@ endforeach() # std team M # ------------------------------------------ set(STDALGO_TEAM_SOURCES_M) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamTransformUnaryOp - StdAlgorithmsTeamTransformBinaryOp - StdAlgorithmsTeamGenerate - StdAlgorithmsTeamGenerate_n - StdAlgorithmsTeamSwapRanges - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp StdAlgorithmsTeamTransformBinaryOp + StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges +) list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) endforeach() @@ -199,14 +165,9 @@ endforeach() # std team L # ------------------------------------------ set(STDALGO_TEAM_SOURCES_L) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamIsSorted 
- StdAlgorithmsTeamIsSortedUntil - StdAlgorithmsTeamIsPartitioned - StdAlgorithmsTeamPartitionCopy - StdAlgorithmsTeamPartitionPoint - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil + StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint +) list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) endforeach() @@ -214,13 +175,9 @@ endforeach() # std team I # ------------------------------------------ set(STDALGO_TEAM_SOURCES_I) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamUnique - StdAlgorithmsTeamAdjacentDifference - StdAlgorithmsTeamReduce - StdAlgorithmsTeamTransformReduce - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce + StdAlgorithmsTeamTransformReduce +) list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) endforeach() @@ -228,18 +185,19 @@ endforeach() # std team H # ------------------------------------------ set(STDALGO_TEAM_SOURCES_H) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamCopy - StdAlgorithmsTeamCopy_n - StdAlgorithmsTeamCopyBackward - StdAlgorithmsTeamCopyIf - StdAlgorithmsTeamUniqueCopy - StdAlgorithmsTeamRemove - StdAlgorithmsTeamRemoveIf - StdAlgorithmsTeamRemoveCopy - StdAlgorithmsTeamRemoveCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamCopy + StdAlgorithmsTeamCopy_n + StdAlgorithmsTeamCopyBackward + StdAlgorithmsTeamCopyIf + StdAlgorithmsTeamUniqueCopy + StdAlgorithmsTeamRemove + StdAlgorithmsTeamRemoveIf + StdAlgorithmsTeamRemoveCopy + StdAlgorithmsTeamRemoveCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) endforeach() @@ -247,13 +205,9 @@ endforeach() # std team G # ------------------------------------------ set(STDALGO_TEAM_SOURCES_G) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMove - StdAlgorithmsTeamMoveBackward - StdAlgorithmsTeamShiftLeft - StdAlgorithmsTeamShiftRight - ) +foreach(Name StdAlgorithmsCommon 
StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft + StdAlgorithmsTeamShiftRight +) list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) endforeach() @@ -261,13 +215,9 @@ endforeach() # std team F # ------------------------------------------ set(STDALGO_TEAM_SOURCES_F) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamReverse - StdAlgorithmsTeamReverseCopy - StdAlgorithmsTeamRotate - StdAlgorithmsTeamRotateCopy - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate + StdAlgorithmsTeamRotateCopy +) list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) endforeach() @@ -275,15 +225,16 @@ endforeach() # std team E # ------------------------------------------ set(STDALGO_TEAM_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFill - StdAlgorithmsTeamFill_n - StdAlgorithmsTeamReplace - StdAlgorithmsTeamReplaceIf - StdAlgorithmsTeamReplaceCopy - StdAlgorithmsTeamReplaceCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFill + StdAlgorithmsTeamFill_n + StdAlgorithmsTeamReplace + StdAlgorithmsTeamReplaceIf + StdAlgorithmsTeamReplaceCopy + StdAlgorithmsTeamReplaceCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) endforeach() @@ -291,12 +242,7 @@ endforeach() # std team D # ------------------------------------------ set(STDALGO_TEAM_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMinElement - StdAlgorithmsTeamMaxElement - StdAlgorithmsTeamMinMaxElement - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement) list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) endforeach() @@ -304,16 +250,17 @@ endforeach() # std team C # ------------------------------------------ set(STDALGO_TEAM_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFind - StdAlgorithmsTeamFindIf - StdAlgorithmsTeamFindIfNot - StdAlgorithmsTeamAllOf - StdAlgorithmsTeamAnyOf - 
StdAlgorithmsTeamNoneOf - StdAlgorithmsTeamSearchN - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFind + StdAlgorithmsTeamFindIf + StdAlgorithmsTeamFindIfNot + StdAlgorithmsTeamAllOf + StdAlgorithmsTeamAnyOf + StdAlgorithmsTeamNoneOf + StdAlgorithmsTeamSearchN +) list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) endforeach() @@ -321,13 +268,9 @@ endforeach() # std team B # ------------------------------------------ set(STDALGO_TEAM_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamEqual - StdAlgorithmsTeamSearch - StdAlgorithmsTeamFindEnd - StdAlgorithmsTeamFindFirstOf - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd + StdAlgorithmsTeamFindFirstOf +) list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) endforeach() @@ -335,34 +278,33 @@ endforeach() # std team A # ------------------------------------------ set(STDALGO_TEAM_SOURCES_A) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamAdjacentFind - StdAlgorithmsTeamCount - StdAlgorithmsTeamCountIf - StdAlgorithmsTeamForEach - StdAlgorithmsTeamForEachN - StdAlgorithmsTeamLexicographicalCompare - StdAlgorithmsTeamMismatch - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamAdjacentFind + StdAlgorithmsTeamCount + StdAlgorithmsTeamCountIf + StdAlgorithmsTeamForEach + StdAlgorithmsTeamForEachN + StdAlgorithmsTeamLexicographicalCompare + StdAlgorithmsTeamMismatch +) list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. 
-if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - list(REMOVE_ITEM ALGO_SORT_SOURCES - TestSort.cpp - ) +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION + VERSION_GREATER_EQUAL 16.0.0 +) + list(REMOVE_ITEM ALGO_SORT_SOURCES TestSort.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases # the impl needs to use either Kokkos or tailored reducers # which results in runtime memory errors. if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L - TestStdAlgorithmsTeamIsPartitioned.cpp - TestStdAlgorithmsTeamPartitionPoint.cpp - TestStdAlgorithmsTeamPartitionCopy.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L TestStdAlgorithmsTeamIsPartitioned.cpp + TestStdAlgorithmsTeamPartitionPoint.cpp TestStdAlgorithmsTeamPartitionCopy.cpp ) endif() @@ -370,7 +312,9 @@ endif() # in these cases the impl needs to use either Kokkos or # tailored reducers which results in runtime memory errors. 
if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_C + list( + REMOVE_ITEM + STDALGO_TEAM_SOURCES_C TestStdAlgorithmsTeamFind.cpp TestStdAlgorithmsTeamFindIf.cpp TestStdAlgorithmsTeamFindIfNot.cpp @@ -386,35 +330,20 @@ endif() # FRIZZI: 04/26/2023: not sure if the compilation error is still applicable # but we conservatively leave this guard on if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Sort - SOURCES - UnitTestMain.cpp - TestStdAlgorithmsCommon.cpp - ${ALGO_SORT_SOURCES} + kokkos_add_executable_and_test( + UnitTest_Sort SOURCES UnitTestMain.cpp TestStdAlgorithmsCommon.cpp ${ALGO_SORT_SOURCES} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Random - SOURCES - UnitTestMain.cpp - ${ALGO_RANDOM_SOURCES} - ) + kokkos_add_executable_and_test(UnitTest_Random SOURCES UnitTestMain.cpp ${ALGO_RANDOM_SOURCES}) endif() # FIXME_OPENMPTARGET: These tests cause internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) - list(REMOVE_ITEM STDALGO_SOURCES_D - TestStdAlgorithmsCopyIf.cpp - TestStdAlgorithmsRemoveCopy.cpp - TestStdAlgorithmsUnique.cpp - TestStdAlgorithmsUniqueCopy.cpp - ) - list(REMOVE_ITEM STDALGO_SOURCES_E - TestStdAlgorithmsExclusiveScan.cpp - TestStdAlgorithmsInclusiveScan.cpp + list(REMOVE_ITEM STDALGO_SOURCES_D TestStdAlgorithmsCopyIf.cpp TestStdAlgorithmsRemoveCopy.cpp + TestStdAlgorithmsUnique.cpp TestStdAlgorithmsUniqueCopy.cpp ) + list(REMOVE_ITEM STDALGO_SOURCES_E TestStdAlgorithmsExclusiveScan.cpp TestStdAlgorithmsInclusiveScan.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget @@ -422,48 +351,31 @@ endif() if(KOKKOS_ENABLE_OPENMPTARGET) # the following use either Kokkos or tailored reducers # which results in runtime memory errors. 
- list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B - TestStdAlgorithmsTeamFindEnd.cpp - TestStdAlgorithmsTeamFindFirstOf.cpp - TestStdAlgorithmsTeamSearch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B TestStdAlgorithmsTeamFindEnd.cpp TestStdAlgorithmsTeamFindFirstOf.cpp + TestStdAlgorithmsTeamSearch.cpp ) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A - TestStdAlgorithmsTeamAdjacentFind.cpp - TestStdAlgorithmsTeamLexicographicalCompare.cpp - TestStdAlgorithmsTeamMismatch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A TestStdAlgorithmsTeamAdjacentFind.cpp + TestStdAlgorithmsTeamLexicographicalCompare.cpp TestStdAlgorithmsTeamMismatch.cpp ) # this causes an illegal memory access if team_members_have_matching_result # is called - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M - TestStdAlgorithmsTeamTransformBinaryOp.cpp - ) + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M TestStdAlgorithmsTeamTransformBinaryOp.cpp) endif() foreach(ID A;B;C;D;E) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_SOURCES_${ID}} - ) + kokkos_add_executable_and_test(AlgorithmsUnitTest_StdSet_${ID} SOURCES UnitTestMain.cpp ${STDALGO_SOURCES_${ID}}) endforeach() foreach(ID A;B;C;D;E;F;G;H;I;L;M;P;Q) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_Team_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_TEAM_SOURCES_${ID}} - ) + kokkos_add_executable_and_test( + AlgorithmsUnitTest_StdSet_Team_${ID} SOURCES UnitTestMain.cpp ${STDALGO_TEAM_SOURCES_${ID}} + ) endforeach() # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. 
if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE( - AlgorithmsUnitTest_StdAlgoCompileOnly - SOURCES TestStdAlgorithmsCompileOnly.cpp - ) + kokkos_add_executable(AlgorithmsUnitTest_StdAlgoCompileOnly SOURCES TestStdAlgorithmsCompileOnly.cpp) endif() diff --git a/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp b/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp index dd3569e6715..bb074f24803 100644 --- a/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp @@ -31,13 +31,13 @@ struct bin3d_is_sorted_struct { using value_type = unsigned int; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; int max_bins; Scalar min; Scalar max; - bin3d_is_sorted_struct(Kokkos::View keys_, + bin3d_is_sorted_struct(Kokkos::View keys_, int max_bins_, Scalar min_, Scalar max_) : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {} KOKKOS_INLINE_FUNCTION @@ -65,9 +65,9 @@ struct sum3D { using value_type = double; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; - sum3D(Kokkos::View keys_) : keys(keys_) {} + sum3D(Kokkos::View keys_) : keys(keys_) {} KOKKOS_INLINE_FUNCTION void operator()(int i, double& count) const { count += keys(i, 0); @@ -77,8 +77,8 @@ struct sum3D { }; template -void test_3D_sort_impl(unsigned int n) { - using KeyViewType = Kokkos::View; +void test_3D_sort_impl(size_t n) { + using KeyViewType = Kokkos::View; KeyViewType keys("Keys", n * n * n); @@ -207,7 +207,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -219,6 +219,10 @@ void test_sort_integer_overflow() { } // 
namespace BinSortSetA TEST(TEST_CATEGORY, BinSortGenericTests) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -246,11 +250,11 @@ TEST(TEST_CATEGORY, BinSortEmptyView) { // does not matter if we use int or something else Kokkos::View v("v", 0); - // test all exposed public sort methods - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Sorter.sort(v)); + // test all exposed public sort methods are callable and do not throw + Sorter.sort(ExecutionSpace(), v, 0, 0); + Sorter.sort(v, 0, 0); + Sorter.sort(ExecutionSpace(), v); + Sorter.sort(v); } TEST(TEST_CATEGORY, BinSortEmptyKeysView) { @@ -263,7 +267,26 @@ TEST(TEST_CATEGORY, BinSortEmptyKeysView) { BinOp_t binOp(5, 0, 10); Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp); - ASSERT_NO_THROW(Sorter.create_permute_vector(ExecutionSpace{})); + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw +} + +// BinSort may delegate sorting within bins to std::sort when running on host +// and having a sufficiently large number of items within a single bin (10 by +// default). Test that this is done without undefined behavior when accessing +// the boundaries of the bin. Should be used in conjunction with a memory +// sanitizer or bounds check. 
+TEST(TEST_CATEGORY, BinSort_issue_7221) { + using ExecutionSpace = TEST_EXECSPACE; + + using KeyViewType = Kokkos::View; + KeyViewType kv("kv", 11); + + using BinOp_t = Kokkos::BinOp1D; + BinOp_t binOp(1, -10, 10); + Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp, + /*sort_within_bins*/ true); + + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw } } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp b/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp index a90224bf315..d11b53a9a61 100644 --- a/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp @@ -185,6 +185,10 @@ void run_for_rank2() { } // namespace BinSortSetB TEST(TEST_CATEGORY, BinSortUnsignedKeyLayoutStrideValues) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExeSpace = TEST_EXECSPACE; using key_type = unsigned; BinSortSetB::run_for_rank1(); diff --git a/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp b/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp index 1b7a3f48fc5..cd57fd23ecf 100644 --- a/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp @@ -386,6 +386,11 @@ void test_nested_sort_by_key(unsigned int N, KeyType minKey, KeyType maxKey, } // namespace NestedSortImpl TEST(TEST_CATEGORY, NestedSort) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; NestedSortImpl::test_nested_sort(171, 0U, UINT_MAX); NestedSortImpl::test_nested_sort(42, -1e6f, 1e6f); @@ -394,6 +399,11 @@ TEST(TEST_CATEGORY, NestedSort) { } TEST(TEST_CATEGORY, NestedSortByKey) { 
+ // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; // Second/third template arguments are key and value respectively. diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index 472af1403b2..6960b912d0e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -542,6 +542,11 @@ void test_duplicate_stream() { } // namespace AlgoRandomImpl TEST(TEST_CATEGORY, Random_XorShift64) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ @@ -562,6 +567,10 @@ TEST(TEST_CATEGORY, Random_XorShift64) { TEST(TEST_CATEGORY, Random_XorShift1024_0) { using ExecutionSpace = TEST_EXECSPACE; + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ defined(KOKKOS_ENABLE_HIP) @@ -589,7 +598,7 @@ TEST(TEST_CATEGORY, Multi_streams) { #endif #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { GTEST_SKIP() << "Failing on NVIDIA GPUs"; // FIXME_SYCL } #endif diff --git a/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp b/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp index 7d484136b6d..5ab348cb193 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct random_access_iterator_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); @@ -264,6 +264,37 @@ TEST_F(random_access_iterator_test, traits_helpers) { static_assert(KE::Impl::are_iterators_v); static_assert(KE::Impl::are_random_access_iterators_v); static_assert(!KE::Impl::are_iterators_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); } } // namespace stdalgos diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index 968fb8950b7..5ea88ae5d62 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -197,7 +197,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -209,6 +209,10 @@ void test_sort_integer_overflow() { } // namespace SortImpl TEST(TEST_CATEGORY, SortUnsignedValueType) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with 
OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -224,14 +228,19 @@ TEST(TEST_CATEGORY, SortUnsignedValueType) { } TEST(TEST_CATEGORY, SortEmptyView) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else Kokkos::View v("v", 0); + // checking that it does not throw // TODO check the synchronous behavior of the calls below - ASSERT_NO_THROW(Kokkos::sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Kokkos::sort(v)); + Kokkos::sort(ExecutionSpace(), v); + Kokkos::sort(v); } } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp index 9e5bd4a5748..44abe4e73a4 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -83,8 +83,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyView) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } // Test #7036 @@ -95,8 +95,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyViewHost) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } TEST(TEST_CATEGORY, SortByKey) { @@ -183,12 +183,12 @@ TEST(TEST_CATEGORY, SortByKeyStaticExtents) { Kokkos::View keys("keys"); Kokkos::View values_static("values_static"); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, 
values_static)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_static); Kokkos::View values_dynamic("values_dynamic", 10); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic); } template @@ -234,7 +234,9 @@ TEST(TEST_CATEGORY, SortByKeyWithStrides) { ASSERT_EQ(sort_fails, 0u); } -TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { +TEST(TEST_CATEGORY_DEATH, SortByKeyKeysLargerThanValues) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 75ad533f6ee..208b46b15f2 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -96,7 +96,7 @@ void fill_view(DestViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, aux_v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp index fa4ff48dbef..d8b80675c9d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp @@ -173,7 +173,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -243,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... 
args) { { auto res_it = KE::adjacent_find(exespace(), KE::cbegin(view), - KE::cend(view), args...); + KE::cend(view), args...); const auto my_diff = res_it - KE::cbegin(view); verify(my_diff, view, args...); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 67052e2f9d4..dadce2d4748 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -534,10 +534,10 @@ void fill_views_inc(ViewType view, ViewHostType host_view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); for (std::size_t i = 0; i < view_h.extent(0); i++) { @@ -546,10 +546,10 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); using non_strided_view_t = Kokkos::View; @@ -566,11 +566,11 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); auto expected_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), expected); @@ -583,11 +583,11 @@ compare_views(ViewType1 expected, const ViewType2 actual) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + 
static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); using non_strided_view_t = Kokkos::View; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 2a4525a8c33..923ea970f91 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,7 +81,7 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } -TEST(std_algorithms, expect_no_overlap) { +TEST(std_algorithms_DeathTest, expect_no_overlap) { namespace KE = Kokkos::Experimental; using value_type = double; @@ -104,6 +104,8 @@ TEST(std_algorithms, expect_no_overlap) { // Overlapping because iterators are identical #if defined(KOKKOS_ENABLE_DEBUG) + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + auto first_s = KE::begin(static_view_1d); auto last_s = first_s + extent0; EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, @@ -148,8 +150,7 @@ TEST(std_algorithms, expect_no_overlap) { auto last_st0 = first_st0 + strided_view_1d_0.extent(0); auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) // Does not overlap since offset (=3) is not divisible by stride (=2) - EXPECT_NO_THROW( - { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); }); + KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); // Iterating over the same range without overlapping Kokkos::View static_view_2d{ @@ -160,9 +161,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... 
- EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); - }); + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); Kokkos::View dynamic_view_2d{ "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; @@ -172,9 +171,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); - }); + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; Kokkos::View strided_view_2d{ @@ -185,9 +182,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); - }); + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); } } // namespace stdalgos diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index 5778e37be04..7c9e8f84bfa 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -107,7 +107,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name, } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -202,7 +202,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } @@ -224,7 +224,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto 
rit = KE::copy_if(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } @@ -233,7 +233,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index b364c53a888..a85e63fe345 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp index 793b98a67f1..b24730ff009 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp @@ -55,7 +55,6 @@ void test_for_each(const ViewType view) { std::for_each(KE::begin(expected), KE::end(expected), non_mod_functor); compare_views(expected, view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) const auto mod_lambda = KOKKOS_LAMBDA(value_t & i) { ++i; }; // pass view, lambda takes non-const ref @@ -79,7 +78,6 @@ void test_for_each(const ViewType view) { KE::for_each(exespace(), KE::cbegin(view), KE::cend(view), 
non_mod_lambda); std::for_each(KE::cbegin(expected), KE::cend(expected), non_mod_lambda); compare_views(expected, view); -#endif } // std::for_each_n is C++17, so we cannot compare results directly diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp index 8dbd6cd7e30..2b3361743e4 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp @@ -104,7 +104,7 @@ struct AssignIndexFunctor { template struct IsEvenFunctor { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "IsEvenFunctor uses operator%, so ValueType must be int"); KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index a08a7372108..b4f40b4651d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index 75d4f0afebc..18928a35266 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -122,7 +122,8 @@ bool compute_gold(const std::string& name) { } else if (name == "large-b") { return false; } else { - throw std::runtime_error("invalid 
choice"); + Kokkos::abort("invalid choice"); + return false; // unreachable } } @@ -154,7 +155,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[0] = KE::is_sorted(exespace(), KE::cbegin(view), KE::cend(view), comp); resultsB[1] = KE::is_sorted("label", exespace(), KE::cbegin(view), - KE::cend(view), comp); + KE::cend(view), comp); resultsB[2] = KE::is_sorted(exespace(), view, comp); resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index 29ac7cc9bc1..8327bfe13c0 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -123,7 +123,8 @@ auto compute_gold(ViewType view, const std::string& name) { } else if (name == "large-b") { return KE::begin(view) + 156; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return KE::end(view); // unreachable } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp index f3b3e269c44..df5df756d2a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp @@ -86,7 +86,7 @@ void run_single_scenario(ViewType view1, ViewType view2, v2_h(ext2 / 2) = -5; } } else { - throw std::runtime_error("Kokkos: stdalgo: test: mismatch: Invalid string"); + FAIL() << "Kokkos: stdalgo: test: mismatch: Invalid string"; } Kokkos::deep_copy(aux_view1, v1_h); diff --git 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 1b1a02f39c4..6918185bc08 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value); + static_assert(std::is_rvalue_reference_v); // move constr MyMovableType b(std::move(a)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp index f80f30797e4..42a17d73779 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct std_algorithms_mod_seq_ops_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp index b201ab95c1a..88e2a68ff17 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp @@ -56,7 +56,7 @@ void run_single_scenario(const InfoType& scenario_info, int apiId) { ASSERT_EQ(dist, 5); } else if (apiId == 1) { auto rit = KE::move_backward("mylabel", exespace(), KE::begin(v), - KE::end(v), KE::end(v2)); + KE::end(v), KE::end(v2)); const int dist = KE::distance(KE::begin(v2), rit); ASSERT_EQ(dist, 5); } else if (apiId == 2) { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index 
a36c9db2b9e..e47cacdd7d9 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -95,7 +95,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -110,9 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value); + std::is_same_v); static_assert( - std::is_same::value); + std::is_same_v); const std::size_t ext = view_from.extent(0); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index c35fc5c24b2..f897e9b6574 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -99,7 +99,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -147,7 +147,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove(exespace(), KE::begin(view), KE::end(view), - (ValueType)match_value); + (ValueType)match_value); verify_data(data_h, view, rit); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 3d7c52108be..3137880ea81 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw 
std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index cb699aa9235..d88ab5473de 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index f06f2234eed..e42788799e4 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -144,7 +144,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if(exespace(), KE::begin(view), KE::end(view), - remove_if_even); + remove_if_even); verify_data(data_h, view, rit, remove_if_even); } @@ -154,7 +154,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if("label", exespace(), KE::begin(view), - KE::end(view), remove_if_even); + KE::end(view), remove_if_even); verify_data(data_h, view, rit, remove_if_even); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp index a22ab32d764..4596726cf3c 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -153,7 +153,7 @@ void verify_data(const std::string& name, ViewType1 test_view, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp index a964ec8e173..b18c859af59 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp index ceeba889711..82f859bac12 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp 
b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index 802c0093c5c..5ae2ff42785 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -96,7 +96,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp index 6e6ca727830..3c934d64850 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp @@ -62,7 +62,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index 5638cbee4a6..bf5c2ee7828 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp index d0caca7cea3..1a860c58cee 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); 
@@ -175,7 +175,7 @@ void run_single_scenario(const InfoType& scenario_info, create_view(Tag{}, view_ext, "rotate_copy_dest"); auto n_it = KE::cbegin(view_from) + rotation_point; auto rit = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it, - KE::cend(view_from), KE::begin(view_dest)); + KE::cend(view_from), KE::begin(view_dest)); verify_data(view_from, view_dest, rotation_point); ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp index 021609c444d..195f88a0b73 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp @@ -256,7 +256,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, { auto myrit = KE::search(exespace(), KE::cbegin(view), KE::cend(view), - KE::cbegin(s_view), KE::cend(s_view), args...); + KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); ASSERT_EQ(mydiff, stddiff); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp index 53ad8daa2ec..79d88bec23f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp @@ -154,7 +154,7 @@ void fill_view(ViewType dest_view, ValueType value, std::size_t count, } else { - throw std::runtime_error("Kokkos: test: search_n: this should not happen"); + FAIL() << "Kokkos: test: search_n: this should not happen"; } Kokkos::deep_copy(aux_view, v_h); @@ -208,7 +208,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count, { auto myrit = KE::search_n("label", exespace(), KE::cbegin(view), - KE::cend(view), count, value, args...); + KE::cend(view), count, value, args...); const auto 
mydiff = myrit - KE::cbegin(view); ASSERT_EQ(mydiff, stddiff); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index 0b5fe9216ea..12835d5a2f7 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -150,7 +150,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_left or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_left("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index 8e4ae943759..3e350cf3b38 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ -141,7 +141,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right(exespace(), KE::begin(view), KE::end(view), - shift_value); + shift_value); verify_data(rit, view, view_h, shift_value); } @@ -152,7 +152,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp index c388cadc9bb..5a2c0469394 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp @@ -62,8 +62,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest)); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -73,8 +73,8 @@ struct TestFunctorA { case 1: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_binaryOp); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_binaryOp); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp index e24ac37bf01..071ecd5a9a8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index 7c3c465dc8d..3f83ac7404f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ 
-144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate); + KE::begin(rowDest), predicate); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp index 7cbc788f8e3..9b509af55bf 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp @@ -53,7 +53,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy_n(member, KE::begin(myRowViewFrom), m_copyCount, - KE::begin(myRowViewDest)); + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp index 922424afbd9..38df5c30cec 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp @@ -111,7 +111,7 @@ void test_A(const bool searched_value_exist, std::size_t numTeams, using rand_pool = Kokkos::Random_XorShift64_Pool; - rand_pool pool(lowerBound * upperBound); + rand_pool pool(static_cast(lowerBound) * upperBound); if (searched_value_exist) { Kokkos::View randomIndices( diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index 7cb9851087a..0c35c5e5993 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::exclusive_scan(member, KE::cbegin(rowViewSrc), - KE::cend(rowViewSrc), - KE::begin(rowViewDest), initVal); + KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal); resultDist = KE::distance(KE::begin(rowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp index 430e4917e06..88c5e21f312 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp @@ -51,7 +51,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), searchedValue); + KE::cend(myRowViewFrom), searchedValue); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp index 83eca33569e..d350bc62cdb 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp @@ -86,9 +86,9 @@ struct TestFunctorA { case 2: { auto it = KE::find_end(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::cbegin(myRowSearchedSeqView), - KE::cend(myRowSearchedSeqView), m_binaryPred); + KE::cend(myRowViewFrom), + KE::cbegin(myRowSearchedSeqView), + KE::cend(myRowSearchedSeqView), m_binaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), 
[=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -99,7 +99,7 @@ struct TestFunctorA { case 3: { auto it = KE::find_end(member, myRowViewFrom, myRowSearchedSeqView, - m_binaryPred); + m_binaryPred); resultDist = KE::distance(KE::begin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp index ee4bbed7a30..70f2be77f63 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp index b9448c1a3e6..873e8faf4ca 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if_not(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp index 4b66dd9131f..265cdf47461 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::generate_n(member, myRowView, m_count, - GenerateFunctor()); + GenerateFunctor()); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp index 850e80dde1e..f76a595b3f4 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp @@ -62,7 +62,7 @@ struct TestFunctorA { } else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp index e3b95527c77..5bc49e46007 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp @@ -61,7 +61,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView)); + KE::cend(myRowView)); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -77,8 +77,8 @@ struct TestFunctorA { else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView), - 
CustomLessThanComparator{}); + KE::cend(myRowView), + CustomLessThanComparator{}); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -88,7 +88,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -210,7 +210,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::is_sorted_until(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp index 283525dbd10..452a48df216 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::max_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::max_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = 
KE::distance(KE::cbegin(myRow), it); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp index 8579b48315d..2c79370b926 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::min_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::min_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp index 51010fdff59..25a4487855b 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp @@ -84,7 +84,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto itPair = KE::minmax_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist1 = KE::distance(KE::begin(myRowView), itPair.first); resultDist2 = KE::distance(KE::begin(myRowView), itPair.second); @@ -160,7 +160,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } else { 
auto itPair = std::minmax_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance[0] = KE::distance(KE::cbegin(myRow), itPair.first); stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp index 1122d6d554a..2c445dacf8e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::move(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp index fb9c70391b3..2defa1dc6fc 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove(member, KE::begin(myRowView), KE::end(myRowView), - m_targetValue); + m_targetValue); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 6bb0d249988..71a50e39e3e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { if (m_apiPick == 0) 
{ auto it = KE::remove_copy(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_targetValue); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_targetValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index cff9aa178a2..d5b5304f631 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -65,8 +65,8 @@ struct TestFunctorA { GreaterThanValueFunctor predicate(m_threshold); if (m_apiPick == 0) { auto it = KE::remove_copy_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), predicate); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), predicate); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp index 70dbf10574b..64f172e401c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp @@ -78,7 +78,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy(member, myRowViewFrom, myRowViewDest, - m_targetValue, m_newValue); + m_targetValue, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -172,7 +172,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = 
Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), targetVal, newVal); + KE::begin(rowDest), targetVal, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp index d0217aed7a8..9c3699320d8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp @@ -76,7 +76,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy_if(member, myRowViewFrom, myRowViewDest, - predicate, m_newValue); + predicate, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -151,7 +151,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate, newVal); + KE::begin(rowDest), predicate, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp index e865b998f60..51f600fabad 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp @@ -136,7 
+136,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, std::size_t pivotShift, auto pivot = KE::cbegin(myRowFrom) + pivotShift; auto it = std::rotate_copy(KE::cbegin(myRowFrom), pivot, - KE::cend(myRowFrom), KE::begin(myRowDest)); + KE::cend(myRowFrom), KE::begin(myRowDest)); const std::size_t stdDistance = KE::distance(KE::begin(myRowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp index 00a80c5ef07..08ff8fbbca6 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp @@ -47,7 +47,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::shift_right(member, KE::begin(myRowView), - KE::end(myRowView), m_shift); + KE::end(myRowView), m_shift); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp index 5fc9612caa7..60cb3f08377 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp @@ -49,7 +49,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::swap_ranges(member, KE::begin(myRowView1), - KE::end(myRowView1), KE::begin(myRowView2)); + KE::end(myRowView1), KE::begin(myRowView2)); resultDist = KE::distance(KE::begin(myRowView2), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp 
index 0b0d798fd80..78a21c44305 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -91,7 +91,7 @@ struct TestFunctorA { case 1: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp); + m_binaryOp, m_unaryOp); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); @@ -111,7 +111,7 @@ struct TestFunctorA { case 3: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp, initVal); + m_binaryOp, m_unaryOp, initVal); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp index c46146e0a8f..cef0f7c13d0 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp @@ -58,7 +58,7 @@ struct TestFunctorA { } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::unique(member, KE::begin(myRowView), KE::end(myRowView), - CustomEqualityComparator{}); + CustomEqualityComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::begin(myRow), it); } else { auto it = std::unique(KE::begin(myRow), KE::end(myRow), - CustomEqualityComparator{}); + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 0d3289e196f..89ea8154c7e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -72,8 +72,8 @@ struct TestFunctorA { using comparator_t = CustomEqualityComparator; auto it = KE::unique_copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), - KE::begin(myRowViewDest), comparator_t()); + KE::end(myRowViewFrom), + KE::begin(myRowViewDest), comparator_t()); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -159,12 +159,12 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { std::size_t stdDistance = 0; if (apiId <= 1) { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest)); + KE::begin(myRowDest)); stdDistance = KE::distance(KE::begin(myRowDest), it); } else { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest), - CustomEqualityComparator{}); + KE::begin(myRowDest), + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRowDest), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index fa2804256ac..365ca21688b 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -161,7 +161,7 @@ void verify_data(ViewType1 
data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index fb81ae91b04..cc872621478 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -173,7 +173,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index 9c5ae0cf8a1..6ee93e3d5fa 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -138,7 +138,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index 3cf43ad4db8..e3e96964583 100644 
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -146,7 +146,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -235,7 +235,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp index c05006a1617..0044b935587 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp @@ -72,7 +72,7 @@ auto create_host_view_with_reduction_order_indices( result(8) = 7; result(9) = 5; } else { - throw std::runtime_error("test: Invalid enum"); + Kokkos::abort("test: Invalid enum"); } return result; @@ -80,7 +80,7 @@ auto create_host_view_with_reduction_order_indices( template auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; @@ -191,7 +191,7 @@ template void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, const ValuesPair gold_values, const IndexPair gold_locs) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml deleted file mode 100644 index d0a5645ef7b..00000000000 --- a/lib/kokkos/appveyor.yml +++ /dev/null @@ -1,10 +0,0 @@ -image: - - Visual Studio 2019 -clone_folder: c:\projects\source -build_script: -- cmd: >- - mkdir build && - cd build && - cmake c:\projects\source 
-DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && - cmake --build . --target install && - ctest -C Debug --output-on-failure diff --git a/lib/kokkos/benchmarks/CMakeLists.txt b/lib/kokkos/benchmarks/CMakeLists.txt index 529ef393d99..968c8ae3bf5 100644 --- a/lib/kokkos/benchmarks/CMakeLists.txt +++ b/lib/kokkos/benchmarks/CMakeLists.txt @@ -1,12 +1,12 @@ #FIXME_OPENMPTARGET - compiling in debug mode causes ICE. -KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(view_copy_constructor) +kokkos_add_benchmark_directories(atomic) +kokkos_add_benchmark_directories(gather) +kokkos_add_benchmark_directories(gups) +kokkos_add_benchmark_directories(launch_latency) +kokkos_add_benchmark_directories(stream) +kokkos_add_benchmark_directories(view_copy_constructor) #FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. 
-IF(NOT Kokkos_ENABLE_OPENMPTARGET) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) -ENDIF() +if(NOT Kokkos_ENABLE_OPENMPTARGET) + kokkos_add_benchmark_directories(policy_performance) + kokkos_add_benchmark_directories(bytes_and_flops) +endif() diff --git a/lib/kokkos/benchmarks/atomic/CMakeLists.txt b/lib/kokkos/benchmarks/atomic/CMakeLists.txt index 85f7412f492..7fda2bf6f6a 100644 --- a/lib/kokkos/benchmarks/atomic/CMakeLists.txt +++ b/lib/kokkos/benchmarks/atomic/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - atomic - SOURCES main.cpp -) +kokkos_add_executable(atomic SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt index 0ce44a6f1a8..9c65d06ce28 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt +++ b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt @@ -1,4 +1,9 @@ -KOKKOS_ADD_EXECUTABLE( +kokkos_add_executable( bytes_and_flops - SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp + SOURCES + bench_double.cpp + bench_float.cpp + bench_int32_t.cpp + bench_int64_t.cpp + main.cpp ) diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 78cfd48effe..762cc988f14 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -17,9 +17,9 @@ template struct Run { static void run(int N, int K, int R, int F, int T, int S, int Ba, int I) { - Kokkos::View A("A", N, K); - Kokkos::View B("B", N, K); - Kokkos::View C("C", N, K); + Kokkos::View A("A", N, K); + Kokkos::View B("B", N, K); + Kokkos::View C("C", N, K); Kokkos::deep_copy(A, Scalar(1.5)); Kokkos::deep_copy(B, Scalar(2.5)); diff --git a/lib/kokkos/benchmarks/gather/CMakeLists.txt b/lib/kokkos/benchmarks/gather/CMakeLists.txt index 
24c70627725..2de1ce85e63 100644 --- a/lib/kokkos/benchmarks/gather/CMakeLists.txt +++ b/lib/kokkos/benchmarks/gather/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gather - SOURCES main.cpp -) +kokkos_add_executable(gather SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/gups/CMakeLists.txt b/lib/kokkos/benchmarks/gups/CMakeLists.txt index 8de5b73cc67..dc707470292 100644 --- a/lib/kokkos/benchmarks/gups/CMakeLists.txt +++ b/lib/kokkos/benchmarks/gups/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gups - SOURCES gups.cpp -) +kokkos_add_executable(gups SOURCES gups.cpp) diff --git a/lib/kokkos/benchmarks/gups/gups.cpp b/lib/kokkos/benchmarks/gups/gups.cpp index 369052321d7..e00f87968bd 100644 --- a/lib/kokkos/benchmarks/gups/gups.cpp +++ b/lib/kokkos/benchmarks/gups/gups.cpp @@ -140,7 +140,7 @@ int run_benchmark(const Index indicesCount, const Index dataCount, break; } default: { - throw std::runtime_error("unexpected mode"); + Kokkos::abort("unexpected mode"); } } diff --git a/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt index bb14da749d1..4775bf2261e 100644 --- a/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt +++ b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - launch_latency - SOURCES launch_latency.cpp -) +kokkos_add_executable(launch_latency SOURCES launch_latency.cpp) diff --git a/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp index 73b176ab8dd..156c29af09e 100644 --- a/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp +++ b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp @@ -254,7 +254,7 @@ int main(int argc, char* argv[]) { else if (i == 3) K = atoi(arg.data()); else { - throw std::runtime_error("unexpected argument!"); + Kokkos::abort("unexpected argument!"); } } else if (arg == "--no-parallel-for") { opts.par_for = false; @@ -265,7 +265,7 @@ int 
main(int argc, char* argv[]) { } else { std::stringstream ss; ss << "unexpected argument \"" << arg << "\" at position " << i; - throw std::runtime_error(ss.str()); + Kokkos::abort(ss.str().c_str()); } } diff --git a/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt index 929b9c97023..4a939775c0b 100644 --- a/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt +++ b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - policy_performance - SOURCES main.cpp -) +kokkos_add_executable(policy_performance SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/stream/CMakeLists.txt b/lib/kokkos/benchmarks/stream/CMakeLists.txt index 0dded6e3a54..b096976c486 100644 --- a/lib/kokkos/benchmarks/stream/CMakeLists.txt +++ b/lib/kokkos/benchmarks/stream/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - stream - SOURCES stream-kokkos.cpp -) +kokkos_add_executable(stream SOURCES stream-kokkos.cpp) diff --git a/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt b/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt index 50a331b2b35..f7bbc13b6ec 100644 --- a/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt +++ b/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - view_copy_constructor - SOURCES view_copy_constructor.cpp -) +kokkos_add_executable(view_copy_constructor SOURCES view_copy_constructor.cpp) diff --git a/lib/kokkos/bin/kokkos_launch_compiler b/lib/kokkos/bin/kokkos_launch_compiler index d1f8896f91b..ee3c29e96d3 100755 --- a/lib/kokkos/bin/kokkos_launch_compiler +++ b/lib/kokkos/bin/kokkos_launch_compiler @@ -62,7 +62,7 @@ KOKKOS_COMPILER=${1} shift # store the expected C++ compiler -CXX_COMPILER=${1} +CXX_COMPILER=$(which "${1}") # remove the expected C++ compiler from the arguments shift @@ -84,7 +84,7 @@ shift # kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # 
results in this command being executed: # ${KOKKOS_COMPILER} -c file.cpp -o file.o -if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != $(which "${1}") ]]; then debug-message "$@" # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} exec "$@" diff --git a/lib/kokkos/cmake/Dependencies.cmake b/lib/kokkos/cmake/Dependencies.cmake index fb1e73b5799..2f70c2f038c 100644 --- a/lib/kokkos/cmake/Dependencies.cmake +++ b/lib/kokkos/cmake/Dependencies.cmake @@ -1,5 +1,3 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - ) +tribits_package_define_dependencies(LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib) -TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) +tribits_tpl_tentatively_enable(DLlib) diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 08f128f2d1a..44f81bb8cea 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -24,7 +24,6 @@ #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX #cmakedefine KOKKOS_ENABLE_SYCL -#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED /* General Settings */ #cmakedefine KOKKOS_ENABLE_CXX17 @@ -40,7 +39,10 @@ #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS -#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#cmakedefine KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE +#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED +#cmakedefine KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK @@ -80,6 +82,7 @@ #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 #cmakedefine 
KOKKOS_ARCH_RISCV_SG2042 +#cmakedefine KOKKOS_ARCH_RISCV_RVA22V #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 @@ -118,10 +121,11 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX90A #cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 +#cmakedefine KOKKOS_ARCH_AMD_GFX942_APU #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 #cmakedefine KOKKOS_ARCH_AMD_GFX1103 -#cmakedefine KOKKOS_ARCH_AMD_GPU +#cmakedefine KOKKOS_ARCH_AMD_GPU "@KOKKOS_ARCH_AMD_GPU@" #cmakedefine KOKKOS_ARCH_VEGA // deprecated #cmakedefine KOKKOS_ARCH_VEGA906 // deprecated #cmakedefine KOKKOS_ARCH_VEGA908 // deprecated diff --git a/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in b/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in deleted file mode 100644 index 626ef5a8ebe..00000000000 --- a/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in +++ /dev/null @@ -1,17 +0,0 @@ -IF (NOT TARGET Kokkos::kokkos) - # Compute the installation prefix relative to this file. 
- get_filename_component(KOKKOS_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - if(KOKKOS_IMPORT_PREFIX STREQUAL "/") - set(KOKKOS_IMPORT_PREFIX "") - endif() - add_library(Kokkos::kokkos INTERFACE IMPORTED) - set_target_properties(Kokkos::kokkos PROPERTIES - INTERFACE_LINK_LIBRARIES "@Kokkos_LIBRARIES@;@KOKKOS_LINK_OPTIONS@" - INTERFACE_COMPILE_FEATURES "@KOKKOS_CXX_STANDARD_FEATURE@" - INTERFACE_COMPILE_OPTIONS "@KOKKOS_ALL_COMPILE_OPTIONS@" - INTERFACE_INCLUDE_DIRECTORIES "${KOKKOS_IMPORT_PREFIX}/include" - ) -ENDIF() diff --git a/lib/kokkos/cmake/Modules/CudaToolkit.cmake b/lib/kokkos/cmake/Modules/CudaToolkit.cmake index eda5541f7c0..b8ac2048b5f 100644 --- a/lib/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/lib/kokkos/cmake/Modules/CudaToolkit.cmake @@ -483,38 +483,40 @@ endif() # Try language- or user-provided path first. if(CUDAToolkit_BIN_DIR) - find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${CUDAToolkit_BIN_DIR} NO_DEFAULT_PATH - ) + ) endif() # Search using CUDAToolkit_ROOT -find_program(CUDAToolkit_NVCC_EXECUTABLE +find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ENV CUDA_PATH PATH_SUFFIXES bin ) # If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. -if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) +if(NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) # Declare error messages now, print later depending on find_package args. 
set(fail_base "Could not find nvcc executable in path specified by") set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") - if (CUDAToolkit_FIND_REQUIRED) - if (DEFINED CUDAToolkit_ROOT) + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) message(FATAL_ERROR ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(FATAL_ERROR ${env_cuda_root_fail}) endif() else() - if (NOT CUDAToolkit_FIND_QUIETLY) - if (DEFINED CUDAToolkit_ROOT) + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) message(STATUS ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(STATUS ${env_cuda_root_fail}) endif() endif() @@ -535,9 +537,9 @@ endif() # We will also search the default symlink location /usr/local/cuda first since # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked # directory is the desired location. -if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (UNIX) - if (NOT APPLE) +if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(UNIX) + if(NOT APPLE) set(platform_base "/usr/local/cuda-") else() set(platform_base "/Developer/NVIDIA/CUDA-") @@ -550,10 +552,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) file(GLOB possible_paths "${platform_base}*") # Iterate the glob results and create a descending list. set(possible_versions) - foreach (p ${possible_paths}) + foreach(p ${possible_paths}) # Extract version number from end of string string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) - if (IS_DIRECTORY ${p} AND p_version) + if(IS_DIRECTORY ${p} AND p_version) list(APPEND possible_versions ${p_version}) endif() endforeach() @@ -563,10 +565,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # every possible version of CUDA installed, this wouldn't create any # significant overhead. 
set(versions) - foreach (v ${possible_versions}) + foreach(v ${possible_versions}) list(LENGTH versions num_versions) # First version, nothing to compare with so just append. - if (num_versions EQUAL 0) + if(num_versions EQUAL 0) list(APPEND versions ${v}) else() # Loop through list. Insert at an index when comparison is @@ -574,9 +576,9 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # happen since this came from a glob list of directories. set(i 0) set(early_terminate FALSE) - while (i LESS num_versions) + while(i LESS num_versions) list(GET versions ${i} curr) - if (v VERSION_GREATER curr) + if(v VERSION_GREATER curr) list(INSERT versions ${i} ${v}) set(early_terminate TRUE) break() @@ -584,7 +586,7 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) math(EXPR i "${i} + 1") endwhile() # If it did not get inserted, place it at the end. - if (NOT early_terminate) + if(NOT early_terminate) list(APPEND versions ${v}) endif() endif() @@ -592,17 +594,18 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # With a descending list of versions, populate possible paths to search. set(search_paths) - foreach (v ${versions}) + foreach(v ${versions}) list(APPEND search_paths "${platform_base}${v}") endforeach() # Force the global default /usr/local/cuda to the front on Unix. - if (UNIX) + if(UNIX) list(INSERT search_paths 0 "/usr/local/cuda") endif() # Now search for nvcc again using the platform default search paths. 
- find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${search_paths} PATH_SUFFIXES bin @@ -617,8 +620,8 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) unset(early_terminate) unset(search_paths) - if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (CUDAToolkit_FIND_REQUIRED) + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(CUDAToolkit_FIND_REQUIRED) message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") elseif(NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") @@ -636,8 +639,7 @@ if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) unset(cuda_dir) endif() -if(CUDAToolkit_NVCC_EXECUTABLE AND - CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) +if(CUDAToolkit_NVCC_EXECUTABLE AND CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value # This if statement will always match, but is used to provide variables for MATCH 1,2,3... 
if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) @@ -648,39 +650,38 @@ if(CUDAToolkit_NVCC_EXECUTABLE AND endif() else() # Compute the version by invoking nvcc - execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") - set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") endif() unset(NVCC_OUT) endif() - get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) # Handle cross compilation if(CMAKE_CROSSCOMPILING) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") # Support for NVPACK - set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") # Support for arm cross compilation set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") # Support for aarch64 cross compilation - if (ANDROID_ARCH_NAME STREQUAL "arm64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") else() set(CUDAToolkit_TARGET_NAME "aarch64-linux") - endif (ANDROID_ARCH_NAME STREQUAL "arm64") + endif(ANDROID_ARCH_NAME STREQUAL "arm64") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - set(CUDAToolkit_TARGET_NAME "x86_64-linux") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") endif() - if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") # add 
known CUDA target root path to the set of directories we search for programs, libraries and headers list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") @@ -702,25 +703,16 @@ else() set(_CUDAToolkit_Pop_Prefix True) endif() - # Find the include/ directory -find_path(CUDAToolkit_INCLUDE_DIR - NAMES cuda_runtime.h -) +find_path(CUDAToolkit_INCLUDE_DIR NAMES cuda_runtime.h) # And find the CUDA Runtime Library libcudart -find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64 lib/x64 -) -if (NOT CUDA_CUDART) - find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64/stubs lib/x64/stubs - ) +find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64 lib/x64) +if(NOT CUDA_CUDART) + find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64/stubs lib/x64/stubs) endif() -if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() @@ -733,24 +725,17 @@ endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. 
include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(CUDAToolkit - REQUIRED_VARS - CUDAToolkit_INCLUDE_DIR - CUDA_CUDART - CUDAToolkit_NVCC_EXECUTABLE - VERSION_VAR - CUDAToolkit_VERSION +find_package_handle_standard_args( + CUDAToolkit REQUIRED_VARS CUDAToolkit_INCLUDE_DIR CUDA_CUDART CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR CUDAToolkit_VERSION ) -mark_as_advanced(CUDA_CUDART - CUDAToolkit_INCLUDE_DIR - CUDAToolkit_NVCC_EXECUTABLE - ) +mark_as_advanced(CUDA_CUDART CUDAToolkit_INCLUDE_DIR CUDAToolkit_NVCC_EXECUTABLE) #----------------------------------------------------------------------------- # Construct result variables if(CUDAToolkit_FOUND) - set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) - get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) endif() #----------------------------------------------------------------------------- @@ -762,27 +747,26 @@ if(CUDAToolkit_FOUND) set(search_names ${lib_name} ${arg_ALT}) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib - ${arg_EXTRA_PATH_SUFFIXES} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib ${arg_EXTRA_PATH_SUFFIXES} ) # Don't try any stub directories intil we have exhausted all other # search locations. 
if(NOT CUDA_${lib_name}_LIBRARY) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs ) endif() mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if(NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") @@ -800,16 +784,15 @@ if(CUDAToolkit_FOUND) target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() - _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _cudatoolkit_find_and_add_import_lib(cuda_driver ALT cuda) - _CUDAToolkit_find_and_add_import_lib(cudart) - _CUDAToolkit_find_and_add_import_lib(cudart_static) + _cudatoolkit_find_and_add_import_lib(cudart) + _cudatoolkit_find_and_add_import_lib(cudart_static) # setup dependencies that are required for cudart_static when building # on linux. 
These are generally only required when using the CUDA toolkit # when CUDA language is disabled - if(NOT TARGET CUDA::cudart_static_deps - AND TARGET CUDA::cudart_static) + if(NOT TARGET CUDA::cudart_static_deps AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) @@ -831,55 +814,64 @@ if(CUDAToolkit_FOUND) endif() endif() - _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library - foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + _cudatoolkit_find_and_add_import_lib(culibos) # it's a static library + foreach(cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() # cuFFTW depends on cuFFT - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft_static) # cuSOLVER depends on cuBLAS, and cuSPARSE - _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) - _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) + _cudatoolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _cudatoolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. 
- _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) - _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + _cudatoolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _cudatoolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries. - foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + foreach( + cuda_lib + nppial + nppicc + nppidei + nppif + nppig + nppim + nppist + nppitc + npps + nppicom + nppisu + ) + _cudatoolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - _CUDAToolkit_find_and_add_import_lib(cupti - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(cupti_static - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti_static EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + _cudatoolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _cudatoolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory # so prefer the NVTOOLSEXT_PATH windows only environment variable # In addition on windows the most common name is nvToolsExt64_1 - find_library(CUDA_nvToolsExt_LIBRARY + find_library( + CUDA_nvToolsExt_LIBRARY NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt - PATHS ENV NVTOOLSEXT_PATH - ENV CUDA_PATH + PATHS ENV NVTOOLSEXT_PATH ENV CUDA_PATH 
PATH_SUFFIXES lib/x64 lib ) endif() - _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + _cudatoolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - _CUDAToolkit_find_and_add_import_lib(OpenCL) + _cudatoolkit_find_and_add_import_lib(OpenCL) endif() if(_CUDAToolkit_Pop_ROOT_PATH) diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index 445f4e93a59..3a6a826197e 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -1,44 +1,40 @@ -IF (NOT CUDAToolkit_ROOT) - IF (NOT CUDA_ROOT) - SET(CUDA_ROOT $ENV{CUDA_ROOT}) - ENDIF() - IF(CUDA_ROOT) - SET(CUDAToolkit_ROOT ${CUDA_ROOT}) - ENDIF() -ENDIF() +if(NOT CUDAToolkit_ROOT) + if(NOT CUDA_ROOT) + set(CUDA_ROOT $ENV{CUDA_ROOT}) + endif() + if(CUDA_ROOT) + set(CUDAToolkit_ROOT ${CUDA_ROOT}) + endif() +endif() -IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") - MESSAGE(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") + message(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") +endif() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") find_package(CUDAToolkit REQUIRED) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - KOKKOS_EXPORT_CMAKE_TPL(CUDAToolkit REQUIRED) -ELSE() + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + kokkos_export_cmake_tpl(CUDAToolkit REQUIRED) +else() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) - IF (TARGET CUDA::cudart) - SET(FOUND_CUDART TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) - ELSE() - SET(FOUND_CUDART FALSE) - ENDIF() + if(TARGET CUDA::cudart) + set(FOUND_CUDART TRUE) + kokkos_export_imported_tpl(CUDA::cudart) + else() + set(FOUND_CUDART 
FALSE) + endif() - IF (TARGET CUDA::cuda_driver) - SET(FOUND_CUDA_DRIVER TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) - ELSE() - SET(FOUND_CUDA_DRIVER FALSE) - ENDIF() + if(TARGET CUDA::cuda_driver) + set(FOUND_CUDA_DRIVER TRUE) + kokkos_export_imported_tpl(CUDA::cuda_driver) + else() + set(FOUND_CUDA_DRIVER FALSE) + endif() include(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) - IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - ENDIF() -ENDIF() + find_package_handle_standard_args(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) + if(FOUND_CUDA_DRIVER AND FOUND_CUDART) + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + endif() +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLHPX.cmake b/lib/kokkos/cmake/Modules/FindTPLHPX.cmake index d7b54fb9c9a..e3c199b7c5d 100644 --- a/lib/kokkos/cmake/Modules/FindTPLHPX.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLHPX.cmake @@ -1,15 +1,10 @@ - -FIND_PACKAGE(HPX REQUIRED 1.8.0) +find_package(HPX REQUIRED 1.8.0) #as of right now, HPX doesn't export correctly #so let's convert it to an interface target -KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE - LINK_LIBRARIES ${HPX_LIBRARIES} - INCLUDES ${HPX_INCLUDE_DIRS} -) +kokkos_create_imported_tpl(HPX INTERFACE LINK_LIBRARIES ${HPX_LIBRARIES} INCLUDES ${HPX_INCLUDE_DIRS}) #this is a bit funky since this is a CMake target #but HPX doesn't export itself correctly -KOKKOS_EXPORT_CMAKE_TPL(HPX) +kokkos_export_cmake_tpl(HPX) #I would prefer all of this gets replaced with #KOKKOS_IMPORT_CMAKE_TPL(HPX) - diff --git a/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake b/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake index cf763b7e5bb..77ce8c71f73 100644 --- a/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake @@ -1 +1 @@ 
-KOKKOS_FIND_IMPORTED(HWLOC HEADER hwloc.h LIBRARY hwloc) +kokkos_find_imported(HWLOC HEADER hwloc.h LIBRARY hwloc) diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake index 8adcdcdbb8e..85ae0b82244 100644 --- a/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) +kokkos_find_imported(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake index 70e0d6c454a..ce428b0aeec 100644 --- a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake @@ -2,17 +2,19 @@ # (which would not be contained in CMake's search paths anyway). # Hence, try if the compiler supports libquadmath natively first before doing # the standard package search. -SET(CMAKE_REQUIRED_LIBRARIES "quadmath") -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +set(CMAKE_REQUIRED_LIBRARIES "quadmath") +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main(void){ __float128 foo = ::sqrtq(123.456); return foo; }" - KOKKOS_QUADMATH_COMPILER_SUPPORT) -IF (KOKKOS_QUADMATH_COMPILER_SUPPORT) - KOKKOS_CREATE_IMPORTED_TPL(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) -ELSE() - KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) -ENDIF() + KOKKOS_QUADMATH_COMPILER_SUPPORT +) +if(KOKKOS_QUADMATH_COMPILER_SUPPORT) + kokkos_create_imported_tpl(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) +else() + kokkos_find_imported(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake index 603510c315e..68de942a698 100644 --- a/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake 
@@ -1,9 +1,10 @@ -INCLUDE(CheckIncludeFileCXX) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) +include(CheckIncludeFileCXX) +check_include_file_cxx(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) +check_include_file_cxx(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main() @@ -13,37 +14,40 @@ CHECK_CXX_SOURCE_COMPILES(" #endif return 0; }" - KOKKOS_NO_TBB_CONFLICT) + KOKKOS_NO_TBB_CONFLICT +) -IF (KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE +if(KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() -ELSE() - FIND_PACKAGE(oneDPL REQUIRED) + endif() +else() + find_package(oneDPL REQUIRED) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE LINK_LIBRARIES oneDPL) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE + LINK_LIBRARIES + oneDPL # 
https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() + endif() # Export oneDPL as a Kokkos dependency - KOKKOS_EXPORT_CMAKE_TPL(oneDPL) -ENDIF() + kokkos_export_cmake_tpl(oneDPL) +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake index f796737f5b2..9673af0b9d9 100644 --- a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -1,7 +1,7 @@ include(FindPackageHandleStandardArgs) -FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) -FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) # FIXME_HIP Starting with ROCm 5.5 it is not necessary to link againt clang_rt. 
# We keep the code as is for now because it is hard to find the version of ROCM @@ -16,18 +16,24 @@ execute_process( COMMAND ${CMAKE_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE CLANG_RT_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE CLANG_RT_CHECK) + RESULT_VARIABLE CLANG_RT_CHECK +) -if( NOT "${CLANG_RT_CHECK}" STREQUAL "0" ) +if(NOT "${CLANG_RT_CHECK}" STREQUAL "0") # if the above failed, we delete CLANG_RT_LIBRARY to make the args check # below fail unset(CLANG_RT_LIBRARY) endif() - find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY CLANG_RT_LIBRARY) -kokkos_create_imported_tpl(ROCM INTERFACE - LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} ${CLANG_RT_LIBRARY} - COMPILE_DEFINITIONS __HIP_ROCclr__ +kokkos_create_imported_tpl( + ROCM + INTERFACE + LINK_LIBRARIES + ${HSA_RUNTIME_LIBRARY} + ${AMD_HIP_LIBRARY} + ${CLANG_RT_LIBRARY} + COMPILE_DEFINITIONS + __HIP_ROCclr__ ) diff --git a/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake b/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake index dae7dc3c952..b4b905795dd 100644 --- a/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake @@ -6,10 +6,10 @@ # behavior of ROCm 5.7 and later for earlier version of ROCm we set # AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If # the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. 
-SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") -SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") -FIND_PACKAGE(rocthrust REQUIRED) -KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) +set(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +set(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +find_package(rocthrust REQUIRED) +kokkos_create_imported_tpl(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) # Export ROCTHRUST as a Kokkos dependency -KOKKOS_EXPORT_CMAKE_TPL(rocthrust) +kokkos_export_cmake_tpl(rocthrust) diff --git a/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake b/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake index ff0db5123f8..280b8641da1 100644 --- a/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake @@ -1,15 +1,14 @@ -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE(Threads) +include(FindPackageHandleStandardArgs) +find_package(Threads) -IF (TARGET Threads::Threads) - SET(FOUND_THREADS TRUE) -ELSE() - SET(FOUND_THREADS FALSE) -ENDIF() +if(TARGET Threads::Threads) + set(FOUND_THREADS TRUE) +else() + set(FOUND_THREADS FALSE) +endif() -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLTHREADS DEFAULT_MSG FOUND_THREADS) +find_package_handle_standard_args(TPLTHREADS DEFAULT_MSG FOUND_THREADS) #Only create the TPL if we succeed -IF (FOUND_THREADS) - KOKKOS_CREATE_IMPORTED_TPL(THREADS INTERFACE LINK_OPTIONS - ${CMAKE_THREAD_LIBS_INIT}) -ENDIF() +if(FOUND_THREADS) + kokkos_create_imported_tpl(THREADS INTERFACE LINK_OPTIONS ${CMAKE_THREAD_LIBS_INIT}) +endif() diff --git a/lib/kokkos/cmake/README.md b/lib/kokkos/cmake/README.md index 385bbfcd5d5..0548e89a90e 100644 --- a/lib/kokkos/cmake/README.md +++ b/lib/kokkos/cmake/README.md @@ -310,20 +310,6 @@ When Kokkos is loaded by a downstream project, this TPL must be loaded. 
Calling this function simply appends text recording the location where the TPL was found and adding a `find_dependency(...)` call that will reload the CMake target. -### The Great TriBITS Compromise - -TriBITS was a masterpiece of CMake version 2 before the modern CMake idioms of building and using. -TriBITS greatly limited verbosity of CMake files, handled complicated dependency trees between packages, and handled automatically setting up include and linker paths for dependent libraries. - -Kokkos is now used by numerous projects that don't (and won't) depend on TriBITS for their build systems. -Kokkos has to work outside of TriBITS and provide a standard CMake 3+ build system. -At the same time, Kokkos is used by numerous projects that depend on TriBITS and don't (and won't) switch to a standard CMake 3+ build system. - -Instead of calling functions `TRIBITS_X(...)`, the CMake calls wrapper functions `KOKKOS_X(...)`. -If TriBITS is available (as in Trilinos), `KOKKOS_X` will just be a thin wrapper around `TRIBITS_X`. -If TriBITS is not available, Kokkos maps `KOKKOS_X` calls to native CMake that complies with CMake 3 idioms. -For the time being, this seems the most sensible way to handle the competing requirements of a standalone modern CMake and TriBITS build system. 
- ##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) diff --git a/lib/kokkos/cmake/build_env_info.cmake b/lib/kokkos/cmake/build_env_info.cmake index 0eeb6372455..ac28b2d8503 100644 --- a/lib/kokkos/cmake/build_env_info.cmake +++ b/lib/kokkos/cmake/build_env_info.cmake @@ -2,111 +2,108 @@ find_package(Git QUIET) -SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) -SET(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -SET(post_configure_dir ${CMAKE_BINARY_DIR}/generated) +set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) +set(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) +set(post_configure_dir ${CMAKE_BINARY_DIR}/generated) -SET(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) -SET(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) +set(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) +set(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) -FUNCTION(check_git_write git_hash git_clean_status) - FILE( - WRITE - ${CMAKE_BINARY_DIR}/git-state.txt - "${git_hash}-${git_clean_status}") -ENDFUNCTION() +function(check_git_write git_hash git_clean_status) + file(WRITE ${CMAKE_BINARY_DIR}/git-state.txt "${git_hash}-${git_clean_status}") +endfunction() -FUNCTION(check_git_read git_hash) - IF(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) - FILE(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) - LIST(GET CONTENT 0 var) +function(check_git_read git_hash) + if(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) + file(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) + list(GET CONTENT 0 var) message(DEBUG "Cached Git hash: ${var}") - SET(${git_hash} ${var} PARENT_SCOPE) + set(${git_hash} ${var} PARENT_SCOPE) else() - SET(${git_hash} "INVALID" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -FUNCTION(check_git_version) - IF(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) - FILE( - COPY 
${pre_configure_dir}/Kokkos_Version_Info.hpp - DESTINATION ${post_configure_dir}) - ENDIF() - - IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) + set(${git_hash} "INVALID" PARENT_SCOPE) + endif() +endfunction() + +function(check_git_version) + if(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) + file(COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp DESTINATION ${post_configure_dir}) + endif() + + if(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) return() - ENDIF() + endif() # Get the current working branch execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit description execute_process( COMMAND ${GIT_EXECUTABLE} show -s --format=%s WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DESCRIPTION - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit date execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%cI WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DATE - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Check if repo is dirty / clean execute_process( COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} RESULT_VARIABLE IS_DIRTY - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - IF(IS_DIRTY EQUAL 0) - SET(GIT_CLEAN_STATUS "CLEAN") + if(IS_DIRTY EQUAL 0) + set(GIT_CLEAN_STATUS "CLEAN") else() - SET(GIT_CLEAN_STATUS "DIRTY") - ENDIF() + set(GIT_CLEAN_STATUS "DIRTY") + endif() # Get the latest abbreviated commit hash of the working branch execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%h WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE) + 
OUTPUT_STRIP_TRAILING_WHITESPACE + ) check_git_read(GIT_HASH_CACHE) - IF(NOT EXISTS ${post_configure_dir}) + if(NOT EXISTS ${post_configure_dir}) file(MAKE_DIRECTORY ${post_configure_dir}) - ENDIF() + endif() # Only update the git_version.cpp if the hash has changed. This will # prevent us from rebuilding the project more than we need to. - IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} - OR NOT EXISTS ${post_configure_file}) + if(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${post_configure_file}) # Set the GIT_HASH_CACHE variable so the next build won't have # to regenerate the source file. check_git_write(${GIT_COMMIT_HASH} ${GIT_CLEAN_STATUS}) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) message(STATUS "Configured git information in ${post_configure_file}") - ENDIF() -ENDFUNCTION() + endif() +endfunction() -FUNCTION(check_git_setup) +function(check_git_setup) add_custom_target( - AlwaysCheckGit COMMAND ${CMAKE_COMMAND} - -DRUN_CHECK_GIT_VERSION=1 - -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} - -P ${CURRENT_LIST_DIR}/build_env_info.cmake - BYPRODUCTS ${post_configure_file}) + AlwaysCheckGit COMMAND ${CMAKE_COMMAND} -DRUN_CHECK_GIT_VERSION=1 -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} -P + ${CURRENT_LIST_DIR}/build_env_info.cmake BYPRODUCTS ${post_configure_file} + ) add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) @@ -114,9 +111,9 @@ FUNCTION(check_git_setup) add_dependencies(impl_git_version AlwaysCheckGit) check_git_version() -ENDFUNCTION() +endfunction() # This is used to run this function from an external cmake process. 
-IF(RUN_CHECK_GIT_VERSION) +if(RUN_CHECK_GIT_VERSION) check_git_version() -ENDIF() +endif() diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp b/lib/kokkos/cmake/compile_tests/amd_apu.cc similarity index 57% rename from lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp rename to lib/kokkos/cmake/compile_tests/amd_apu.cc index 3c599b95a6f..a9c1edbd57b 100644 --- a/lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp +++ b/lib/kokkos/cmake/compile_tests/amd_apu.cc @@ -14,5 +14,25 @@ // //@HEADER -#include -#include +#include +#include + +int main() { + hipDeviceProp_t hipProp; + hipError_t error = hipGetDeviceProperties(&hipProp, 0); + + if (error != hipSuccess) { + std::cout << hipGetErrorString(error) << '\n'; + return error; + } + + if (hipProp.integrated == 1) { + // We detected an APU + std::cout << "ON"; + } else { + // We detected a discrete GPU + std::cout << "OFF"; + } + + return 0; +} diff --git a/lib/kokkos/cmake/cray.cmake b/lib/kokkos/cmake/cray.cmake index 08912f5130f..4ce5352bda2 100644 --- a/lib/kokkos/cmake/cray.cmake +++ b/lib/kokkos/cmake/cray.cmake @@ -1,9 +1,6 @@ - - function(kokkos_set_cray_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake index 5b6afd61512..49eaf883a46 100644 --- a/lib/kokkos/cmake/deps/CUDA.cmake +++ b/lib/kokkos/cmake/deps/CUDA.cmake @@ -17,24 +17,24 @@ # Check for CUDA support -SET(_CUDA_FAILURE OFF) 
+set(_CUDA_FAILURE OFF) # Have CMake find CUDA -IF(NOT _CUDA_FAILURE) - FIND_PACKAGE(CUDA 3.2) - IF (NOT CUDA_FOUND) - SET(_CUDA_FAILURE ON) - ENDIF() -ENDIF() +if(NOT _CUDA_FAILURE) + find_package(CUDA 3.2) + if(NOT CUDA_FOUND) + set(_CUDA_FAILURE ON) + endif() +endif() -IF(NOT _CUDA_FAILURE) +if(NOT _CUDA_FAILURE) # if we haven't met failure macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target) - TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY) + tribits_add_library(${cuda_target} ${ARGN} CUDALIBRARY) endmacro() - GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) -ELSE() - SET(TPL_ENABLE_CUDA OFF) -ENDIF() + global_set(TPL_CUDA_LIBRARY_DIRS) + global_set(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + global_set(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) +else() + set(TPL_ENABLE_CUDA OFF) +endif() diff --git a/lib/kokkos/cmake/deps/HWLOC.cmake b/lib/kokkos/cmake/deps/HWLOC.cmake index 77d5a9b83a6..52d8368d041 100644 --- a/lib/kokkos/cmake/deps/HWLOC.cmake +++ b/lib/kokkos/cmake/deps/HWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. 
# @@ -26,7 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/lib/kokkos/cmake/deps/Pthread.cmake b/lib/kokkos/cmake/deps/Pthread.cmake index e879bff3741..b811f850841 100644 --- a/lib/kokkos/cmake/deps/Pthread.cmake +++ b/lib/kokkos/cmake/deps/Pthread.cmake @@ -15,31 +15,27 @@ # ************************************************************************ # @HEADER +set(USE_THREADS FALSE) -SET(USE_THREADS FALSE) - -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. 
- IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(Pthread) -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") + kokkos_create_imported_tpl_library(Pthread) +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/lib/kokkos/cmake/deps/quadmath.cmake b/lib/kokkos/cmake/deps/quadmath.cmake index 6aef08e8812..9006d0cb9ef 100644 --- a/lib/kokkos/cmake/deps/quadmath.cmake +++ b/lib/kokkos/cmake/deps/quadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +kokkos_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/lib/kokkos/cmake/fake_tribits.cmake b/lib/kokkos/cmake/fake_tribits.cmake index a18d2ac518a..d3fe1e6e2f6 100644 --- a/lib/kokkos/cmake/fake_tribits.cmake +++ b/lib/kokkos/cmake/fake_tribits.cmake @@ -1,288 +1,213 @@ #These are tribits wrappers used by all projects in the Kokkos ecosystem -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) +include(CMakeParseArguments) +include(CTest) -FUNCTION(ASSERT_DEFINED VARS) - FOREACH(VAR 
${VARS}) - IF(NOT DEFINED ${VAR}) - MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!") - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -IF(NOT KOKKOS_HAS_TRILINOS) -MACRO(APPEND_GLOB VAR) - FILE(GLOB LOCAL_TMP_VAR ${ARGN}) - LIST(APPEND ${VAR} ${LOCAL_TMP_VAR}) -ENDMACRO() - -MACRO(GLOBAL_SET VARNAME) - SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) -ENDMACRO() - -MACRO(PREPEND_GLOBAL_SET VARNAME) - ASSERT_DEFINED(${VARNAME}) - GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) -ENDMACRO() -ENDIF() - -MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") - ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) - SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE) -ENDMACRO() - -FUNCTION(KOKKOS_ADD_TEST) - if (KOKKOS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "SKIP_TRIBITS" - "EXE;NAME;TOOL" - "ARGS" - ${ARGN}) - - IF(TEST_SKIP_TRIBITS) - MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") - RETURN() - ENDIF() - - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - COMM serial mpi - NUM_MPI_PROCS 1 - ARGS ${TEST_ARGS} - ${TEST_UNPARSED_ARGUMENTS} - ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED - ) - - # We will get prepended package name here - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - - # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults - # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, - # the test won't actually be added and attempting to set a property on it below - # will yield an error. 
- if(TARGET ${EXE}) - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - foreach(TEST_ADDED ${ALL_TESTS_ADDED}) - set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_TOOLS_LIBS=$") - endforeach() - endif() +function(ASSERT_DEFINED VARS) + foreach(VAR ${VARS}) + if(NOT DEFINED ${VAR}) + message(SEND_ERROR "Error, the variable ${VAR} is not defined!") endif() + endforeach() +endfunction() + +macro(APPEND_GLOB VAR) + file(GLOB LOCAL_TMP_VAR ${ARGN}) + list(APPEND ${VAR} ${LOCAL_TMP_VAR}) +endmacro() + +macro(GLOBAL_SET VARNAME) + set(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) +endmacro() + +macro(PREPEND_GLOBAL_SET VARNAME) + assert_defined(${VARNAME}) + global_set(${VARNAME} ${ARGN} ${${VARNAME}}) +endmacro() + +macro(ADD_INTERFACE_LIBRARY LIB_NAME) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") + add_library(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + set_target_properties(${LIB_NAME} PROPERTIES INTERFACE TRUE) +endmacro() + +function(KOKKOS_ADD_TEST) + cmake_parse_arguments( + TEST "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" + ${ARGN} + ) + # To match Tribits, we should always be receiving + # the root names of exes/libs + if(TEST_EXE) + set(EXE_ROOT ${TEST_EXE}) else() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL;SKIP_TRIBITS" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" - "CATEGORIES;ARGS" - ${ARGN}) - # To match Tribits, we should always be receiving - # the root names of exes/libs - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - # Prepend package name to the test name - # These should be the full target name - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) - ELSE() - ADD_TEST(NAME 
${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_TOOL) - ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - ENDIF() -ENDFUNCTION() - -MACRO(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) - ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) - TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) - TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) -ENDMACRO() + set(EXE_ROOT ${TEST_NAME}) + endif() + # Prepend package name to the test name + # These should be the full target name + set(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) + + # For compatibility with Trilinos testing, we support: + # * `-D _DISABLE=ON` + # * `-D _EXTRA_ARGS=";;;..."` + # * `-D _SET_RUN_SERIAL=ON` + if(${TEST_NAME}_DISABLE) + return() + endif() -FUNCTION(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES(${TPL_NAME} ${ARGN}) + set(EXE ${PACKAGE_NAME}_${EXE_ROOT}) + if(WIN32) + add_test(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} + ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS} + ) else() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" - ${ARGN}) - - SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE) - IF (PARSE_REQUIRED_LIBS_NAMES) - FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES 
${PARSE_REQUIRED_LIBS_NAMES}) - IF(NOT TPL_${TPL_NAME}_LIBRARIES) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (PARSE_REQUIRED_HEADERS) - FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) - IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (_${TPL_NAME}_ENABLE_SUCCESS) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME}) - ENDIF() - VERIFY_EMPTY(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) + add_test(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS}) endif() -ENDFUNCTION() - -MACRO(KOKKOS_TARGET_COMPILE_OPTIONS TARGET) -if(KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -else() - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -endif() -ENDMACRO() - -FUNCTION(KOKKOS_LIB_TYPE LIB RET) -GET_TARGET_PROPERTY(PROP ${LIB} TYPE) -IF (${PROP} STREQUAL "INTERFACE_LIBRARY") - SET(${RET} "INTERFACE" PARENT_SCOPE) -ELSE() - SET(${RET} "PUBLIC" PARENT_SCOPE) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) -IF(KOKKOS_HAS_TRILINOS) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - #don't trust tribits to do this correctly - but need to add package name - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSEIF(TARGET ${TARGET}) - #the target actually exists - this means we are doing separate libs - #or this a test library - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSE() - GET_PROPERTY(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - IF (${TARGET} IN_LIST LIBS) - SET_PROPERTY(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) - ELSE() - MESSAGE(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") - ENDIF() -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) -IF(KOKKOS_HAS_TRILINOS) - #do nothing -ELSE() - SET(options INTERFACE) - SET(oneValueArgs) - SET(multiValueArgs) - CMAKE_PARSE_ARGUMENTS(PARSE - 
"INTERFACE" - "" - "" - ${ARGN}) - SET(LINK_TYPE) - IF(PARSE_INTERFACE) - SET(LINK_TYPE INTERFACE) - ELSE() - SET(LINK_TYPE PUBLIC) - ENDIF() - TARGET_LINK_LIBRARIES(${TARGET} ${LINK_TYPE} ${DEPLIB}) - VERIFY_EMPTY(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_TEST_LIBRARY NAME) -IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN} TESTONLY) -ELSE() - SET(oneValueArgs) - SET(multiValueArgs HEADERS SOURCES) - - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES;DEPLIBS" - ${ARGN}) - - SET(LIB_TYPE) - IF (PARSE_STATIC) - SET(LIB_TYPE STATIC) - ELSEIF (PARSE_SHARED) - SET(LIB_TYPE SHARED) - ENDIF() + # Trilinos testing benefits from labeling the tests as "Kokkos" tests + set_tests_properties(${TEST_NAME} PROPERTIES LABELS Kokkos) + if(${TEST_NAME}_SET_RUN_SERIAL) + set_tests_properties(${TEST_NAME} PROPERTIES RUN_SERIAL ON) + endif() + # TriBITS doesn't actually currently support `-D _ENVIRONMENT` + # but we decided to add it anyway + if(${TEST_NAME}_ENVIRONMENT) + set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT "${${TEST_NAME}_ENVIRONMENT}") + endif() + if(TEST_WILL_FAIL) + set_tests_properties(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + endif() + if(TEST_FAIL_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + endif() + if(TEST_PASS_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + endif() + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + set_property( + TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$" + ) + endif() + verify_empty(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +endfunction() + +macro(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) + add_interface_library(TPL_LIB_${TPL_NAME}) + target_link_libraries(TPL_LIB_${TPL_NAME} 
LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) + target_include_directories(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) +endmacro() + +function(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) + cmake_parse_arguments(PARSE "" "" "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" ${ARGN}) + + set(_${TPL_NAME}_ENABLE_SUCCESS TRUE) + if(PARSE_REQUIRED_LIBS_NAMES) + find_library(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) + if(NOT TPL_${TPL_NAME}_LIBRARIES) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(PARSE_REQUIRED_HEADERS) + find_path(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) + if(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(_${TPL_NAME}_ENABLE_SUCCESS) + kokkos_create_imported_tpl_library(${TPL_NAME}) + endif() + verify_empty(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - ADD_LIBRARY(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) - IF (PARSE_DEPLIBS) - TARGET_LINK_LIBRARIES(${NAME} PRIVATE ${PARSE_DEPLIBS}) - ENDIF() -ENDIF() -ENDFUNCTION() +function(KOKKOS_LIB_TYPE LIB RET) + get_target_property(PROP ${LIB} TYPE) + if(${PROP} STREQUAL "INTERFACE_LIBRARY") + set(${RET} "INTERFACE" PARENT_SCOPE) + else() + set(${RET} "PUBLIC" PARENT_SCOPE) + endif() +endfunction() + +function(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) + if(TARGET ${TARGET}) + #the target actually exists - this means we are doing separate libs + #or this a test library + kokkos_lib_type(${TARGET} INCTYPE) + target_include_directories(${TARGET} ${INCTYPE} ${ARGN}) + else() + get_property(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + if(${TARGET} IN_LIST LIBS) + set_property(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) + else() + message(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") 
+ endif() + endif() +endfunction() + +function(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) + set(options INTERFACE) + set(oneValueArgs) + set(multiValueArgs) + cmake_parse_arguments(PARSE "INTERFACE" "" "" ${ARGN}) + set(LINK_TYPE) + if(PARSE_INTERFACE) + set(LINK_TYPE INTERFACE) + else() + set(LINK_TYPE PUBLIC) + endif() + target_link_libraries(${TARGET} ${LINK_TYPE} ${DEPLIB}) + verify_empty(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() +function(KOKKOS_ADD_TEST_LIBRARY NAME) + set(oneValueArgs) + set(multiValueArgs HEADERS SOURCES) -FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) -IF(KOKKOS_HAS_TRILINOS) - TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS( - INC - "REQUIRED_DURING_INSTALLATION_TESTING" - "" - "" - ${ARGN} - ) - INCLUDE_DIRECTORIES(${INC_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES;DEPLIBS" ${ARGN}) + set(LIB_TYPE) + if(PARSE_STATIC) + set(LIB_TYPE STATIC) + elseif(PARSE_SHARED) + set(LIB_TYPE SHARED) + endif() -MACRO(PRINTALL match) -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - if("${_variableName}" MATCHES "${match}") - message(STATUS "${_variableName}=${${_variableName}}") + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) endif() -endforeach() -ENDMACRO() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + add_library(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) + if(PARSE_DEPLIBS) + target_link_libraries(${NAME} PRIVATE ${PARSE_DEPLIBS}) + endif() +endfunction() + +function(KOKKOS_INCLUDE_DIRECTORIES) + cmake_parse_arguments(INC "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN}) + include_directories(${INC_UNPARSED_ARGUMENTS}) +endfunction() + +macro(PRINTALL match) + get_cmake_property(_variableNames VARIABLES) + list(SORT _variableNames) + foreach(_variableName ${_variableNames}) + if("${_variableName}" MATCHES "${match}") + message(STATUS 
"${_variableName}=${${_variableName}}") + endif() + endforeach() +endmacro() -MACRO(SET_GLOBAL_REPLACE SUBSTR VARNAME) - STRING(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDMACRO() +macro(SET_GLOBAL_REPLACE SUBSTR VARNAME) + string(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) + global_set(${VARNAME} ${TEMP}) +endmacro() -FUNCTION(GLOBAL_APPEND VARNAME) +function(GLOBAL_APPEND VARNAME) #We make this a function since we are setting variables #and want to use scope to avoid overwriting local variables - SET(TEMP ${${VARNAME}}) - LIST(APPEND TEMP ${ARGN}) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDFUNCTION() + set(TEMP ${${VARNAME}}) + list(APPEND TEMP ${ARGN}) + global_set(${VARNAME} ${TEMP}) +endfunction() diff --git a/lib/kokkos/cmake/gnu.cmake b/lib/kokkos/cmake/gnu.cmake index aa11fe87b11..e53b4a7becd 100644 --- a/lib/kokkos/cmake/gnu.cmake +++ b/lib/kokkos/cmake/gnu.cmake @@ -1,23 +1,21 @@ - -FUNCTION(kokkos_set_gnu_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_gnu_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. 
- IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) + endif() +endfunction() diff --git a/lib/kokkos/cmake/intel.cmake b/lib/kokkos/cmake/intel.cmake index 7e6ee3358c9..b7752caabdf 100644 --- a/lib/kokkos/cmake/intel.cmake +++ b/lib/kokkos/cmake/intel.cmake @@ -1,18 +1,15 @@ - -FUNCTION(kokkos_set_intel_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_intel_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. 
- IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) -ENDFUNCTION() - - + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + set(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) +endfunction() diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index 0b3d4044d0b..ae45da806f7 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -1,611 +1,732 @@ - -FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) +function(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) #all optimizations off by default - KOKKOS_DEPENDENT_OPTION(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) - SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - IF(KOKKOS_ARCH_${SUFFIX}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) - SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - + kokkos_dependent_option(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) + set(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + if(KOKKOS_ARCH_${SUFFIX}) + list(APPEND 
KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) + set(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) + endif() +endfunction() # Make sure devices and compiler ID are done -KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID) -KOKKOS_CFG_DEPENDS(ARCH DEVICES) -KOKKOS_CFG_DEPENDS(ARCH OPTIONS) +kokkos_cfg_depends(ARCH COMPILER_ID) +kokkos_cfg_depends(ARCH DEVICES) +kokkos_cfg_depends(ARCH OPTIONS) -KOKKOS_CHECK_DEPRECATED_OPTIONS( - ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" - ARCH_RYZEN "Please replace RYZEN with ZEN or ZEN2, depending on your platform" +kokkos_check_deprecated_options( + ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" ARCH_RYZEN + "Please replace RYZEN with ZEN or ZEN2, depending on your platform" ) #------------------------------------------------------------------------------- # List of possible host architectures. #------------------------------------------------------------------------------- -SET(KOKKOS_ARCH_LIST) +set(KOKKOS_ARCH_LIST) include(CheckCXXCompilerFlag) -KOKKOS_DEPRECATED_LIST(ARCH ARCH) - -SET(HOST_ARCH_ALREADY_SPECIFIED "") -MACRO(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) - KOKKOS_ARCH_OPTION(${ARCH} HOST "${LABEL}" TRUE) - IF(KOKKOS_ARCH_${ARCH}) - IF(HOST_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. 
If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) - ENDIF() -ENDMACRO() - -DECLARE_AND_CHECK_HOST_ARCH(NATIVE "local machine") -DECLARE_AND_CHECK_HOST_ARCH(AMDAVX "AMD chip") -DECLARE_AND_CHECK_HOST_ARCH(ARMV80 "ARMv8.0 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") -DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") -DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") -DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") -DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ICL "Intel Ice Lake Client CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(SKL "Intel Skylake Client CPUs") -DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") -DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") - -IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_CUDA_ARCHS ON) -ENDIF() - -KOKKOS_ARCH_OPTION(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER32 GPU "NVIDIA Kepler generation CC 
3.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") - -IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -ENDIF() +kokkos_deprecated_list(ARCH ARCH) + +set(HOST_ARCH_ALREADY_SPECIFIED "") +macro(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) + kokkos_arch_option(${ARCH} HOST "${LABEL}" TRUE) + if(KOKKOS_ARCH_${ARCH}) + if(HOST_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." 
+ ) + endif() + set(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) + endif() +endmacro() + +declare_and_check_host_arch(NATIVE "local machine") +declare_and_check_host_arch(AMDAVX "AMD chip") +declare_and_check_host_arch(ARMV80 "ARMv8.0 Compatible CPU") +declare_and_check_host_arch(ARMV81 "ARMv8.1 Compatible CPU") +declare_and_check_host_arch(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") +declare_and_check_host_arch(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") +declare_and_check_host_arch(A64FX "ARMv8.2 with SVE Support") +declare_and_check_host_arch(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") +declare_and_check_host_arch(SNB "Intel Sandy/Ivy Bridge CPUs") +declare_and_check_host_arch(HSW "Intel Haswell CPUs") +declare_and_check_host_arch(BDW "Intel Broadwell Xeon E-class CPUs") +declare_and_check_host_arch(ICL "Intel Ice Lake Client CPUs (AVX512)") +declare_and_check_host_arch(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(SKL "Intel Skylake Client CPUs") +declare_and_check_host_arch(SKX "Intel Skylake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(KNC "Intel Knights Corner Xeon Phi") +declare_and_check_host_arch(KNL "Intel Knights Landing Xeon Phi") +declare_and_check_host_arch(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(POWER8 "IBM POWER8 CPUs") +declare_and_check_host_arch(POWER9 "IBM POWER9 CPUs") +declare_and_check_host_arch(ZEN "AMD Zen architecture") +declare_and_check_host_arch(ZEN2 "AMD Zen2 architecture") +declare_and_check_host_arch(ZEN3 "AMD Zen3 architecture") +declare_and_check_host_arch(RISCV_SG2042 "SG2042 (RISC-V) CPUs") +declare_and_check_host_arch(RISCV_RVA22V "RVA22V (RISC-V) CPUs") + +if(Kokkos_ENABLE_CUDA + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_CUDA_ARCHS ON) +endif() + +kokkos_arch_option(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER32 GPU "NVIDIA Kepler 
generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") + +if(Kokkos_ENABLE_HIP + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_HIP_ARCHS ON) +endif() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940) -LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) -LIST(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA906 
AMD_GFX906) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) -LIST(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) +list(APPEND SUPPORTED_AMD_GPUS MI300 MI300A MI300) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX942_APU AMD_GFX940) +list(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx942 gfx940) +list(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) +list(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) +list(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) +list(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) +list(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) +list(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) +list(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) +list(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17 -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - KOKKOS_ARCH_OPTION(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") -ENDFOREACH() - -IF(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) - SET(KOKKOS_SHOW_SYCL_ARCHS ON) -ENDIF() - -KOKKOS_ARCH_OPTION(INTEL_GEN GPU "SPIR64-based devices, e.g. 
Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") - -IF(KOKKOS_ENABLE_COMPILER_WARNINGS) - SET(COMMON_WARNINGS - "-Wall" "-Wextra" "-Wunused-parameter" "-Wshadow" "-pedantic" - "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + kokkos_arch_option(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") +endforeach() + +if(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) + set(KOKKOS_SHOW_SYCL_ARCHS ON) +endif() + +kokkos_arch_option(INTEL_GEN GPU "SPIR64-based devices, e.g. 
Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") + +if(KOKKOS_ENABLE_COMPILER_WARNINGS) + set(COMMON_WARNINGS + "-Wall" + "-Wextra" + "-Wunused-parameter" + "-Wshadow" + "-pedantic" + "-Wsign-compare" + "-Wtype-limits" + "-Wuninitialized" + "-Wsuggest-override" + ) # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH - IF(Kokkos_ENABLE_LIBQUADMATH) + if(Kokkos_ENABLE_LIBQUADMATH) # warning: non-standard suffix on floating constant [-Wpedantic] - LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") - ENDIF() + list(REMOVE_ITEM COMMON_WARNINGS "-pedantic") + endif() # NVHPC compiler does not support -Wtype-limits. 
- IF(KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") - ENDIF() - ENDIF() - - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - LIST(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") - ENDIF() - - SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" - ${COMMON_WARNINGS}) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - LIST(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") - ENDIF() + if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") + endif() + endif() + + # ICPC doesn't support -Wsuggest-override + if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + list(REMOVE_ITEM COMMON_WARNINGS "-Wsuggest-override") + endif() + + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + list(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") + endif() + + set(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" ${COMMON_WARNINGS}) + if(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + list(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") + endif() # Not using COMPILER_SPECIFIC_FLAGS function so the warning flags are not passed downstream - IF(CMAKE_CXX_COMPILER_ID STREQUAL GNU) - STRING(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") - ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + string(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) # FIXME_NVHPC - ELSE() - STRING(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") - ENDIF() - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") -ENDIF() - + else() + string(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") +endif() #------------------------------- KOKKOS_CUDA_OPTIONS --------------------------- #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") - ENDIF() -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-extended-lambda") + global_append(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") +endif() + +if(KOKKOS_ENABLE_CUDA_CONSTEXPR) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") + endif() +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + set(CUDA_ARCH_FLAG "--cuda-gpu-arch") + global_append(KOKKOS_CUDA_OPTIONS -x cuda) # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR - IF (Kokkos_CUDA_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) - ELSEIF(CUDAToolkit_BIN_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - SET(CUDA_ARCH_FLAG "-arch") -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - STRING(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) - IF (KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -lineinfo) - ENDIF() - UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -ENDIF() - + if(Kokkos_CUDA_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + elseif(CUDAToolkit_BIN_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) 
+ endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + set(CUDA_ARCH_FLAG "-arch") +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + string(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) + if(KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + global_append(KOKKOS_CUDA_OPTIONS -lineinfo) + endif() + unset(_UPPERCASE_CMAKE_BUILD_TYPE) +endif() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- -KOKKOS_OPTION(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") -KOKKOS_OPTION(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_FLAGS) -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_LINK) +kokkos_option(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") +kokkos_option(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") +mark_as_advanced(Kokkos_IMPL_AMDGPU_FLAGS) +mark_as_advanced(Kokkos_IMPL_AMDGPU_LINK) #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_ENABLE_HIP) - SET(AMDGPU_ARCH_FLAG "--offload-arch") - IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF (NOT CMAKE_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -xhip) - IF(DEFINED ENV{ROCM_PATH}) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) - ENDIF() - ENDIF() -ENDIF() - - -IF(KOKKOS_ARCH_NATIVE) - IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") - MESSAGE(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") - ENDIF() - - STRING(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) - IF(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") - SET(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") - ELSE() - SET(KOKKOS_NATIVE_FLAGS "-mcpu=native") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - NVHPC -tp=native - DEFAULT ${KOKKOS_NATIVE_FLAGS} +global_set(KOKKOS_AMDGPU_OPTIONS) 
+if(KOKKOS_ENABLE_HIP) + set(AMDGPU_ARCH_FLAG "--offload-arch") + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(NOT CMAKE_CXX_STANDARD) + message(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") + endif() + global_append(KOKKOS_AMDGPU_OPTIONS -xhip) + if(DEFINED ENV{ROCM_PATH}) + global_append(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + endif() + endif() +endif() + +if(KOKKOS_ARCH_NATIVE) + if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") + message(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") + endif() + + string(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) + if(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") + set(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") + else() + set(KOKKOS_NATIVE_FLAGS "-mcpu=native") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID NVHPC -tp=native DEFAULT ${KOKKOS_NATIVE_FLAGS}) +endif() + +if(KOKKOS_ARCH_ARMV80) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV80) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a +endif() + +if(KOKKOS_ARCH_ARMV81) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.1-a ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV81) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.1-a +endif() + +if(KOKKOS_ARCH_ARMV8_THUNDERX) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV80 ON) #Not a cache 
variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a + -mtune=thunderx ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV8_THUNDERX) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a -mtune=thunderx +endif() + +if(KOKKOS_ARCH_ARMV8_THUNDERX2) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=thunderx2t99 + -mtune=thunderx2t99 ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV8_THUNDERX2) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99 +endif() + +if(KOKKOS_ARCH_A64FX) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Clang + -march=armv8.2-a+sve + -msve-vector-bits=512 + GNU + -march=armv8.2-a+sve + -msve-vector-bits=512 + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.2-a+sve ) -ENDIF() - -IF (KOKKOS_ARCH_A64FX) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Clang -march=armv8.2-a+sve -msve-vector-bits=512 - GNU -march=armv8.2-a+sve -msve-vector-bits=512 - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.2-a+sve - ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV9_GRACE) - SET(KOKKOS_ARCH_ARM_NEON ON) +if(KOKKOS_ARCH_ARMV9_GRACE) + set(KOKKOS_ARCH_ARM_NEON ON) 
check_cxx_compiler_flag("-mcpu=neoverse-n2" COMPILER_SUPPORTS_NEOVERSE_N2) check_cxx_compiler_flag("-msve-vector-bits=128" COMPILER_SUPPORTS_SVE_VECTOR_BITS) - IF (COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128 - ) - ELSE() - MESSAGE(WARNING "Compiler does not support ARMv9 Grace architecture") - ENDIF() -ENDIF() - -IF (KOKKOS_ARCH_ZEN) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen - DEFAULT -march=znver1 -mtune=znver1 + if(COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128) + else() + message(WARNING "Compiler does not support ARMv9 Grace architecture") + endif() +endif() + +if(KOKKOS_ARCH_ZEN) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen + DEFAULT + -march=znver1 + -mtune=znver1 ) - SET(KOKKOS_ARCH_AMD_ZEN ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_ZEN2) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver2 -mtune=znver2 + set(KOKKOS_ARCH_AMD_ZEN ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_ZEN2) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver2 + -mtune=znver2 ) - SET(KOKKOS_ARCH_AMD_ZEN2 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_ZEN3) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver3 -mtune=znver3 + set(KOKKOS_ARCH_AMD_ZEN2 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_ZEN3) + compiler_specific_flags( + 
COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver3 + -mtune=znver3 ) - SET(KOKKOS_ARCH_AMD_ZEN3 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) - SET(KOKKOS_ARCH_AVX ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -mavx - MSVC /arch:AVX - NVHPC -tp=sandybridge - DEFAULT -mavx + set(KOKKOS_ARCH_AMD_ZEN3 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) + set(KOKKOS_ARCH_AVX ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -mavx + MSVC + /arch:AVX + NVHPC + -tp=sandybridge + DEFAULT + -mavx ) -ENDIF() - -IF (KOKKOS_ARCH_HSW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 +endif() + +if(KOKKOS_ARCH_HSW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 ) -ENDIF() - -IF (KOKKOS_ARCH_RISCV_SG2042) - IF(NOT - (KOKKOS_CXX_COMPILER_ID STREQUAL GNU - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) - OR - (KOKKOS_CXX_COMPILER_ID STREQUAL Clang - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) +endif() + +if(KOKKOS_ARCH_RISCV_SG2042) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) - MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -march=rv64imafdcv - ) -ENDIF() - - -IF 
(KOKKOS_ARCH_BDW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -march=rv64imafdcv) +endif() + +if(KOKKOS_ARCH_RISCV_RVA22V) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) -ENDIF() - -IF (KOKKOS_ARCH_KNL) - #avx512-mic - SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xMIC-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=knl - DEFAULT -march=knl -mtune=knl + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT + -march=rv64imafdcv_sscofpmf_sstc_svpbmt_zicbom_zicboz_zicbop_zihintpause + ) +endif() + +if(KOKKOS_ARCH_BDW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 + -mrtm ) -ENDIF() +endif() -IF (KOKKOS_ARCH_KNC) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - DEFAULT -mmic +if(KOKKOS_ARCH_KNL) + #avx512-mic + set(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xMIC-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=knl + DEFAULT + -march=knl + -mtune=knl ) -ENDIF() - -IF (KOKKOS_ARCH_SKL) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID 
KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSKYLAKE - MSVC /arch:AVX2 - NVHPC -tp=skylake - DEFAULT -march=skylake -mtune=skylake +endif() + +if(KOKKOS_ARCH_KNC) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC NO-VALUE-SPECIFIED DEFAULT -mmic) +endif() + +if(KOKKOS_ARCH_SKL) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xSKYLAKE + MSVC + /arch:AVX2 + NVHPC + -tp=skylake + DEFAULT + -march=skylake + -mtune=skylake ) -ENDIF() - -IF (KOKKOS_ARCH_SKX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=skylake - DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 +endif() + +if(KOKKOS_ARCH_SKX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=skylake + DEFAULT + -march=skylake-avx512 + -mtune=skylake-avx512 ) -ENDIF() - -IF (KOKKOS_ARCH_ICL) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-client -mtune=icelake-client +endif() + +if(KOKKOS_ARCH_ICL) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-client + -mtune=icelake-client ) -ENDIF() - -IF (KOKKOS_ARCH_ICX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-server -mtune=icelake-server +endif() + +if(KOKKOS_ARCH_ICX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-server + -mtune=icelake-server ) -ENDIF() - -IF (KOKKOS_ARCH_SPR) - 
SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=sapphirerapids -mtune=sapphirerapids +endif() + +if(KOKKOS_ARCH_SPR) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=sapphirerapids + -mtune=sapphirerapids ) -ENDIF() - -IF (KOKKOS_ARCH_POWER7) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=power7 -mtune=power7 +endif() + +if(KOKKOS_ARCH_POWER7) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=power7 + -mtune=power7 ) -ENDIF() - -IF (KOKKOS_ARCH_POWER8) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr8 - DEFAULT -mcpu=power8 -mtune=power8 +endif() + +if(KOKKOS_ARCH_POWER8) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr8 + DEFAULT + -mcpu=power8 + -mtune=power8 ) -ENDIF() - -IF (KOKKOS_ARCH_POWER9) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr9 - DEFAULT -mcpu=power9 -mtune=power9 +endif() + +if(KOKKOS_ARCH_POWER9) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr9 + DEFAULT + -mcpu=power9 + -mtune=power9 ) -ENDIF() +endif() # If Kokkos_ARCH_NATIVE is enabled, we are trying to autodetect # the SIMD capabilities based on compiler macros. 
-IF (KOKKOS_ARCH_NATIVE) +if(KOKKOS_ARCH_NATIVE) # Make sure to rerun the checks if compile options have changed - IF(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") - SET(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") - - SET(CMAKE_REQUIRED_QUIET ON) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - - UNSET(KOKKOS_COMPILER_HAS_AVX512 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) - UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) - UNSET(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) - CHECK_CXX_SYMBOL_EXISTS(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) - UNSET(KOKKOS_COMPILER_HAS_AVX CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - - UNSET(CMAKE_REQUIRED_QUIET) - UNSET(CMAKE_REQUIRED_FLAGS) - ENDIF() + if(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") + set(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") + + set(CMAKE_REQUIRED_QUIET ON) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + + unset(KOKKOS_COMPILER_HAS_AVX512 CACHE) + check_cxx_symbol_exists(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) + unset(KOKKOS_COMPILER_HAS_AVX2 CACHE) + check_cxx_symbol_exists(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) + unset(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) + check_cxx_symbol_exists(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) + unset(KOKKOS_COMPILER_HAS_AVX CACHE) + check_cxx_symbol_exists(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + + unset(CMAKE_REQUIRED_QUIET) + unset(CMAKE_REQUIRED_FLAGS) + endif() # Only define one of these macros for now # to be uniform with what we are doing for other architectures. 
- IF(KOKKOS_COMPILER_HAS_AVX512) - MESSAGE(STATUS "SIMD: AVX512 detected") - SET(KOKKOS_ARCH_AVX512XEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX2) - MESSAGE(STATUS "SIMD: AVX2 detected") - SET(KOKKOS_ARCH_AVX2 ON) - ELSEIF(KOKKOS_COMPILER_HAS_ARM_NEON) - MESSAGE(STATUS "SIMD: ARM_NEON detected") - SET(KOKKOS_ARCH_ARM_NEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX) - MESSAGE(STATUS "SIMD: AVX detected") - SET(KOKKOS_ARCH_AVX ON) - ENDIF() -ENDIF() + if(KOKKOS_COMPILER_HAS_AVX512) + message(STATUS "SIMD: AVX512 detected") + set(KOKKOS_ARCH_AVX512XEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX2) + message(STATUS "SIMD: AVX2 detected") + set(KOKKOS_ARCH_AVX2 ON) + elseif(KOKKOS_COMPILER_HAS_ARM_NEON) + message(STATUS "SIMD: ARM_NEON detected") + set(KOKKOS_ARCH_ARM_NEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX) + message(STATUS "SIMD: AVX detected") + set(KOKKOS_ARCH_AVX ON) + endif() +endif() # FIXME_NVHPC nvc++ doesn't seem to support AVX512. -IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON OFF) -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) + set(KOKKOS_ARCH_AVX512XEON OFF) +endif() # FIXME_NVCC nvcc doesn't seem to support Arm Neon. 
-IF(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - UNSET(KOKKOS_ARCH_ARM_NEON) -ENDIF() - -IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - Clang -fcuda-rdc - NVIDIA --relocatable-device-code=true - ) - ENDIF() -ENDIF() +if(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + unset(KOKKOS_ARCH_ARM_NEON) +endif() + +if(NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(Clang -fcuda-rdc NVIDIA --relocatable-device-code=true) + endif() +endif() # Clang needs mcx16 option enabled for Windows atomic functions -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) - COMPILER_SPECIFIC_OPTIONS( - Clang -mcx16 - ) -ENDIF() +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) + compiler_specific_options(Clang -mcx16) +endif() # MSVC ABI has many deprecation warnings, so ignore them -IF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - COMPILER_SPECIFIC_DEFS( - Clang _CRT_SECURE_NO_WARNINGS - ) -ENDIF() - +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + compiler_specific_defs(Clang _CRT_SECURE_NO_WARNINGS) +endif() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (KOKKOS_ENABLE_HIP) - IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fgpu-rdc - ) - IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT --hip-link - ) - ENDIF() - ELSE() - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fno-gpu-rdc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization - ) - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-unnamed-lambda - ) -ENDIF() +if(KOKKOS_ENABLE_HIP) + 
if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(DEFAULT -fgpu-rdc) + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + compiler_specific_link_options(DEFAULT --hip-link) + endif() + else() + compiler_specific_flags(DEFAULT -fno-gpu-rdc) + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + compiler_specific_flags(DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization) + compiler_specific_options(DEFAULT -fsycl-unnamed-lambda) + if(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2024.1.0) + # Before oneAPI 2024.1.0 passing -fno-sycl didn't work properly + if(NOT KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + message(FATAL_ERROR "Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=OFF requires oneAPI 2024.1.0 or later") + endif() + elseif(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT -fsycl-rdc) + else() + compiler_specific_options(DEFAULT -fno-sycl-rdc) + endif() +endif() # Check support for device_global variables # FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device @@ -613,17 +734,18 @@ ENDIF() # implementation. Otherwise, the feature is not supported when building shared # libraries. Thus, we don't even check for support if shared libraries are # requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. 
-IF(KOKKOS_ENABLE_SYCL) - STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) +if(KOKKOS_ENABLE_SYCL) + string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + if(KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) # Use the non-separable compilation implementation to support shared libraries as well. - COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) - ELSEIF(NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) - CHECK_CXX_SOURCE_COMPILES(" + compiler_specific_flags(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + elseif(NOT BUILD_SHARED_LIBS AND KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " #include using namespace sycl::ext::oneapi::experimental; using namespace sycl; @@ -638,548 +760,617 @@ IF(KOKKOS_ENABLE_SYCL) int main(){ return 0; } " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. 
- COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + compiler_specific_flags(DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + endif() + endif() + + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_GRAPH "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_GRAPH) +endif() + +set(CUDA_ARCH_ALREADY_SPECIFIED "") +function(CHECK_CUDA_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(CUDA_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." ) - ENDIF() - ENDIF() -ENDIF() - -SET(CUDA_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) - MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_CUDA) - STRING(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) - IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) - ELSE() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() - ENDIF() - LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) - SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) - LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) - SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) -ENDFUNCTION() - + endif() + set(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_CUDA + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_SYCL + AND NOT KOKKOS_ENABLE_OPENACC + ) + message( + WARNING + "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." 
+ ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_CUDA) + string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) + endif() + set(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) + if(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + set(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) + else() + global_append(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() + endif() + list(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) + set(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) + list(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) + set(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) +endfunction() #These will define KOKKOS_CUDA_ARCH_FLAG #to the corresponding flag name if ON -CHECK_CUDA_ARCH(KEPLER30 sm_30) -CHECK_CUDA_ARCH(KEPLER32 sm_32) -CHECK_CUDA_ARCH(KEPLER35 sm_35) -CHECK_CUDA_ARCH(KEPLER37 sm_37) -CHECK_CUDA_ARCH(MAXWELL50 sm_50) -CHECK_CUDA_ARCH(MAXWELL52 sm_52) -CHECK_CUDA_ARCH(MAXWELL53 sm_53) -CHECK_CUDA_ARCH(PASCAL60 sm_60) -CHECK_CUDA_ARCH(PASCAL61 sm_61) -CHECK_CUDA_ARCH(VOLTA70 sm_70) -CHECK_CUDA_ARCH(VOLTA72 sm_72) -CHECK_CUDA_ARCH(TURING75 sm_75) -CHECK_CUDA_ARCH(AMPERE80 sm_80) -CHECK_CUDA_ARCH(AMPERE86 sm_86) -CHECK_CUDA_ARCH(ADA89 sm_89) -CHECK_CUDA_ARCH(HOPPER90 sm_90) - -SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. 
If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_OPENACC AND NOT KOKKOS_ENABLE_SYCL) - MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_HIP) - SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) - ENDIF() - IF(NOT KOKKOS_IMPL_AMDGPU_FLAGS) - SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() +check_cuda_arch(KEPLER30 sm_30) +check_cuda_arch(KEPLER32 sm_32) +check_cuda_arch(KEPLER35 sm_35) +check_cuda_arch(KEPLER37 sm_37) +check_cuda_arch(MAXWELL50 sm_50) +check_cuda_arch(MAXWELL52 sm_52) +check_cuda_arch(MAXWELL53 sm_53) +check_cuda_arch(PASCAL60 sm_60) +check_cuda_arch(PASCAL61 sm_61) +check_cuda_arch(VOLTA70 sm_70) +check_cuda_arch(VOLTA72 sm_72) +check_cuda_arch(TURING75 sm_75) +check_cuda_arch(AMPERE80 sm_80) +check_cuda_arch(AMPERE86 sm_86) +check_cuda_arch(ADA89 sm_89) +check_cuda_arch(HOPPER90 sm_90) + +set(AMDGPU_ARCH_ALREADY_SPECIFIED "") +function(CHECK_AMDGPU_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(AMDGPU_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." 
+ ) + endif() + set(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_HIP + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_OPENACC + AND NOT KOKKOS_ENABLE_SYCL + ) + message( + WARNING + "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_HIP) + set(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) + endif() + if(NOT KOKKOS_IMPL_AMDGPU_FLAGS) + set(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) + global_append(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + global_append(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() +endfunction() #These will define KOKKOS_AMDGPU_ARCH_FLAG #to the corresponding flag name if ON -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) -ENDFOREACH() - -IF(KOKKOS_IMPL_AMDGPU_FLAGS) - IF (NOT AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. 
" - "Please explicitly set the GPU architecture.") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") -ENDIF() - -MACRO(SET_AND_CHECK_AMD_ARCH ARCH FLAG) - KOKKOS_SET_OPTION(ARCH_${ARCH} ON) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) -ENDMACRO() - -MACRO(CHECK_MULTIPLE_INTEL_ARCH) - IF(KOKKOS_ARCH_INTEL_GPU) - MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") - ENDIF() - SET(KOKKOS_ARCH_INTEL_GPU ON) -ENDMACRO() - -IF(KOKKOS_ARCH_INTEL_GEN) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_DG1) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN9) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN11) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN12LP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_XEHP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_PVC) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() - -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - IF (CLANG_CUDA_ARCH) - IF(KOKKOS_CLANG_IS_CRAY) - COMPILER_SPECIFIC_FLAGS( - Cray -fopenmp - ) - ELSE() - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 - NVHPC -gpu=${NVHPC_CUDA_ARCH} +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + check_amdgpu_arch(${ARCH} ${FLAG}) +endforeach() + +if(KOKKOS_IMPL_AMDGPU_FLAGS) + if(NOT AMDGPU_ARCH_ALREADY_SPECIFIED) + message(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " + "Please explicitly set the GPU architecture." 
+ ) + endif() + global_append(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") + global_append(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") +endif() + +macro(SET_AND_CHECK_AMD_ARCH ARCH FLAG) + kokkos_set_option(ARCH_${ARCH} ON) + check_amdgpu_arch(${ARCH} ${FLAG}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) +endmacro() + +macro(CHECK_MULTIPLE_INTEL_ARCH) + if(KOKKOS_ARCH_INTEL_GPU) + message(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") + endif() + set(KOKKOS_ARCH_INTEL_GPU ON) +endmacro() + +if(KOKKOS_ARCH_INTEL_GEN) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_DG1) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN9) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN11) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN12LP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_XEHP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_PVC) + check_multiple_intel_arch() +endif() + +if(KOKKOS_ENABLE_OPENMP) + compiler_specific_link_options(CrayClang -fopenmp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + if(CLANG_CUDA_ARCH) + if(KOKKOS_CLANG_IS_CRAY) + compiler_specific_flags(Cray -fopenmp) + else() + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) + compiler_specific_flags( + Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 NVHPC -gpu=${NVHPC_CUDA_ARCH} ) - ENDIF() - ENDIF() - SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) - IF (CLANG_AMDGPU_ARCH) - COMPILER_SPECIFIC_FLAGS( + endif() + endif() + set(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) + if(CLANG_AMDGPU_ARCH) + compiler_specific_flags( Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa ) - ENDIF() - IF (KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ - ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - 
IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__ - ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" + endif() + if(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__) + else() + compiler_specific_options(IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__) + if(KOKKOS_ARCH_INTEL_GEN9) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7") + endif() + endif() +endif() + +if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CUDA_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." 
) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" - ) - ENDIF() - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CUDA_ARCH_FLAG) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc -gpu=${NVHPC_CUDA_ARCH} - Clang -Xopenmp-target=nvptx64-nvidia-cuda -march=${CLANG_CUDA_ARCH} - -fopenmp-targets=nvptx64-nvidia-cuda + endif() + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + compiler_specific_flags( + NVHPC + -acc + -gpu=${NVHPC_CUDA_ARCH} + Clang + -Xopenmp-target=nvptx64-nvidia-cuda + -march=${CLANG_CUDA_ARCH} + -fopenmp-targets=nvptx64-nvidia-cuda ) - ELSEIF(KOKKOS_AMDGPU_ARCH_FLAG) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} - -fopenmp-targets=amdgcn-amd-amdhsa - ) - ELSE() - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} + if(DEFINED ENV{CUDA_PATH}) + compiler_specific_link_options(Clang -L$ENV{CUDA_PATH}/lib64) + endif() + compiler_specific_libs(Clang -lcudart NVHPC -cuda) + elseif(KOKKOS_AMDGPU_ARCH_FLAG) + 
if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." ) - ELSE() - MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(AMDGPU_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} - ) - ELSE() - MESSAGE(SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64 + endif() + compiler_specific_flags( + Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} -fopenmp-targets=amdgcn-amd-amdhsa ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen + if(DEFINED ENV{ROCM_PATH}) + compiler_specific_flags(Clang -I$ENV{ROCM_PATH}/include) + compiler_specific_link_options(Clang -L$ENV{ROCM_PATH}/lib) + endif() + compiler_specific_libs(Clang -lamdhip64) + elseif(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + compiler_specific_flags(NVHPC -acc=multicore) + else() + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. + compiler_specific_flags(NVHPC -acc=gpu,multicore) + message( + STATUS + "No OpenACC target device is specificed; the OpenACC backend will be executed in an automatic fallback mode." 
) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11" + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + if(CUDA_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda + --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" + else() + message( + SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4" + endif() + elseif(AMDGPU_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" + else() + message( + SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" 
) - ENDIF() - ENDIF() -ENDIF() - -IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) + endif() + elseif(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(DEFAULT -fsycl-targets=spir64) + elseif(KOKKOS_ARCH_INTEL_GPU) + set(SYCL_TARGET_FLAG -fsycl-targets=spir64_gen) + + if(KOKKOS_ARCH_INTEL_GEN9) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.60.7") + endif() + + if(Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG}) + compiler_specific_link_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + else() + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + endif() + endif() +endif() + +if(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device - SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) - FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR}) - FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) - - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + set(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) + file(REMOVE_RECURSE ${_BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) + + try_run(_RESULT _COMPILE_RESULT 
${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) # if user is using kokkos_compiler_launcher, above will fail. - IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + if(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) - GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + get_property(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough - IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + if(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) # make sure the user knows that we aren't using CUDA compiler for anything else - MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture...") - INCLUDE(CheckLanguage) - CHECK_LANGUAGE(CUDA) - IF(CMAKE_CUDA_COMPILER) - ENABLE_LANGUAGE(CUDA) - ELSE() - MESSAGE(STATUS "CUDA language could not be enabled") - ENDIF() - ENDIF() + message( + STATUS + "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture..." 
+ ) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(STATUS "CUDA language could not be enabled") + endif() + endif() # if CUDA was enabled, this will be defined - IF(CMAKE_CUDA_COMPILER) + if(CMAKE_CUDA_COMPILER) # copy our test to .cu so cmake compiles as CUDA - CONFIGURE_FILE( + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COPYONLY + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COPYONLY ) # run test again - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) - ENDIF() - ENDIF() - - LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) - IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) - MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") - LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) - KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON) - CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) - ELSE() - MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " - "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" - "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. 
" - "If you are cross-compiling, you should try to do this on a compute node.") - ENDIF() -ENDIF() + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) + endif() + endif() + + list(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) + if(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) + message(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") + list(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) + kokkos_set_option(ARCH_${ARCHITECTURE} ON) + check_cuda_arch(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) + else() + message( + SEND_ERROR + "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " + "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" + "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. " + "If you are cross-compiling, you should try to do this on a compute node." 
+ ) + endif() +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_KEPLER30 OR KOKKOS_ARCH_KEPLER32 OR KOKKOS_ARCH_KEPLER35 OR KOKKOS_ARCH_KEPLER37) - SET(KOKKOS_ARCH_KEPLER ON) -ENDIF() +if(KOKKOS_ARCH_KEPLER30 + OR KOKKOS_ARCH_KEPLER32 + OR KOKKOS_ARCH_KEPLER35 + OR KOKKOS_ARCH_KEPLER37 +) + set(KOKKOS_ARCH_KEPLER ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) - SET(KOKKOS_ARCH_MAXWELL ON) -ENDIF() +if(KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) + set(KOKKOS_ARCH_MAXWELL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) - SET(KOKKOS_ARCH_PASCAL ON) -ENDIF() +if(KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) + set(KOKKOS_ARCH_PASCAL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) - SET(KOKKOS_ARCH_VOLTA ON) -ENDIF() +if(KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) + set(KOKKOS_ARCH_VOLTA ON) +endif() -IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) -ENDIF() +if(KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) + set(KOKKOS_ARCH_AMPERE ON) +endif() -IF (KOKKOS_ARCH_HOPPER90) - SET(KOKKOS_ARCH_HOPPER ON) -ENDIF() +if(KOKKOS_ARCH_HOPPER90) + set(KOKKOS_ARCH_HOPPER ON) +endif() + +function(CHECK_AMD_APU ARCH) + set(BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/AmdApuWorkdir) + file(REMOVE_RECURSE ${BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${BINARY_TEST_DIR}) + + try_run(RESULT COMPILE_RESULT ${BINARY_TEST_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/amd_apu.cc + RUN_OUTPUT_VARIABLE AMD_APU + ) + + if(NOT COMPILE_RESULT OR NOT RESULT EQUAL 0) + message(SEND_ERROR "Autodetection of AMD APU failed." 
+ "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + + if(AMD_APU) + set(${ARCH} AMD_GFX942_APU PARENT_SCOPE) + endif() +endfunction() #HIP detection of gpu arch -IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) - IF(NOT ROCM_ENUMERATOR) - MESSAGE(FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " - "rocm_agent_enumerator could not be found. " - "Please specify an arch manually via -DKokkos_ARCH_{..}=ON") - ELSE() - EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) - STRING(LENGTH "${GPU_ARCHS}" len_str) +if(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + find_program(ROCM_ENUMERATOR rocm_agent_enumerator) + if(NOT ROCM_ENUMERATOR) + message( + FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " "rocm_agent_enumerator could not be found. " + "Please specify an arch manually via -DKokkos_ARCH_{..}=ON" + ) + else() + execute_process(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) + string(LENGTH "${GPU_ARCHS}" len_str) # enumerator always output gfx000 as the first line - IF(${len_str} LESS 8) - MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. 
" - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - # check for known gpu archs, otherwise error out - ELSE() - SET(AMD_ARCH_DETECTED "") - FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - STRING(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) - IF("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") - SET_AND_CHECK_AMD_ARCH(${ARCH} ${FLAG}) - SET(AMD_ARCH_DETECTED ${ARCH}) - BREAK() - ENDIF() - ENDFOREACH() - IF("${AMD_ARCH_DETECTED}" STREQUAL "") - MESSAGE(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " - "is supported. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - ENDIF() - ENDIF() - ENDIF() -ENDIF() - -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - STRING(REGEX MATCH "90A" IS_90A ${ARCH}) - IF(IS_90A) - SET(KOKKOS_ARCH_AMD_GFX90A ON) - SET(KOKKOS_ARCH_VEGA90A ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "908" IS_908 ${ARCH}) - IF(IS_908) - SET(KOKKOS_ARCH_AMD_GFX908 ON) - SET(KOKKOS_ARCH_VEGA908 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "906" IS_906 ${ARCH}) - IF(IS_906) - SET(KOKKOS_ARCH_AMD_GFX906 ON) - SET(KOKKOS_ARCH_VEGA906 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1100" IS_1100 ${ARCH}) - IF(IS_1100) - SET(KOKKOS_ARCH_AMD_GFX1100 ON) - SET(KOKKOS_ARCH_NAVI1100 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1030" IS_1030 ${ARCH}) - IF(IS_1030) - SET(KOKKOS_ARCH_AMD_GFX1030 ON) - SET(KOKKOS_ARCH_NAVI1030 ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() + if(${len_str} LESS 8) + message(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." 
+ ) + # check for known gpu archs, otherwise error out + else() + set(AMD_ARCH_DETECTED "") + foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + string(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) + if("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") + # If we detected gfx942, we need to discriminate between APU and discrete GPU + if(FLAG STREQUAL "gfx942") + check_amd_apu(ARCH) + endif() + set_and_check_amd_arch(${ARCH} ${FLAG}) + set(AMD_ARCH_DETECTED ${ARCH}) + break() + endif() + endforeach() + if("${AMD_ARCH_DETECTED}" STREQUAL "") + message(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " "is supported. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + endif() + endif() +endif() + +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + string(REGEX MATCH "90A" IS_90A ${ARCH}) + if(IS_90A) + set(KOKKOS_ARCH_AMD_GFX90A ON) + set(KOKKOS_ARCH_VEGA90A ON) + break() + endif() + string(REGEX MATCH "908" IS_908 ${ARCH}) + if(IS_908) + set(KOKKOS_ARCH_AMD_GFX908 ON) + set(KOKKOS_ARCH_VEGA908 ON) + break() + endif() + string(REGEX MATCH "906" IS_906 ${ARCH}) + if(IS_906) + set(KOKKOS_ARCH_AMD_GFX906 ON) + set(KOKKOS_ARCH_VEGA906 ON) + break() + endif() + string(REGEX MATCH "1100" IS_1100 ${ARCH}) + if(IS_1100) + set(KOKKOS_ARCH_AMD_GFX1100 ON) + set(KOKKOS_ARCH_NAVI1100 ON) + break() + endif() + string(REGEX MATCH "1030" IS_1030 ${ARCH}) + if(IS_1030) + set(KOKKOS_ARCH_AMD_GFX1030 ON) + set(KOKKOS_ARCH_NAVI1030 ON) + break() + endif() + endif() +endforeach() #Regardless of version, make sure we define the general architecture name -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - SET(KOKKOS_ARCH_AMD_GPU ON) - STRING(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) - IF(IS_VEGA) - SET(KOKKOS_ARCH_VEGA ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "(NAVI)" IS_NAVI 
${ARCH}) - IF(IS_NAVI) - SET(KOKKOS_ARCH_NAVI ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + set(KOKKOS_ARCH_AMD_GPU "${FLAG}") + string(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) + if(IS_VEGA) + set(KOKKOS_ARCH_VEGA ON) + break() + endif() + string(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) + if(IS_NAVI) + set(KOKKOS_ARCH_NAVI ON) + break() + endif() + endif() +endforeach() #CMake verbose is kind of pointless #Let's just always print things -MESSAGE(STATUS "Built-in Execution Spaces:") - -FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_DEVICE_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_DEVICE_PARALLEL} is already enabled. 
" - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "Cuda") - IF(KOKKOS_ENABLE_CUDA_UVM) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead") - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") - ENDIF() - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - ENDIF() - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSEIF(${_BACKEND} STREQUAL "HIP") - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() -IF(NOT _DEVICE_PARALLEL) - SET(_DEVICE_PARALLEL "NoTypeDefined") - SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") -ENDIF() -MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") - -FOREACH (_BACKEND OpenMP Threads HPX) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_HOST_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_HOST_PARALLEL} is already enabled. 
" - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "HPX") - SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ELSE() - SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() - -IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "At least one host execution space must be enabled, " - "but no host parallel execution space was requested " - "and Kokkos_ENABLE_SERIAL=OFF.") -ENDIF() - -IF(_HOST_PARALLEL) -MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") -ELSE() - SET(_HOST_PARALLEL "NoTypeDefined") - MESSAGE(STATUS " Host Parallel: NoTypeDefined") -ENDIF() - -IF(KOKKOS_ENABLE_SERIAL) - MESSAGE(STATUS " Host Serial: SERIAL") -ELSE() - MESSAGE(STATUS " Host Serial: NONE") -ENDIF() - -MESSAGE(STATUS "") -MESSAGE(STATUS "Architectures:") -FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) - MESSAGE(STATUS " ${Arch}") -ENDFOREACH() - - -IF(KOKKOS_ENABLE_ATOMICS_BYPASS) - IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") - MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_AROMICS_BYPASS=ON) if neither a host parallel nor a device backend is enabled!") - ENDIF() - IF(NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "Implementation bug") # safeguard - ENDIF() - MESSAGE(STATUS "Atomics: **DISABLED**") -ENDIF() +message(STATUS "Built-in Execution Spaces:") + +foreach(_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_DEVICE_PARALLEL) + message( + FATAL_ERROR + "Multiple device parallel execution spaces are not allowed! " + "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_DEVICE_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." 
+ ) + endif() + if(${_BACKEND} STREQUAL "Cuda") + if(KOKKOS_ENABLE_CUDA_UVM) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead" + ) + if(NOT KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") + endif() + endif() + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + elseif(${_BACKEND} STREQUAL "HIP" OR ${_BACKEND} STREQUAL "SYCL") + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + else() + set(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + endif() + endif() +endforeach() +if(NOT _DEVICE_PARALLEL) + set(_DEVICE_PARALLEL "NoTypeDefined") +endif() +message(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") + +foreach(_BACKEND OpenMP Threads HPX) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_HOST_PARALLEL) + message( + FATAL_ERROR + "Multiple host parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_HOST_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." + ) + endif() + if(${_BACKEND} STREQUAL "HPX") + set(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + else() + set(_HOST_PARALLEL "Kokkos::${_BACKEND}") + endif() + endif() +endforeach() + +if(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "At least one host execution space must be enabled, " + "but no host parallel execution space was requested " "and Kokkos_ENABLE_SERIAL=OFF." 
+ ) +endif() + +if(_HOST_PARALLEL) + message(STATUS " Host Parallel: ${_HOST_PARALLEL}") +else() + set(_HOST_PARALLEL "NoTypeDefined") + message(STATUS " Host Parallel: NoTypeDefined") +endif() + +if(KOKKOS_ENABLE_SERIAL) + message(STATUS " Host Serial: SERIAL") +else() + message(STATUS " Host Serial: NONE") +endif() + +message(STATUS "") +message(STATUS "Architectures:") +foreach(Arch ${KOKKOS_ENABLED_ARCH_LIST}) + message(STATUS " ${Arch}") +endforeach() + +if(KOKKOS_ENABLE_ATOMICS_BYPASS) + if(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + message( + FATAL_ERROR + "Disabling atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) is not allowed if a host parallel or a device backend is enabled!" + ) + endif() + if(NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "Implementation bug") # safeguard + endif() + message(STATUS "Atomics: **DISABLED**") +endif() diff --git a/lib/kokkos/cmake/kokkos_check_env.cmake b/lib/kokkos/cmake/kokkos_check_env.cmake index a455a403b9d..f1a309ff857 100644 --- a/lib/kokkos/cmake/kokkos_check_env.cmake +++ b/lib/kokkos/cmake/kokkos_check_env.cmake @@ -1,12 +1,15 @@ -SET(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) -IF (CRAYPE_VERSION) - SET(KOKKOS_IS_CRAYPE TRUE) - SET(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) - IF (CRAYPE_LINK_TYPE) - IF (NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") - MESSAGE(WARNING "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() - ELSE() - MESSAGE(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() -ENDIF() +set(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) +if(CRAYPE_VERSION) + set(KOKKOS_IS_CRAYPE TRUE) + set(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) + if(CRAYPE_LINK_TYPE) + if(NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") + message( + WARNING + "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. 
Linking is likely to fail unless this is set to 'dynamic'" + ) + endif() + else() + message(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") + endif() +endif() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index e8bfadb64eb..010ed33ede8 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -1,262 +1,273 @@ -KOKKOS_CFG_DEPENDS(COMPILER_ID NONE) +kokkos_cfg_depends(COMPILER_ID NONE) -SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) -SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) -SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) +set(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) +set(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +set(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) -MACRO(kokkos_internal_have_compiler_nvcc) +macro(kokkos_internal_have_compiler_nvcc) # Check if the compiler is nvcc (which really means nvcc_wrapper). 
- EXECUTE_PROCESS(COMMAND ${ARGN} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") - IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) - SET(INTERNAL_HAVE_COMPILER_NVCC true) - ELSE() - SET(INTERNAL_HAVE_COMPILER_NVCC false) - ENDIF() -ENDMACRO() + execute_process(COMMAND ${ARGN} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + string(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + if(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + set(INTERNAL_HAVE_COMPILER_NVCC true) + else() + set(INTERNAL_HAVE_COMPILER_NVCC false) + endif() +endmacro() -IF(Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # kokkos_enable_options is not yet called so use lower case here - IF(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) kokkos_internal_have_compiler_nvcc(${CMAKE_CUDA_COMPILER}) - ELSE() + else() # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS 
${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) # Check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # If launcher was found and nvcc_wrapper was not specified as # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher. # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper - IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(CMAKE_CXX_COMPILER_LAUNCHER) - MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!") - ENDIF() + if(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_LAUNCHER) + message( + FATAL_ERROR + "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!" + ) + endif() # the first argument to launcher is always the C++ compiler defined by cmake # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) - SET(INTERNAL_USE_COMPILER_LAUNCHER true) - ENDIF() - ENDIF() -ENDIF() + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} + -DKOKKOS_DEPENDENCE + ) + set(INTERNAL_USE_COMPILER_LAUNCHER true) + endif() + endif() +endif() -IF(INTERNAL_HAVE_COMPILER_NVCC) +if(INTERNAL_HAVE_COMPILER_NVCC) # Save the host compiler id before overwriting it. - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) # SET the compiler id to nvcc. We use the value used by CMake 3.8. 
- SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) + set(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) - STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") - IF(INTERNAL_USE_COMPILER_LAUNCHER) - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") + string(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + string(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + if(INTERNAL_USE_COMPILER_LAUNCHER) + message(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") kokkos_compilation(GLOBAL) - ENDIF() -ENDIF() + endif() +endif() -IF(Kokkos_ENABLE_HIP) +if(Kokkos_ENABLE_HIP) # get HIP version - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) - IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) - SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) - ENDIF() + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" 
INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + if(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + set(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + endif() - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c Cray - OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_CRAY TRUE) - ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c Cray + OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_CRAY TRUE) + set(KOKKOS_CXX_COMPILER_ID CrayClang) + endif() # The clang based Intel compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c "DPC++\\|icpx" - OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_INTEL TRUE) - SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - 
OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - ENDIF() -ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c "DPC++\\|icpx" + OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_INTEL TRUE) + set(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) # SET Cray's compiler version. - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - IF (KOKKOS_CLANG_IS_CRAY) - SET(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) - ELSE() - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - ENDIF() -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + if(KOKKOS_CLANG_IS_CRAY) + set(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) + else() + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) # SET Fujitsus compiler 
version which is not detected by CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +endif() # Enforce the minimum compilers supported by Kokkos. -IF(NOT CMAKE_CXX_STANDARD) - SET(CMAKE_CXX_STANDARD 17) -ENDIF() -IF(CMAKE_CXX_STANDARD EQUAL 17) - SET(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 8.2.0) - SET(KOKKOS_INTEL_MINIMUM 19.0.5) - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 11.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.29) -ELSE() - SET(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 10.1.0) - SET(KOKKOS_INTEL_MINIMUM "not supported") - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 12.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.30) -ENDIF() +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +if(CMAKE_CXX_STANDARD EQUAL 17) + set(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + 
set(KOKKOS_GCC_MINIMUM 8.2.0) + set(KOKKOS_INTEL_MINIMUM 19.0.5) + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 11.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.29) +else() + set(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 10.1.0) + set(KOKKOS_INTEL_MINIMUM "not supported") + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 12.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.30) +endif() -SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: 
${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") +set(KOKKOS_MESSAGE_TEXT + "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:" +) +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID 
STREQUAL Intel) - IF((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() - SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + if((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) 
+ message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() + set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() # Treat PGI internally as NVHPC to simplify handling both compilers. # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is # backward-compatible to pgc++. 
- SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ENDIF() + set(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +endif() -IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) -ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) - SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ENDIF() +if(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) +elseif(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + set(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +endif() -STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) -LIST(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) -LIST(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) -LIST(LENGTH VERSION_LIST LIST_LENGTH) +string(REPLACE "." 
";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) +list(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) +list(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) +list(LENGTH VERSION_LIST LIST_LENGTH) # On Android, the compiler doesn't have a patch version, just a major/minor -IF(LIST_LENGTH GREATER 2) - LIST(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) -ELSE() - SET(KOKKOS_COMPILER_VERSION_PATCH 0) -ENDIF() - +if(LIST_LENGTH GREATER 2) + list(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) +else() + set(KOKKOS_COMPILER_VERSION_PATCH 0) +endif() diff --git a/lib/kokkos/cmake/kokkos_configure_trilinos.cmake b/lib/kokkos/cmake/kokkos_configure_trilinos.cmake new file mode 100644 index 00000000000..5aeef61e7b3 --- /dev/null +++ b/lib/kokkos/cmake/kokkos_configure_trilinos.cmake @@ -0,0 +1,38 @@ +if(CMAKE_PROJECT_NAME STREQUAL "Trilinos") + set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Whether to build Serial backend" FORCE) + + if(NOT ${Trilinos_ENABLE_OpenMP} STREQUAL "") + set(Kokkos_ENABLE_OPENMP ${Trilinos_ENABLE_OpenMP} CACHE BOOL "Whether to build OpenMP backend" FORCE) + else() + set(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "Whether to build OpenMP backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_CUDA} STREQUAL "") + set(Kokkos_ENABLE_CUDA ${TPL_ENABLE_CUDA} CACHE BOOL "Whether to build CUDA backend" FORCE) + else() + set(Kokkos_ENABLE_CUDA OFF CACHE BOOL "Whether to build CUDA backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_HPX} STREQUAL "") + set(Kokkos_ENABLE_HPX ${TPL_ENABLE_HPX} CACHE BOOL "Whether to build HPX backend" FORCE) + else() + set(Kokkos_ENABLE_HPX OFF CACHE BOOL "Whether to build HPX backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_quadmath} STREQUAL "") + set(Kokkos_ENABLE_LIBQUADMATH ${TPL_ENABLE_quadmath} CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + else() + set(Kokkos_ENABLE_LIBQUADMATH OFF CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + endif() + + if(NOT ${TPL_ENABLE_DLlib} STREQUAL "") + 
set(Kokkos_ENABLE_LIBDL ${TPL_ENABLE_DLlib} CACHE BOOL "Whether to enable the LIBDL library" FORCE) + else() + set(Kokkos_ENABLE_LIBDL OFF CACHE BOOL "Whether to enable the LIBDL library" FORCE) + endif() + + set(Kokkos_ENABLE_COMPLEX_ALIGN OFF CACHE BOOL "Whether to align Kokkos::complex to 2*alignof(RealType)") + + # FIXME_TRILINOS We run into problems when trying to use an external GTest in Trilinos CI + set(CMAKE_DISABLE_FIND_PACKAGE_GTest ON) +endif() diff --git a/lib/kokkos/cmake/kokkos_corner_cases.cmake b/lib/kokkos/cmake/kokkos_corner_cases.cmake index ede2b4e0caf..530e9e8fd8e 100644 --- a/lib/kokkos/cmake/kokkos_corner_cases.cmake +++ b/lib/kokkos/cmake/kokkos_corner_cases.cmake @@ -1,4 +1,8 @@ -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.2) - MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. See https://github.com/kokkos/kokkos/issues/3496") -ENDIF() - +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 11.2 +) + message( + WARNING + "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. 
See https://github.com/kokkos/kokkos/issues/3496" + ) +endif() diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index c7d189285c5..40c2d3ea8af 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -1,128 +1,132 @@ - -FUNCTION(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME}) - LIST(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) +function(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME}) + list(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) - IF (KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") - SET(KOKKOS_HAS_HOST ON PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + set(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) + if(KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") + set(KOKKOS_HAS_HOST ON PARENT_SCOPE) + endif() +endfunction() -KOKKOS_CFG_DEPENDS(DEVICES NONE) +kokkos_cfg_depends(DEVICES NONE) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(DEVICES ENABLE) - +kokkos_deprecated_list(DEVICES ENABLE) -KOKKOS_DEVICE_OPTION(THREADS OFF HOST "Whether to build C++ threads backend") +kokkos_device_option(THREADS OFF HOST "Whether to build C++ threads backend") # detect clang++ / cl / clang-cl clashes -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") 
# this specific test requires CMake >= 3.15 - IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + if("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") # use pure clang++ instead of clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC OFF) - ELSE() + set(KOKKOS_COMPILER_CLANG_MSVC OFF) + else() # it defaults to clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC ON) - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) - SET(OMP_DEFAULT ON) -ELSE() - SET(OMP_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") + set(KOKKOS_COMPILER_CLANG_MSVC ON) + endif() +endif() +if(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) + set(OMP_DEFAULT ON) +else() + set(OMP_DEFAULT OFF) +endif() +kokkos_device_option(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") # We want this to default to OFF for cache reasons, but if no # host space is given, then activate serial -IF (KOKKOS_HAS_TRILINOS) - #However, Trilinos always wants Serial ON - SET(SERIAL_DEFAULT ON) -ELSEIF (KOKKOS_HAS_HOST) - SET(SERIAL_DEFAULT OFF) -ELSE() - SET(SERIAL_DEFAULT ON) - IF (NOT DEFINED Kokkos_ENABLE_SERIAL) - MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") - ENDIF() -ENDIF() -KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") - -KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") +if(KOKKOS_HAS_HOST) + set(SERIAL_DEFAULT OFF) +else() + set(SERIAL_DEFAULT ON) + if(NOT DEFINED Kokkos_ENABLE_SERIAL) + message( + STATUS + "SERIAL backend is being turned on to ensure there is at least one Host space. 
To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt" + ) + endif() +endif() +kokkos_device_option(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") + +kokkos_device_option(HPX OFF HOST "Whether to build HPX backend (experimental)") # Device backends have to come after host backends for header include order reasons # Without this we can't make e.g. CudaSpace accessible by HostSpace -KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend") -IF (KOKKOS_ENABLE_OPENACC) - COMPILER_SPECIFIC_FLAGS( - Clang -fopenacc -fopenacc-fake-async-wait - -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version - -Wno-pass-failed - ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG - ) -ENDIF() - -KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(ClangOpenMPFlag -fopenmp=libomp) - IF(KOKKOS_CLANG_IS_CRAY) - SET(ClangOpenMPFlag -fopenmp) - ENDIF() - - COMPILER_SPECIFIC_FLAGS( - Clang ${ClangOpenMPFlag} -Wno-openmp-mapping - IntelLLVM -fiopenmp -Wno-openmp-mapping - NVHPC -mp=gpu - DEFAULT -fopenmp +kokkos_device_option(OPENACC OFF DEVICE "Whether to build the OpenACC backend") +if(KOKKOS_ENABLE_OPENACC) + compiler_specific_flags( + Clang + -fopenacc + -fopenacc-fake-async-wait + -fopenacc-implicit-worker=vector + -Wno-openacc-and-cxx + -Wno-openmp-mapping + -Wno-unknown-cuda-version + -Wno-pass-failed ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) +endif() + +kokkos_device_option(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") +if(KOKKOS_ENABLE_OPENMPTARGET) + set(ClangOpenMPFlag -fopenmp=libomp) + if(KOKKOS_CLANG_IS_CRAY) + set(ClangOpenMPFlag -fopenmp) + endif() + + compiler_specific_flags( + Clang + ${ClangOpenMPFlag} + -Wno-openmp-mapping + 
IntelLLVM + -fiopenmp + -Wno-openmp-mapping + NVHPC + -mp=gpu + DEFAULT + -fopenmp ) -# Are there compilers which identify as Clang and need this library? -# COMPILER_SPECIFIC_LIBS( -# Clang -lopenmptarget -# ) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_DEFAULT ON) -ELSE() - SET(CUDA_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") - -IF (KOKKOS_ENABLE_CUDA) - GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") -## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros - LIST(APPEND DEVICE_SETUP_LIST Cuda) -ENDIF() - -KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) + # Are there compilers which identify as Clang and need this library? + # COMPILER_SPECIFIC_LIBS( + # Clang -lopenmptarget + # ) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") + endif() +endif() + +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + set(CUDA_DEFAULT ON) +else() + set(CUDA_DEFAULT OFF) +endif() +kokkos_device_option(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") + +if(KOKKOS_ENABLE_CUDA) + global_set(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") + ## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + list(APPEND DEVICE_SETUP_LIST Cuda) +endif() + +kokkos_device_option(HIP OFF DEVICE "Whether to build HIP backend") ## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros -IF (KOKKOS_ENABLE_HIP) - LIST(APPEND DEVICE_SETUP_LIST HIP) -ENDIF() +if(KOKKOS_ENABLE_HIP) + list(APPEND DEVICE_SETUP_LIST HIP) +endif() -KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") +kokkos_device_option(SYCL OFF DEVICE "Whether to build SYCL backend") ## SYCL has extra setup 
requirements, turn on Kokkos_Setup_SYCL.hpp in macros -IF (KOKKOS_ENABLE_SYCL) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") - ENDIF() - LIST(APPEND DEVICE_SETUP_LIST SYCL) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "SYCL backend requires C++17 or newer!") + endif() + list(APPEND DEVICE_SETUP_LIST SYCL) +endif() diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 53764b0c684..a5d6fdfe4ed 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -1,198 +1,236 @@ ########################## NOTES ############################################### # List the options for configuring kokkos using CMake method of doing it. -# These options then get mapped onto KOKKOS_SETTINGS environment variable by -# kokkos_settings.cmake. It is separate to allow other packages to override -# these variables (e.g., TriBITS). 
########################## AVAILABLE OPTIONS ################################### # Use lists for documentation, verification, and programming convenience - -FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) +function(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) + list(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) -ENDFUNCTION() + set(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) +endfunction() # Certain defaults will depend on knowing the enabled devices -KOKKOS_CFG_DEPENDS(OPTIONS DEVICES) -KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) +kokkos_cfg_depends(OPTIONS DEVICES) +kokkos_cfg_depends(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) +kokkos_deprecated_list(OPTIONS ENABLE) -KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") -KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") -KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") +kokkos_enable_option(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") +kokkos_enable_option(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") 
+kokkos_enable_option(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") # In contrast to other CUDA-dependent, options CUDA_LAMBDA is ON by default. # That is problematic when CUDA is not enabled because this not only yields a # bogus warning, but also exports the Kokkos_ENABLE_CUDA_LAMBDA variable and -# sets it to ON. This if-clause is a crutch that delays the refactoring of the -# way we declare all options until after we get rid of TriBITS. -IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSEIF (KOKKOS_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSE() - SET(CUDA_LAMBDA_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**") - -# May be used to disable our use of CudaMallocAsync. It had caused issues in -# the past when UCX was used as MPI communication layer. We expect it is -# resolved but we keep the option around a bit longer to be safe. 
-KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") -KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") - -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) -KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) -KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") -KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") -KOKKOS_ENABLE_OPTION(BENCHMARKS OFF "Whether to build the benchmarks") -KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") -STRING(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) -IF(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - KOKKOS_ENABLE_OPTION(DEBUG ON "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") -ELSE() - KOKKOS_ENABLE_OPTION(DEBUG OFF "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") -ENDIF() -UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") -KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") -KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") -KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") -KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") -KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") -KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS 
OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") -KOKKOS_ENABLE_OPTION(IMPL_HIP_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for HIP") +# sets it to ON. +kokkos_enable_option( + CUDA_LAMBDA ${KOKKOS_ENABLE_CUDA} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**" +) + +# As of 09/2024, cudaMallocAsync causes issues with ICP and older version of UCX +# as MPI communication layer. +kokkos_enable_option(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +kokkos_enable_option(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") +kokkos_enable_option(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") + +kokkos_enable_option(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available") +kokkos_enable_option(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings") +kokkos_enable_option(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") + +# Disabling RDC only works properly since oneAPI 2024.1.0 +if(KOKKOS_ENABLE_SYCL AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 2024.1.0 +) + set(SYCL_RDC_DEFAULT ON) +else() + set(SYCL_RDC_DEFAULT OFF) +endif() +kokkos_enable_option( + SYCL_RELOCATABLE_DEVICE_CODE ${SYCL_RDC_DEFAULT} "Whether to enable relocatable device code (RDC) for SYCL" +) +kokkos_enable_option(TESTS OFF "Whether to build the unit tests") +kokkos_enable_option(BENCHMARKS OFF "Whether to build the benchmarks") +kokkos_enable_option(EXAMPLES OFF "Whether to build the examples") +string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) +if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + kokkos_enable_option(DEBUG ON "Whether to activate extra debug features - may increase compile times") + 
kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") +else() + kokkos_enable_option(DEBUG OFF "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") +endif() +unset(_UPPERCASE_CMAKE_BUILD_TYPE) +kokkos_enable_option(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") +kokkos_enable_option(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") +kokkos_enable_option(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") +kokkos_enable_option(TUNING OFF "Whether to create bindings for tuning tools") +kokkos_enable_option(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +kokkos_enable_option(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") +kokkos_enable_option( + HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF + "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time" +) +kokkos_enable_option(IMPL_HIP_MALLOC_ASYNC OFF "Whether to enable hipMallocAsync") +kokkos_enable_option(OPENACC_FORCE_HOST_AS_DEVICE OFF "Whether to force to use host as a target device for OpenACC") # This option will go away eventually, but allows fallback to old implementation when needed. 
-KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") -KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") -KOKKOS_ENABLE_OPTION(IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting") +kokkos_enable_option(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +kokkos_enable_option( + ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases" +) +kokkos_enable_option( + IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting" +) mark_as_advanced(Kokkos_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY) -KOKKOS_ENABLE_OPTION(IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction.") +kokkos_enable_option( + IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF + "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction." 
+) mark_as_advanced(Kokkos_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND) -KOKKOS_ENABLE_OPTION(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") -KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") -KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan") +kokkos_enable_option(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") +kokkos_enable_option(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") +kokkos_enable_option( + IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan" +) mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) -IF (Trilinos_ENABLE_Kokkos) - SET(COMPLEX_ALIGN_DEFAULT OFF) -ELSE() - SET(COMPLEX_ALIGN_DEFAULT ON) -ENDIF() -KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") - -IF (KOKKOS_ENABLE_TESTS) - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) -ELSE() - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") -IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) - MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. 
Option will be ignored.") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) - SET(CUDA_CONSTEXPR_DEFAULT ON) -ELSE() - SET(CUDA_CONSTEXPR_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") - -IF (KOKKOS_ENABLE_HPX) - SET(HPX_ASYNC_DISPATCH_DEFAULT ON) -ELSE() - SET(HPX_ASYNC_DISPATCH_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") - -Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") - -FUNCTION(check_device_specific_options) - CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) - IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) - FOREACH(OPTION ${SOME_OPTIONS}) - IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() - IF(KOKKOS_ENABLE_${OPTION}) - MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. 
Option will be ignored.") - UNSET(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() - -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +kokkos_enable_option(COMPLEX_ALIGN ON "Whether to align Kokkos::complex to 2*alignof(RealType)") + +if(KOKKOS_ENABLE_TESTS) + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +else() + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +endif() +kokkos_enable_option( + HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests" +) +if(NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + message( + WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored." 
+ ) +endif() + +if(KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) + set(CUDA_CONSTEXPR_DEFAULT ON) +else() + set(CUDA_CONSTEXPR_DEFAULT OFF) +endif() +kokkos_enable_option( + CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions" +) + +if(KOKKOS_ENABLE_HPX) + set(HPX_ASYNC_DISPATCH_DEFAULT ON) +else() + set(HPX_ASYNC_DISPATCH_DEFAULT OFF) +endif() +kokkos_enable_option(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") + +kokkos_enable_option(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") + +function(check_device_specific_options) + cmake_parse_arguments(SOME "" "DEVICE" "OPTIONS" ${ARGN}) + if(NOT KOKKOS_ENABLE_${SOME_DEVICE}) + foreach(OPTION ${SOME_OPTIONS}) + if(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + message(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") + endif() + if(KOKKOS_ENABLE_${OPTION}) + message( + WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored." 
+ ) + unset(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) + endif() + endforeach() + endif() +endfunction() + +check_device_specific_options( + DEVICE + CUDA + OPTIONS + CUDA_UVM + CUDA_RELOCATABLE_DEVICE_CODE + CUDA_LAMBDA + CUDA_CONSTEXPR + CUDA_LDG_INTRINSIC + IMPL_CUDA_MALLOC_ASYNC + IMPL_CUDA_UNIFIED_MEMORY +) +check_device_specific_options( + DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE HIP_MULTIPLE_KERNEL_INSTANTIATIONS IMPL_HIP_MALLOC_ASYNC +) +check_device_specific_options(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +check_device_specific_options(DEVICE OPENACC OPTIONS OPENACC_FORCE_HOST_AS_DEVICE) # Needed due to change from deprecated name to new header define name -IF (KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) - SET(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) -ENDIF() +if(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) + set(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) +endif() # Force consistency of KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE # and CMAKE_CUDA_SEPARABLE_COMPILATION when we are compiling # using the CMake CUDA language support. # Either one being on will turn the other one on. -IF (KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - IF (NOT CMAKE_CUDA_SEPARABLE_COMPILATION) - MESSAGE(STATUS "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support") - SET(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - ELSE() - IF (CMAKE_CUDA_SEPARABLE_COMPILATION) - SET(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) - ENDIF() - ENDIF() -ENDIF() +if(KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + if(NOT CMAKE_CUDA_SEPARABLE_COMPILATION) + message( + STATUS + "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. 
When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support" + ) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + endif() + else() + if(CMAKE_CUDA_SEPARABLE_COMPILATION) + set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) + endif() + endif() +endif() # This is known to occur with Clang 9. We would need to use nvcc as the linker # http://lists.llvm.org/pipermail/cfe-dev/2018-June/058296.html # TODO: Through great effort we can use a different linker by hacking # CMAKE_CXX_LINK_EXECUTABLE in a future release -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - MESSAGE(FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) - MESSAGE(FATAL_ERROR "Relocatable device code requires static libraries.") -ENDIF() - -IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") - ENDIF() -ENDIF() -IF(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. 
Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON") +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + message( + FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC" + ) +endif() + +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) + message(FATAL_ERROR "Relocatable device code requires static libraries.") +endif() + +if(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") + endif() +endif() +if(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON" + ) set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE) set(KOKKOS_ENABLE_CUDA_LAMBDA ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") - ENDIF() -ENDIF() - - -IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) - MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") -ENDIF() + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") + endif() +endif() + +if(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) + message(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. 
Desul atomics cannot be disabled.") +endif() diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index d1f1e0d7a78..38eedd8362c 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -5,12 +5,8 @@ # Validate options are given with correct case and define an internal # upper-case version for use within -set(Kokkos_OPTIONS_NOT_TO_EXPORT - Kokkos_ENABLE_BENCHMARKS - Kokkos_ENABLE_EXAMPLES - Kokkos_ENABLE_TESTS - Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS - Kokkos_ENABLE_COMPILER_WARNINGS +set(Kokkos_OPTIONS_NOT_TO_EXPORT Kokkos_ENABLE_BENCHMARKS Kokkos_ENABLE_EXAMPLES Kokkos_ENABLE_TESTS + Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS Kokkos_ENABLE_COMPILER_WARNINGS ) # @@ -22,139 +18,122 @@ set(Kokkos_OPTIONS_NOT_TO_EXPORT # It attempts to print a helpful message about updating the options for the new CMake. # Kokkos_${SUFFIX} is the name of the option (like Kokkos_ARCH) being checked. # Kokkos_${PREFIX}_X is the name of new option to be defined from a list X,Y,Z,... -FUNCTION(kokkos_deprecated_list SUFFIX PREFIX) - SET(CAMEL_NAME Kokkos_${SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_deprecated_list SUFFIX PREFIX) + set(CAMEL_NAME Kokkos_${SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) #I don't love doing it this way but better to be safe - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - STRING(REPLACE "," ";" optlist "${${opt}}") - SET(ERROR_MSG "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:") - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - STRING(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") - ENDFOREACH() - STRING(APPEND ERROR_MSG "\nRemove CMakeCache.txt and re-run. 
For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it).") - IF (KOKKOS_HAS_TRILINOS) - MESSAGE(WARNING ${ERROR_MSG}) - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - SET(${CAMEL_NAME}_${ENTRY_UC} ON CACHE BOOL "Deprecated Trilinos translation") - ENDFOREACH() - UNSET(${opt} CACHE) - ELSE() - MESSAGE(SEND_ERROR ${ERROR_MSG}) - ENDIF() - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE}) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + string(REPLACE "," ";" optlist "${${opt}}") + set(ERROR_MSG + "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:" + ) + foreach(entry ${optlist}) + string(TOUPPER ${entry} ENTRY_UC) + string(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") + endforeach() + string( + APPEND + ERROR_MSG + "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it)." 
+ ) + message(SEND_ERROR ${ERROR_MSG}) + endif() + endforeach() +endfunction() + +function(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES ${TYPE}) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) # Make sure this appears in the cache with the appropriate DOCSTRING - SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) - - IF (KOKKOS_HAS_TRILINOS) - IF (NOT CAMEL_NAME IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - TRIBITS_PKG_EXPORT_CACHE_VAR(${CAMEL_NAME}) - ENDIF() - ENDIF() + set(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. 
Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." + ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -INCLUDE (CMakeDependentOption) -FUNCTION(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES BOOL) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - - CMAKE_DEPENDENT_OPTION(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() + +include(CMakeDependentOption) +function(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES BOOL) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + + cmake_dependent_option(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL 
"${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." 
+ ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE) - LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) - IF(OPTION_INDEX EQUAL -1) - MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") - ENDIF() - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) - LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) - SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) - MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}") - SET(${UC_NAME} ${VALUE} PARENT_SCOPE) -ENDFUNCTION() - -FUNCTION(kokkos_append_config_line LINE) - GLOBAL_APPEND(KOKKOS_TPL_EXPORTS "${LINE}") -ENDFUNCTION() - -MACRO(kokkos_export_cmake_tpl NAME) + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() + +function(kokkos_set_option CAMEL_SUFFIX VALUE) + list(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) + if(OPTION_INDEX EQUAL -1) + message(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") + endif() + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) + list(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) + set(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) + message(STATUS "Setting ${CAMEL_NAME}=${VALUE}") + set(${UC_NAME} ${VALUE} PARENT_SCOPE) +endfunction() + +function(kokkos_append_config_line LINE) + global_append(KOKKOS_TPL_EXPORTS "${LINE}") +endfunction() + +macro(kokkos_export_cmake_tpl NAME) cmake_parse_arguments(KOKKOS_EXTRA_ARG "REQUIRED" "" "COMPONENTS" ${ARGN}) #CMake TPLs are located with a call to find_package @@ -163,91 
+142,88 @@ MACRO(kokkos_export_cmake_tpl NAME) #If Kokkos was configured to find the TPL through a _DIR variable #make sure thar DIR variable is available to downstream packages - IF (DEFINED ${NAME}_DIR) + if(DEFINED ${NAME}_DIR) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_DIR)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_DIR ${${NAME}_DIR})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_DIR)") + kokkos_append_config_line(" SET(${NAME}_DIR ${${NAME}_DIR})") + kokkos_append_config_line("ENDIF()") + endif() - IF (DEFINED ${NAME}_ROOT) + if(DEFINED ${NAME}_ROOT) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_ROOT)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_ROOT ${${NAME}_ROOT})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - SET(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") - - IF(KOKKOS_EXTRA_ARG_REQUIRED) - STRING(APPEND KOKKOS_CONFIG_STRING " REQUIRED") - ENDIF() - IF(KOKKOS_EXTRA_ARG_COMPONENTS) - STRING(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") - ENDIF() - STRING(APPEND KOKKOS_CONFIG_STRING ")") - KOKKOS_APPEND_CONFIG_LINE(${KOKKOS_CONFIG_STRING}) -ENDMACRO() - -MACRO(kokkos_export_imported_tpl NAME) - IF (NOT KOKKOS_HAS_TRILINOS) - GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED) - IF (NOT LIB_IMPORTED) - # This is not an imported target - # This an interface library that we created - INSTALL( - TARGETS ${NAME} - EXPORT KokkosTargets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) - ELSE() - #make sure 
this also gets "exported" in the config file - KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})") - - GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE) - IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - ELSE() - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) - IF(TPL_LIBRARY) - KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") - ENDIF() - ENDIF() - - GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) - IF(TPL_INCLUDES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") - ENDIF() - - GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) - IF(TPL_COMPILE_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") - ENDIF() - - SET(TPL_LINK_OPTIONS) - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - IF(TPL_LINK_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") - ENDIF() - - GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) - IF(TPL_LINK_LIBRARIES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") - ENDIF() - KOKKOS_APPEND_CONFIG_LINE(")") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - ENDIF() -ENDMACRO() - + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_ROOT)") + kokkos_append_config_line(" SET(${NAME}_ROOT ${${NAME}_ROOT})") + kokkos_append_config_line("ENDIF()") + endif() + set(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") + + if(KOKKOS_EXTRA_ARG_REQUIRED) + string(APPEND KOKKOS_CONFIG_STRING " REQUIRED") + endif() + if(KOKKOS_EXTRA_ARG_COMPONENTS) + string(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") + endif() + 
string(APPEND KOKKOS_CONFIG_STRING ")") + kokkos_append_config_line(${KOKKOS_CONFIG_STRING}) +endmacro() + +macro(kokkos_export_imported_tpl NAME) + get_target_property(LIB_IMPORTED ${NAME} IMPORTED) + if(NOT LIB_IMPORTED) + # This is not an imported target + # This an interface library that we created + install( + TARGETS ${NAME} + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + else() + #make sure this also gets "exported" in the config file + kokkos_append_config_line("IF(NOT TARGET ${NAME})") + + get_target_property(LIB_TYPE ${NAME} TYPE) + if(${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") + kokkos_append_config_line("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + else() + kokkos_append_config_line("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + get_target_property(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) + if(TPL_LIBRARY) + kokkos_append_config_line("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") + endif() + endif() + + get_target_property(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) + if(TPL_INCLUDES) + kokkos_append_config_line("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") + endif() + + get_target_property(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) + if(TPL_COMPILE_OPTIONS) + kokkos_append_config_line("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") + endif() + + set(TPL_LINK_OPTIONS) + get_target_property(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) + if(TPL_LINK_OPTIONS) + kokkos_append_config_line("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") + endif() + + get_target_property(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) + if(TPL_LINK_LIBRARIES) + kokkos_append_config_line("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") + endif() + kokkos_append_config_line(")") + 
kokkos_append_config_line("ENDIF()") + endif() +endmacro() # # @MACRO: KOKKOS_IMPORT_TPL() @@ -271,57 +247,43 @@ ENDMACRO() # # If specified, this TPL will build an INTERFACE library rather than an # IMPORTED target -IF (KOKKOS_HAS_TRILINOS) -MACRO(kokkos_import_tpl NAME) - #do nothing -ENDMACRO() -ELSE() -MACRO(kokkos_import_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT;INTERFACE" - "" - "" - ${ARGN}) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - - IF (KOKKOS_ENABLE_${NAME}) +macro(kokkos_import_tpl NAME) + cmake_parse_arguments(TPL "NO_EXPORT;INTERFACE" "" "" ${ARGN}) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + + if(KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find - FIND_PACKAGE(TPL${NAME} REQUIRED MODULE) - IF(NOT TARGET ${TPL_IMPORTED_NAME}) - MESSAGE(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") - ENDIF() - IF(NOT TPL_NO_EXPORT) - GET_TARGET_PROPERTY(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) - IF (NOT TPL_ORIGINAL_NAME) - SET(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) - ENDIF() - KOKKOS_EXPORT_IMPORTED_TPL(${TPL_ORIGINAL_NAME}) - ENDIF() - LIST(APPEND KOKKOS_ENABLED_TPLS ${NAME}) - ENDIF() -ENDMACRO(kokkos_import_tpl) -ENDIF() - -MACRO(kokkos_import_cmake_tpl MODULE_NAME) + find_package(TPL${NAME} REQUIRED MODULE) + if(NOT TARGET ${TPL_IMPORTED_NAME}) + message(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") + endif() + if(NOT TPL_NO_EXPORT) + get_target_property(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) + if(NOT TPL_ORIGINAL_NAME) + set(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) + endif() + kokkos_export_imported_tpl(${TPL_ORIGINAL_NAME}) + endif() + list(APPEND KOKKOS_ENABLED_TPLS ${NAME}) + endif() +endmacro(kokkos_import_tpl) + 
+macro(kokkos_import_cmake_tpl MODULE_NAME) kokkos_import_tpl(${MODULE_NAME} ${ARGN} NO_EXPORT) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT" - "OPTION_NAME" - "" - ${ARGN}) + cmake_parse_arguments(TPL "NO_EXPORT" "OPTION_NAME" "" ${ARGN}) - IF (NOT TPL_OPTION_NAME) - SET(TPL_OPTION_NAME ${MODULE_NAME}) - ENDIF() + if(NOT TPL_OPTION_NAME) + set(TPL_OPTION_NAME ${MODULE_NAME}) + endif() - IF (NOT TPL_NO_EXPORT) - KOKKOS_EXPORT_CMAKE_TPL(${MODULE_NAME}) - ENDIF() -ENDMACRO() + if(NOT TPL_NO_EXPORT) + kokkos_export_cmake_tpl(${MODULE_NAME}) + endif() +endmacro() # # @MACRO: KOKKOS_CREATE_IMPORTED_TPL() @@ -368,68 +330,57 @@ ENDMACRO() # # If specified, this gives a list of linker flags that must be used # for using this library. -MACRO(kokkos_create_imported_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE" - "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" - ${ARGN}) - - - IF (KOKKOS_HAS_TRILINOS) - #TODO: we need to set a bunch of cache variables here - ELSEIF (TPL_INTERFACE) - ADD_LIBRARY(${NAME} INTERFACE) +macro(kokkos_create_imported_tpl NAME) + cmake_parse_arguments( + TPL "INTERFACE" "LIBRARY" "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN} + ) + + if(TPL_INTERFACE) + add_library(${NAME} INTERFACE) #Give this an importy-looking name - ADD_LIBRARY(Kokkos::${NAME} ALIAS ${NAME}) - IF (TPL_LIBRARY) - MESSAGE(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") - ENDIF() + add_library(Kokkos::${NAME} ALIAS ${NAME}) + if(TPL_LIBRARY) + message(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) - ENDIF() - IF(TPL_INCLUDES) - TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE 
${TPL_COMPILE_DEFINITIONS}) - ENDIF() - IF(TPL_COMPILE_OPTIONS) - TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) - ENDIF() - IF(TPL_LINK_OPTIONS) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) - ENDIF() - ELSE() - ADD_LIBRARY(${NAME} UNKNOWN IMPORTED) - IF(TPL_LIBRARY) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - IMPORTED_LOCATION ${TPL_LIBRARY}) - ENDIF() + if(TPL_LINK_LIBRARIES) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) + endif() + if(TPL_INCLUDES) + target_include_directories(${NAME} INTERFACE ${TPL_INCLUDES}) + endif() + if(TPL_COMPILE_DEFINITIONS) + target_compile_definitions(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + endif() + if(TPL_COMPILE_OPTIONS) + target_compile_options(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) + endif() + if(TPL_LINK_OPTIONS) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) + endif() + else() + add_library(${NAME} UNKNOWN IMPORTED) + if(TPL_LIBRARY) + set_target_properties(${NAME} PROPERTIES IMPORTED_LOCATION ${TPL_LIBRARY}) + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") - ENDIF() - IF(TPL_INCLUDES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") - ENDIF() - IF(TPL_COMPILE_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") - ENDIF() - IF(TPL_LINK_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") - ENDIF() - ENDIF() -ENDMACRO() + if(TPL_LINK_LIBRARIES) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") + endif() + if(TPL_INCLUDES) + set_target_properties(${NAME} PROPERTIES 
INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") + endif() + if(TPL_COMPILE_DEFINITIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") + endif() + if(TPL_COMPILE_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") + endif() + if(TPL_LINK_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") + endif() + endif() +endmacro() # # @MACRO: KOKKOS_FIND_HEADER @@ -479,37 +430,32 @@ ENDMACRO() # # Custom paths to search for the header # -MACRO(kokkos_find_header VAR_NAME HEADER TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS" - ${ARGN}) - - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) - - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_PATH(${VAR_NAME} ${HEADER} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} +macro(kokkos_find_header VAR_NAME HEADER TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS" ${ARGN}) + + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) + + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_path( + ${VAR_NAME} ${HEADER} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} PATH_SUFFIXES include - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_PATH(${VAR_NAME} ${HEADER}) - ENDIF() + find_path(${VAR_NAME} ${HEADER}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_LIBRARY @@ -565,42 +511,36 @@ ENDMACRO() # 
Suffixes appended to PATHS when attempting to locate # the library. Defaults to {lib, lib64}. # -MACRO(kokkos_find_library VAR_NAME LIB TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS;SUFFIXES" - ${ARGN}) - - IF(NOT TPL_SUFFIXES) - SET(TPL_SUFFIXES lib lib64) - ENDIF() - - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) - - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_LIBRARY(${VAR_NAME} ${LIB} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} - PATH_SUFFIXES - ${TPL_SUFFIXES} - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() - - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) +macro(kokkos_find_library VAR_NAME LIB TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS;SUFFIXES" ${ARGN}) + + if(NOT TPL_SUFFIXES) + set(TPL_SUFFIXES lib lib64) + endif() + + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) + + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_library( + ${VAR_NAME} ${LIB} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} + PATH_SUFFIXES ${TPL_SUFFIXES} + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() + + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_LIBRARY(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) - ENDIF() + find_library(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_IMPORTED @@ -683,111 +623,127 @@ ENDMACRO() # If specified, this gives a list of paths to search for the headers # If not given, _ROOT/include and _ROOT/include will be searched. 
# -MACRO(kokkos_find_imported NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" - "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" - "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" - ${ARGN}) - - IF(NOT TPL_MODULE_NAME) - SET(TPL_MODULE_NAME TPL${NAME}) - ENDIF() - - IF (TPL_ALLOW_SYSTEM_PATH_FALLBACK) - SET(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) - ELSE() - SET(ALLOW_PATH_FALLBACK_OPT) - ENDIF() - - IF (NOT TPL_IMPORTED_NAME) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - ENDIF() - - IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib) - IF(KOKKOS_IMPL_32BIT) - LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) - ELSE() - LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) - ENDIF() - ENDIF() - - SET(${NAME}_INCLUDE_DIRS) - IF (TPL_HEADER) - KOKKOS_FIND_HEADER(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - ENDIF() - - FOREACH(HEADER ${TPL_HEADERS}) - KOKKOS_FIND_HEADER(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - IF(HEADER_FIND_TEMP) - LIST(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) - ENDIF() - ENDFOREACH() - - SET(${NAME}_LIBRARY) - IF(TPL_LIBRARY) - KOKKOS_FIND_LIBRARY(${NAME}_LIBRARY ${TPL_LIBRARY} ${NAME} +macro(kokkos_find_imported NAME) + cmake_parse_arguments( + TPL "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" + "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" ${ARGN} + ) + + if(NOT TPL_MODULE_NAME) + set(TPL_MODULE_NAME TPL${NAME}) + endif() + + if(TPL_ALLOW_SYSTEM_PATH_FALLBACK) + set(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) + else() + set(ALLOW_PATH_FALLBACK_OPT) + endif() + + if(NOT TPL_IMPORTED_NAME) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + endif() + + if(NOT TPL_LIBRARY_SUFFIXES) + set(TPL_LIBRARY_SUFFIXES lib) + 
if(KOKKOS_IMPL_32BIT) + list(APPEND TPL_LIBRARY_SUFFIXES lib32) + else() + list(APPEND TPL_LIBRARY_SUFFIXES lib64) + endif() + endif() + + set(${NAME}_INCLUDE_DIRS) + if(TPL_HEADER) + kokkos_find_header(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + endif() + + foreach(HEADER ${TPL_HEADERS}) + kokkos_find_header(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + if(HEADER_FIND_TEMP) + list(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) + endif() + endforeach() + + set(${NAME}_LIBRARY) + if(TPL_LIBRARY) + kokkos_find_library( + ${NAME}_LIBRARY + ${TPL_LIBRARY} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - ENDIF() - - SET(${NAME}_FOUND_LIBRARIES) - FOREACH(LIB ${TPL_LIBRARIES}) - KOKKOS_FIND_LIBRARY(${LIB}_LOCATION ${LIB} ${NAME} + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + endif() + + set(${NAME}_FOUND_LIBRARIES) + foreach(LIB ${TPL_LIBRARIES}) + kokkos_find_library( + ${LIB}_LOCATION + ${LIB} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - IF(${LIB}_LOCATION) - LIST(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - ELSE() - SET(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - BREAK() - ENDIF() - ENDFOREACH() - - INCLUDE(FindPackageHandleStandardArgs) + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + if(${LIB}_LOCATION) + list(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + else() + set(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + break() + endif() + endforeach() + + include(FindPackageHandleStandardArgs) #Collect all the variables we need to be valid for #find_package to have succeeded - SET(TPL_VARS_NEEDED) - IF (TPL_LIBRARY) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) - ENDIF() - IF(TPL_HEADER) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) - ENDIF() - IF(TPL_LIBRARIES) - LIST(APPEND 
TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) - ENDIF() - FIND_PACKAGE_HANDLE_STANDARD_ARGS(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) - - MARK_AS_ADVANCED(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) + set(TPL_VARS_NEEDED) + if(TPL_LIBRARY) + list(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) + endif() + if(TPL_HEADER) + list(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) + endif() + if(TPL_LIBRARIES) + list(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) + endif() + find_package_handle_standard_args(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) + + mark_as_advanced(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) #this is so much fun on a Cray system #/usr/include should never be added as a -isystem include #this freaks out the compiler include search order - IF (KOKKOS_IS_CRAYPE) - LIST(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") - ENDIF() - - IF (${TPL_MODULE_NAME}_FOUND) - SET(IMPORT_TYPE) - IF (TPL_INTERFACE) - SET(IMPORT_TYPE "INTERFACE") - SET(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) - ENDIF() - KOKKOS_CREATE_IMPORTED_TPL(${TPL_IMPORTED_NAME} + if(KOKKOS_IS_CRAYPE) + list(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") + endif() + + if(${TPL_MODULE_NAME}_FOUND) + set(IMPORT_TYPE) + if(TPL_INTERFACE) + set(IMPORT_TYPE "INTERFACE") + set(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) + endif() + kokkos_create_imported_tpl( + ${TPL_IMPORTED_NAME} ${IMPORT_TYPE} - INCLUDES "${${NAME}_INCLUDE_DIRS}" - LIBRARY "${${NAME}_LIBRARY}" - LINK_LIBRARIES "${${NAME}_FOUND_LIBRARIES}") - ENDIF() -ENDMACRO(kokkos_find_imported) + INCLUDES + "${${NAME}_INCLUDE_DIRS}" + LIBRARY + "${${NAME}_LIBRARY}" + LINK_LIBRARIES + "${${NAME}_FOUND_LIBRARIES}" + ) + endif() +endmacro(kokkos_find_imported) # # @MACRO: KOKKOS_LINK_TPL() @@ -817,109 +773,114 @@ ENDMACRO(kokkos_find_imported) # If specified, this gives the exact name of the target to link against # target_link_libraries( ) # -FUNCTION(kokkos_link_tpl TARGET) - CMAKE_PARSE_ARGUMENTS(TPL - 
"PUBLIC;PRIVATE;INTERFACE" - "IMPORTED_NAME" - "" - ${ARGN}) +function(kokkos_link_tpl TARGET) + cmake_parse_arguments(TPL "PUBLIC;PRIVATE;INTERFACE" "IMPORTED_NAME" "" ${ARGN}) #the name of the TPL - SET(TPL ${TPL_UNPARSED_ARGUMENTS}) - IF (KOKKOS_HAS_TRILINOS) - #Do nothing, they will have already been linked - ELSE() - IF (NOT TPL_IMPORTED_NAME) - SET(TPL_IMPORTED_NAME Kokkos::${TPL}) - ENDIF() - IF (KOKKOS_ENABLE_${TPL}) - IF (TPL_PUBLIC) - TARGET_LINK_LIBRARIES(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_PRIVATE) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_INTERFACE) - TARGET_LINK_LIBRARIES(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) - ELSE() - TARGET_LINK_LIBRARIES(${TARGET} ${TPL_IMPORTED_NAME}) - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() - -FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA NVHPC DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu MSVC) - CMAKE_PARSE_ARGUMENTS( - PARSE - "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" - "COMPILER_ID" - "${COMPILERS}" - ${ARGN}) - IF(PARSE_UNPARSED_ARGUMENTS) - MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options") - ENDIF() - - IF(PARSE_COMPILER_ID) - SET(COMPILER ${${PARSE_COMPILER_ID}}) - ELSE() - SET(COMPILER ${KOKKOS_CXX_COMPILER_ID}) - ENDIF() - - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) - FOREACH(COMP ${COMPILERS}) - IF (COMPILER STREQUAL "${COMP}") - IF (PARSE_${COMPILER}) - IF ("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") - SET(COMPILER_SPECIFIC_FLAGS_TMP "") - ELSE() - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() - - IF (PARSE_COMPILE_OPTIONS) + set(TPL ${TPL_UNPARSED_ARGUMENTS}) + if(NOT TPL_IMPORTED_NAME) + set(TPL_IMPORTED_NAME Kokkos::${TPL}) + endif() + if(KOKKOS_ENABLE_${TPL}) + if(TPL_PUBLIC) + target_link_libraries(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) + 
elseif(TPL_PRIVATE) + target_link_libraries(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) + elseif(TPL_INTERFACE) + target_link_libraries(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) + else() + target_link_libraries(${TARGET} ${TPL_IMPORTED_NAME}) + endif() + endif() +endfunction() + +function(COMPILER_SPECIFIC_OPTIONS_HELPER) + set(COMPILERS + NVIDIA + NVHPC + DEFAULT + Cray + Intel + Clang + AppleClang + IntelLLVM + GNU + HIPCC + Fujitsu + MSVC + CrayClang + ) + cmake_parse_arguments( + PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" "COMPILER_ID" "${COMPILERS}" ${ARGN} + ) + if(PARSE_UNPARSED_ARGUMENTS) + message( + SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options" + ) + endif() + + if(PARSE_COMPILER_ID) + set(COMPILER ${${PARSE_COMPILER_ID}}) + else() + set(COMPILER ${KOKKOS_CXX_COMPILER_ID}) + endif() + + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) + foreach(COMP ${COMPILERS}) + if(COMPILER STREQUAL "${COMP}") + if(PARSE_${COMPILER}) + if("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") + set(COMPILER_SPECIFIC_FLAGS_TMP "") + else() + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) + endif() + endif() + endif() + endforeach() + + if(PARSE_COMPILE_OPTIONS) # The funky logic here is for future handling of argument deduplication # If we naively pass multiple -Xcompiler flags to target_compile_options # -Xcompiler will get deduplicated and break the build - IF ("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) - LIST(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") - GLOBAL_APPEND(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ELSE() - GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - ENDIF() - - IF (PARSE_LINK_OPTIONS) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - - IF (PARSE_COMPILE_DEFINITIONS) - GLOBAL_APPEND(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - - IF 
(PARSE_LINK_LIBRARIES) - GLOBAL_APPEND(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - -FUNCTION(COMPILER_SPECIFIC_FLAGS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_FLAGS) - -FUNCTION(COMPILER_SPECIFIC_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS) - -FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - -FUNCTION(COMPILER_SPECIFIC_DEFS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS) -ENDFUNCTION(COMPILER_SPECIFIC_DEFS) - -FUNCTION(COMPILER_SPECIFIC_LIBS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) -ENDFUNCTION(COMPILER_SPECIFIC_LIBS) + if("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) + list(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") + global_append(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + else() + global_append(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + endif() + + if(PARSE_LINK_OPTIONS) + global_append(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + + if(PARSE_COMPILE_DEFINITIONS) + global_append(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + + if(PARSE_LINK_LIBRARIES) + global_append(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() +endfunction(COMPILER_SPECIFIC_OPTIONS_HELPER) + +function(COMPILER_SPECIFIC_FLAGS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_FLAGS) + +function(COMPILER_SPECIFIC_OPTIONS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS) +endfunction(COMPILER_SPECIFIC_OPTIONS) + +function(COMPILER_SPECIFIC_LINK_OPTIONS) + compiler_specific_options_helper(${ARGN} LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_LINK_OPTIONS) + +function(COMPILER_SPECIFIC_DEFS) + 
compiler_specific_options_helper(${ARGN} COMPILE_DEFINITIONS) +endfunction(COMPILER_SPECIFIC_DEFS) + +function(COMPILER_SPECIFIC_LIBS) + compiler_specific_options_helper(${ARGN} LINK_LIBRARIES) +endfunction(COMPILER_SPECIFIC_LIBS) # Given a list of the form # key1;value1;key2;value2,... # Create a list of all keys in a variable named ${KEY_LIST_NAME} @@ -927,41 +888,42 @@ ENDFUNCTION(COMPILER_SPECIFIC_LIBS) # kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2) # would produce a list variable ALL_ARCHES=key1;key2 # and individual variables ARCHkey1=value1 and ARCHkey2=value2 -MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) - SET(PARSE_KEY ON) - SET(${KEY_LIST_NAME}) - FOREACH(ENTRY ${ARGN}) - IF(PARSE_KEY) - SET(CURRENT_KEY ${ENTRY}) - SET(PARSE_KEY OFF) - LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) - ELSE() - SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) - SET(PARSE_KEY ON) - ENDIF() - ENDFOREACH() -ENDMACRO() - -FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) - KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) - FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST}) - SET(OPTION_NAME Kokkos_${OPTION_SUFFIX}) - SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) - IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off - MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. 
${OPT_MESSAGE}") - ENDIF() - ENDFOREACH() -ENDFUNCTION() +macro(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) + set(PARSE_KEY ON) + set(${KEY_LIST_NAME}) + foreach(ENTRY ${ARGN}) + if(PARSE_KEY) + set(CURRENT_KEY ${ENTRY}) + set(PARSE_KEY OFF) + list(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) + else() + set(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) + set(PARSE_KEY ON) + endif() + endforeach() +endmacro() + +function(KOKKOS_CHECK_DEPRECATED_OPTIONS) + kokkos_key_value_map(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) + foreach(OPTION_SUFFIX ${DEPRECATED_LIST}) + set(OPTION_NAME Kokkos_${OPTION_SUFFIX}) + set(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) + if(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off + message(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPTION_MESSAGE}") + endif() + endforeach() +endfunction() # this function checks whether the current CXX compiler supports building CUDA -FUNCTION(kokkos_cxx_compiler_cuda_test _VAR) - # don't run this test every time - IF(DEFINED ${_VAR}) - RETURN() - ENDIF() - - FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp -" +function(kokkos_cxx_compiler_cuda_test _VAR) + # don't run this test every time + if(DEFINED ${_VAR}) + return() + endif() + + file( + WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp + " #include #include @@ -985,14 +947,13 @@ int main() cudaDeviceSynchronize(); return EXIT_SUCCESS; } -") +" + ) - TRY_COMPILE(_RET - ${PROJECT_BINARY_DIR}/compile_tests - SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) + try_compile(_RET ${PROJECT_BINARY_DIR}/compile_tests SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) - SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") -ENDFUNCTION() + set(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") +endfunction() # this function is provided to easily select which files use nvcc_wrapper: # @@ -1005,58 +966,77 @@ 
ENDFUNCTION() # NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. # This version explicitly uses nvcc_wrapper. # -FUNCTION(kokkos_compilation) - # check whether the compiler already supports building CUDA - KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) - # if CUDA compile test has already been performed, just return - IF(Kokkos_CXX_COMPILER_COMPILES_CUDA) - RETURN() - ENDIF() - - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) - - # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") - ENDIF() - - # find nvcc_wrapper - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") - ENDIF() - - IF(COMP_GLOBAL) - # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ELSE() - FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) - # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) 
- IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) - LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) - UNSET(COMP_${_TYPE}) - ENDIF() - # set the properties if defined - IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() +function(kokkos_compilation) + # check whether the compiler already supports building CUDA + kokkos_cxx_compiler_cuda_test(Kokkos_CXX_COMPILER_COMPILES_CUDA) + # if CUDA compile test has already been performed, just return + if(Kokkos_CXX_COMPILER_COMPILES_CUDA) + return() + endif() + + cmake_parse_arguments(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + + # find kokkos_launch_compiler + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR + "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'" + ) + endif() + + # find nvcc_wrapper + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + if(NOT Kokkos_NVCC_WRAPPER) + message( + FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. 
Please set '-DKokkos_NVCC_WRAPPER=/path/to/nvcc_wrapper'" + ) + endif() + + if(COMP_GLOBAL) + # if global, don't bother setting others + set_property( + GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + else() + foreach(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) + if("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + list(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + unset(COMP_${_TYPE}) + endif() + # set the properties if defined + if(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + endif() + endforeach() + endif() +endfunction() ## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names ## and create output config header file...used for ## creating dynamic include files based on enabled backends @@ -1066,14 +1046,15 @@ ENDFUNCTION() ## HEADER_GUARD TEXT used with include header guard ## HEADER_PREFIX prefix used with include (i.e. 
fwd, decl, setup) ## DATA_LIST list of backends to include in generated file -FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) - SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") - CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) - FOREACH( BACKEND_NAME ${DATA_LIST} ) - SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> -\@INCLUDE_NEXT_FILE\@") - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) - ENDFOREACH() - SET(INCLUDE_NEXT_FILE "" ) - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) -ENDFUNCTION() +function(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + set(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + configure_file(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + foreach(BACKEND_NAME ${DATA_LIST}) + set(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> +\@INCLUDE_NEXT_FILE\@" + ) + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + endforeach() + set(INCLUDE_NEXT_FILE "") + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +endfunction() diff --git a/lib/kokkos/cmake/kokkos_install.cmake b/lib/kokkos/cmake/kokkos_install.cmake index f818dfa2448..3ae7570ffea 100644 --- a/lib/kokkos/cmake/kokkos_install.cmake +++ b/lib/kokkos/cmake/kokkos_install.cmake @@ -1,57 +1,51 @@ -INCLUDE(CMakePackageConfigHelpers) -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - INCLUDE(GNUInstallDirs) +include(CMakePackageConfigHelpers) +if(NOT Kokkos_INSTALL_TESTING) + include(GNUInstallDirs) #Set all the variables needed for KokkosConfig.cmake - GET_PROPERTY(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - SET(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) + get_property(KOKKOS_PROP_LIBS GLOBAL 
PROPERTY KOKKOS_LIBRARIES_NAMES) + set(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) - INCLUDE(CMakePackageConfigHelpers) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfig.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + include(CMakePackageConfigHelpers) + configure_package_config_file( + cmake/KokkosConfig.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfigCommon.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + configure_package_config_file( + cmake/KokkosConfigCommon.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - WRITE_BASIC_PACKAGE_VERSION_FILE("${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) # Install the KokkosConfig*.cmake files - install(FILES - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) + install(FILES "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos + ) install(EXPORT KokkosTargets NAMESPACE Kokkos:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${Kokkos_BINARY_DIR}/KokkosTargets.cmake) # Required to be a TriBITS-compliant external package file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake - 
${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake) -ELSE() - CONFIGURE_FILE(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake KOKKOS_CONFIG_COMMON) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_CONFIG_COMMON}") - CONFIGURE_FILE(cmake/KokkosTrilinosConfig.cmake.in ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake KOKKOS_TRILINOS_CONFIG) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_TRILINOS_CONFIG}") - - WRITE_BASIC_PACKAGE_VERSION_FILE("${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake + ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos + ) + file(WRITE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake + "include(${Kokkos_BINARY_DIR}/KokkosTargets.cmake)" + ) +else() + configure_file(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) + + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/${PACKAGE_NAME}") -ENDIF() - -INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) + DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/Kokkos" + ) +endif() +install(FILES 
${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) diff --git a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake index ae14a10d531..0d31e6d131f 100644 --- a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,20 +1,28 @@ # From CMake 3.10 documentation #This can run at any time -KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17") +kokkos_option( + CXX_STANDARD + "" + STRING + "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17" +) # Set CXX standard flags -SET(KOKKOS_ENABLE_CXX17 OFF) -SET(KOKKOS_ENABLE_CXX20 OFF) -SET(KOKKOS_ENABLE_CXX23 OFF) -SET(KOKKOS_ENABLE_CXX26 OFF) -IF (KOKKOS_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") -ENDIF() +set(KOKKOS_ENABLE_CXX17 OFF) +set(KOKKOS_ENABLE_CXX20 OFF) +set(KOKKOS_ENABLE_CXX23 OFF) +set(KOKKOS_ENABLE_CXX26 OFF) +if(KOKKOS_CXX_STANDARD) + message( + FATAL_ERROR + "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead" + ) +endif() -IF (NOT CMAKE_CXX_STANDARD) - SET(KOKKOS_CXX_STANDARD "17") -ELSE() - SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -ENDIF() -MESSAGE(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") +if(NOT CMAKE_CXX_STANDARD) + set(KOKKOS_CXX_STANDARD "17") +else() + set(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) +endif() +message(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") diff --git 
a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index 5b45674e057..a84e714064d 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -1,101 +1,112 @@ -KOKKOS_CFG_DEPENDS(CXX_STD COMPILER_ID) +kokkos_cfg_depends(CXX_STD COMPILER_ID) -FUNCTION(kokkos_set_cxx_standard_feature standard) - SET(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) - SET(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) - SET(FEATURE_NAME cxx_std_${standard}) +function(kokkos_set_cxx_standard_feature standard) + set(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) + set(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) + set(FEATURE_NAME cxx_std_${standard}) #CMake's way of telling us that the standard (or extension) #flags are supported is the extension/standard variables - IF (NOT DEFINED CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSEIF(CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - MESSAGE(FATAL_ERROR "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue") - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSE() - #For trilinos, we need to make sure downstream projects - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ENDIF() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + global_set(KOKKOS_USE_CXX_EXTENSIONS OFF) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + elseif(CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + message( + FATAL_ERROR + "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. 
Must set CMAKE_CXX_EXTENSIONS=OFF to continue" + ) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + endif() - IF (KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) - MESSAGE(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) - MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") - IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - IF(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) - SET(SUPPORTED_NVCC_FLAGS "-std=c++17") - ELSE() - SET(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") - ENDIF() - IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) - MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help.") - ENDIF() - ENDIF() - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + if(KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) + message(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) + message(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU + OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang) + ) + if(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) + set(SUPPORTED_NVCC_FLAGS "-std=c++17") + else() + set(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") + endif() + if(NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) + message( + FATAL_ERROR + "CMake wants to use ${${STANDARD_NAME}} which is not supported by 
NVCC. Using a more recent host compiler or a more recent CMake version might help." + ) + endif() + endif() + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") #MSVC doesn't need a command line flag, that doesn't mean it has no support - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSE() + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu") + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + else() #nope, we can't do anything here - MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ENDIF() + message( + WARNING + "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. 
Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command." + ) + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + endif() - IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) - IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) - MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") - ENDIF() - ENDIF() -ENDFUNCTION() + if((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + if(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message( + FATAL_ERROR + "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported" + ) + endif() + endif() +endfunction() -IF(KOKKOS_CXX_STANDARD STREQUAL "17") +if(KOKKOS_CXX_STANDARD STREQUAL "17") kokkos_set_cxx_standard_feature(17) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") - SET(KOKKOS_ENABLE_CXX17 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") + set(KOKKOS_ENABLE_CXX17 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "20") kokkos_set_cxx_standard_feature(20) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") - SET(KOKKOS_ENABLE_CXX20 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") + set(KOKKOS_ENABLE_CXX20 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") - SET(KOKKOS_ENABLE_CXX23 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") + set(KOKKOS_ENABLE_CXX23 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "26") kokkos_set_cxx_standard_feature(26) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") - SET(KOKKOS_ENABLE_CXX26 ON) -ELSE() - MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") 
-ENDIF() + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + set(KOKKOS_ENABLE_CXX26 ON) +else() + message(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") +endif() # Enforce that we can compile a simple C++17 program -TRY_COMPILE(CAN_COMPILE_CPP17 - ${KOKKOS_TOP_BUILD_DIR}/corner_cases - ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp - OUTPUT_VARIABLE ERROR_MESSAGE - CXX_STANDARD 17 +try_compile( + CAN_COMPILE_CPP17 ${KOKKOS_TOP_BUILD_DIR}/corner_cases ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp + OUTPUT_VARIABLE ERROR_MESSAGE CXX_STANDARD 17 ) -if (NOT CAN_COMPILE_CPP17) - UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") -ENDIF() -UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - +if(NOT CAN_COMPILE_CPP17) + unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this + message( + FATAL_ERROR + "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}" + ) +endif() +unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's @@ -105,66 +116,70 @@ UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # that we can only use host compilers for CUDA builds that use those flags. # It also means that extensions (gnu++17) can't be turned on for CUDA builds. 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") + endif() +endif() -IF(KOKKOS_ENABLE_CUDA) +if(KOKKOS_ENABLE_CUDA) # ENFORCE that the compiler can compile CUDA code. - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) - MESSAGE(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") - ENDIF() - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() - ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") - ENDIF() -ENDIF() + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) + message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") + endif() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message( + FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF" + ) + endif() + elseif(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + message( + FATAL_ERROR + "Invalid compiler for CUDA. 
The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}" + ) + endif() +endif() -IF (NOT KOKKOS_CXX_STANDARD_FEATURE) +if(NOT KOKKOS_CXX_STANDARD_FEATURE) #we need to pick the C++ flags ourselves - UNSET(CMAKE_CXX_STANDARD) - UNSET(CMAKE_CXX_STANDARD CACHE) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) + unset(CMAKE_CXX_STANDARD) + unset(CMAKE_CXX_STANDARD CACHE) + if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) + include(${KOKKOS_SRC_PATH}/cmake/cray.cmake) kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + include(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + include(${KOKKOS_SRC_PATH}/cmake/intel.cmake) kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) + include(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) + else() + include(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) kokkos_set_gnu_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ENDIF() + endif() #check that the compiler accepts the C++ standard flag - INCLUDE(CheckCXXCompilerFlag) - IF (DEFINED CXX_STD_FLAGS_ACCEPTED) - UNSET(CXX_STD_FLAGS_ACCEPTED CACHE) - ENDIF() - 
CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) - IF (NOT CXX_STD_FLAGS_ACCEPTED) - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) - IF (NOT CXX_INT_STD_FLAGS_ACCEPTED) - MESSAGE(FATAL_ERROR "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}") - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) - ENDIF() - MESSAGE(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") -ENDIF() - - - - + include(CheckCXXCompilerFlag) + if(DEFINED CXX_STD_FLAGS_ACCEPTED) + unset(CXX_STD_FLAGS_ACCEPTED CACHE) + endif() + check_cxx_compiler_flag("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) + if(NOT CXX_STD_FLAGS_ACCEPTED) + check_cxx_compiler_flag("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) + if(NOT CXX_INT_STD_FLAGS_ACCEPTED) + message( + FATAL_ERROR + "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. 
You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}" + ) + endif() + set(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) + endif() + message(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") +endif() diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index cda9e0d6004..f43aff4d1f0 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -1,126 +1,120 @@ -KOKKOS_CFG_DEPENDS(TPLS OPTIONS) -KOKKOS_CFG_DEPENDS(TPLS DEVICES) -KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) +kokkos_cfg_depends(TPLS OPTIONS) +kokkos_cfg_depends(TPLS DEVICES) +kokkos_cfg_depends(TPLS COMPILER_ID) -FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) - CMAKE_PARSE_ARGUMENTS(PARSED - "" - "TRIBITS" - "" - ${ARGN}) +function(KOKKOS_TPL_OPTION PKG DEFAULT) + cmake_parse_arguments(PARSED "" "TRIBITS" "" ${ARGN}) - IF (PARSED_TRIBITS) + if(PARSED_TRIBITS) #this is also a TPL option you can activate with Tribits - IF (NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") + if(NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") #Tribits brought its own default that should take precedence - SET(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) - ENDIF() - ENDIF() + set(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) + endif() + endif() - KOKKOS_ENABLE_OPTION(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") - KOKKOS_OPTION(${PKG}_DIR "" PATH "Location of ${PKG} library") - SET(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) - SET(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) + kokkos_enable_option(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") + kokkos_option(${PKG}_DIR "" PATH "Location of ${PKG} library") + set(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) + set(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) +endfunction() - IF (KOKKOS_HAS_TRILINOS - AND KOKKOS_ENABLE_${PKG} - AND NOT PARSED_TRIBITS) - #this 
TPL was enabled, but it is not valid to use inside of TriBITS - MESSAGE(FATAL_ERROR "Enabled TPL ${PKG} inside TriBITS build, " - "but this can only be enabled in a standalone build") - ENDIF() -ENDFUNCTION() - -KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT - KOKKOS_HAS_TRILINOS) - SET(ROCM_DEFAULT ON) -ELSE() - SET(ROCM_DEFAULT OFF) -ENDIF() -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) - SET(ROCTHRUST_DEFAULT ON) -ELSE() - SET(ROCTHRUST_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) -KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) +kokkos_tpl_option(HWLOC Off TRIBITS HWLOC) +kokkos_tpl_option(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) +if(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + set(ROCM_DEFAULT ON) +else() + set(ROCM_DEFAULT OFF) +endif() +if(KOKKOS_ENABLE_HIP) + set(ROCTHRUST_DEFAULT ON) +else() + set(ROCTHRUST_DEFAULT OFF) +endif() +kokkos_tpl_option(ROCM ${ROCM_DEFAULT}) +kokkos_tpl_option(ROCTHRUST ${ROCTHRUST_DEFAULT}) +if(Kokkos_ENABLE_ROCTHRUST) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " + #include + int main() { + static_assert(_GLIBCXX_RELEASE < 9); + return 0; + } + " + Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + ) +endif() -IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) - SET(ONEDPL_DEFAULT ON) -ELSE() - SET(ONEDPL_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ONEDPL ${ONEDPL_DEFAULT}) +if(KOKKOS_ENABLE_SYCL) + set(ONEDPL_DEFAULT ON) +else() + set(ONEDPL_DEFAULT OFF) +endif() +kokkos_tpl_option(ONEDPL ${ONEDPL_DEFAULT}) -IF (WIN32) - SET(LIBDL_DEFAULT Off) -ELSE() - SET(LIBDL_DEFAULT On) -ENDIF() -KOKKOS_TPL_OPTION(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) +if(WIN32) + set(LIBDL_DEFAULT Off) +else() + set(LIBDL_DEFAULT On) +endif() +kokkos_tpl_option(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) -SET(HPX_DEFAULT ON) 
-ELSE() -SET(HPX_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(HPX ${HPX_DEFAULT}) +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) + set(HPX_DEFAULT ON) +else() + set(HPX_DEFAULT OFF) +endif() +kokkos_tpl_option(HPX ${HPX_DEFAULT}) -KOKKOS_TPL_OPTION(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) +kokkos_tpl_option(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) - SET(LIBQUADMATH_DEFAULT ON) -ELSE() - SET(LIBQUADMATH_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) + set(LIBQUADMATH_DEFAULT ON) +else() + set(LIBQUADMATH_DEFAULT OFF) +endif() +kokkos_tpl_option(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake -KOKKOS_IMPORT_TPL(HPX INTERFACE) -KOKKOS_IMPORT_TPL(CUDA INTERFACE) -KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBDL) -IF (NOT WIN32) - KOKKOS_IMPORT_TPL(THREADS INTERFACE) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(ROCM INTERFACE) -ENDIF() -KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) -KOKKOS_IMPORT_TPL(LIBQUADMATH) -KOKKOS_IMPORT_TPL(ROCTHRUST) +kokkos_import_tpl(HPX INTERFACE) +kokkos_import_tpl(CUDA INTERFACE) +kokkos_import_tpl(HWLOC) +kokkos_import_tpl(LIBDL) +if(NOT WIN32) + kokkos_import_tpl(THREADS INTERFACE) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_import_tpl(ROCM INTERFACE) +endif() +kokkos_import_tpl(ONEDPL INTERFACE) +kokkos_import_tpl(LIBQUADMATH) +kokkos_import_tpl(ROCTHRUST) -IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) +if(Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) - KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS atomics) -ENDIF() + kokkos_export_cmake_tpl(desul REQUIRED COMPONENTS atomics) +endif() -if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) +if(Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) find_package(mdspan 
REQUIRED) - KOKKOS_EXPORT_CMAKE_TPL(mdspan REQUIRED) + kokkos_export_cmake_tpl(mdspan REQUIRED) endif() -IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED COMPONENTS CXX) - # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency - # so we just append the flags here instead of linking with the OpenMP target. - IF(KOKKOS_HAS_TRILINOS) - COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) - ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) - ENDIF() - IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) - ENDIF() - IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) - ENDIF() -ENDIF() +if(Kokkos_ENABLE_OPENMP) + find_package(OpenMP 3.0 REQUIRED COMPONENTS CXX) + kokkos_export_cmake_tpl(OpenMP REQUIRED COMPONENTS CXX) + if(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + global_append(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + endif() + if(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + global_append(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + endif() +endif() #Convert list to newlines (which CMake doesn't always like in cache variables) -STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") +string(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable -UNSET(KOKKOS_TPL_EXPORTS CACHE) -SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) +unset(KOKKOS_TPL_EXPORTS CACHE) +set(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 6da543a2c85..2fda803b118 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -1,82 +1,47 @@ #These are tribits wrappers only ever called by Kokkos itself -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) -INCLUDE(GNUInstallDirs) 
+include(CMakeParseArguments) +include(CTest) +include(GNUInstallDirs) -MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") +message(STATUS "The project name is: ${PROJECT_NAME}") -IF(GTest_FOUND) - SET(KOKKOS_GTEST_LIB GTest::gtest) - MESSAGE(STATUS "Using gtest found in ${GTest_DIR}") -ELSE() # fallback to internal gtest - SET(KOKKOS_GTEST_LIB kokkos_gtest) - MESSAGE(STATUS "Using internal gtest for testing") -ENDIF() +if(GTest_FOUND) + set(KOKKOS_GTEST_LIB GTest::gtest) + message(STATUS "Using gtest found in ${GTest_DIR}") +else() # fallback to internal gtest + set(KOKKOS_GTEST_LIB kokkos_gtest) + message(STATUS "Using internal gtest for testing") +endif() -FUNCTION(VERIFY_EMPTY CONTEXT) +function(VERIFY_EMPTY CONTEXT) if(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") + message(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") endif() -ENDFUNCTION() - -#Leave this here for now - but only do for tribits -#This breaks the standalone CMake -IF (KOKKOS_HAS_TRILINOS) - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP) - SET(${PROJECT_NAME}_ENABLE_OpenMP OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX) - SET(${PROJECT_NAME}_ENABLE_HPX OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG) - SET(${PROJECT_NAME}_ENABLE_DEBUG OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS) - SET(${PROJECT_NAME}_ENABLE_TESTS OFF) - ENDIF() - - IF(NOT DEFINED TPL_ENABLE_Pthread) - SET(TPL_ENABLE_Pthread OFF) - ENDIF() -ENDIF() - -MACRO(KOKKOS_PROCESS_SUBPACKAGES) - ADD_SUBDIRECTORY(core) - ADD_SUBDIRECTORY(containers) - ADD_SUBDIRECTORY(algorithms) - ADD_SUBDIRECTORY(simd) - if (NOT KOKKOS_HAS_TRILINOS) - ADD_SUBDIRECTORY(example) - ADD_SUBDIRECTORY(benchmarks) - endif() -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_DEF) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_DEF() - else() - #do nothing - endif() -ENDMACRO() - 
-MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) - KOKKOS_LIB_TYPE(${LIBRARY_NAME} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${LIBRARY_NAME} ${INCTYPE} $) - - INSTALL( +endfunction() + +macro(KOKKOS_PROCESS_SUBPACKAGES) + add_subdirectory(core) + add_subdirectory(containers) + add_subdirectory(algorithms) + add_subdirectory(simd) + add_subdirectory(example) + add_subdirectory(benchmarks) +endmacro() + +macro(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) + kokkos_lib_type(${LIBRARY_NAME} INCTYPE) + target_include_directories(${LIBRARY_NAME} ${INCTYPE} $) + + install( TARGETS ${LIBRARY_NAME} EXPORT ${PROJECT_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT ${PACKAGE_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT ${PACKAGE_NAME} ) - INSTALL( + install( TARGETS ${LIBRARY_NAME} EXPORT KokkosTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -84,157 +49,131 @@ MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) - VERIFY_EMPTY(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDMACRO() + verify_empty(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endmacro() + +function(KOKKOS_ADD_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "TESTONLY" "" "SOURCES;TESTONLYLIBS" ${ARGN}) -FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${ROOT_NAME} ${ARGN}) + set_source_files_properties(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + add_executable(${EXE_NAME} ${PARSE_SOURCES}) + if(PARSE_TESTONLYLIBS) + target_link_libraries(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) + endif() + verify_empty(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) + #All executables must link to all the kokkos targets + #This is just private linkage because exe is final + target_link_libraries(${EXE_NAME} PRIVATE 
Kokkos::kokkos) +endfunction() + +function(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES;CATEGORIES;ARGS" ${ARGN}) + verify_empty(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + + kokkos_add_test_executable(${ROOT_NAME} SOURCES ${PARSE_SOURCES}) + if(PARSE_ARGS) + set(TEST_NUMBER 0) + foreach(ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + string(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + list(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + math(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + kokkos_add_test( + NAME + ${TEST_NAME} + EXE + ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION + " FAILED " + ARGS + ${ARG_STR_LIST} + ) + endforeach() else() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) - - SET_SOURCE_FILES_PROPERTIES(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - #All executables must link to all the kokkos targets - #This is just private linkage because exe is final - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos) + kokkos_add_test(NAME ${ROOT_NAME} EXE ${ROOT_NAME} FAIL_REGULAR_EXPRESSION " FAILED ") + endif() + # We noticed problems with -fvisibility=hidden for inline static variables + # if Kokkos was built as shared library. 
+ if(BUILD_SHARED_LIBS AND NOT ${TEST_NAME}_DISABLE) + set_property(TARGET ${EXE_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) + set_property(TARGET ${EXE_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) + endif() + if(NOT + (Kokkos_INSTALL_TESTING + OR Kokkos_ENABLE_SYCL + OR Kokkos_ENABLE_HPX + OR Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "Intel" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.2.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.3.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC")) + ) + if(MSVC) + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "/GR-") + else() + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "-fno-rtti") + endif() + endif() +endfunction() + +function(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) + set(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + if(NOT TARGET ${TARGET_NAME}) + message(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") endif() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - - IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) - ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - IF (PARSE_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${PARSE_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST 
${ARG_STR}) - LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - KOKKOS_ADD_TEST(NAME ${TEST_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${ARG_STR_LIST} - ) - ENDFOREACH() - ELSE() - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ) - ENDIF() - ENDIF() - # We noticed problems with -fvisibility=hidden for inline static variables - # if Kokkos was built as shared library. - IF(BUILD_SHARED_LIBS) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) - ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) - SET(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - IF (NOT TARGET ${TARGET_NAME}) - MESSAGE(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") - ENDIF() - SET_PROPERTY(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT) + set_property(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) +endfunction() + +macro(KOKKOS_SETUP_BUILD_ENVIRONMENT) # This is needed for both regular build and install tests - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) #set an internal option, if not already set - SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") - IF (Kokkos_INSTALL_TESTING) - SET(KOKKOS_ENABLE_TESTS ON) - SET(KOKKOS_ENABLE_BENCHMARKS ON) - SET(KOKKOS_ENABLE_EXAMPLES ON) + set(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") + if(Kokkos_INSTALL_TESTING) + set(KOKKOS_ENABLE_TESTS ON) + set(KOKKOS_ENABLE_BENCHMARKS ON) + set(KOKKOS_ENABLE_EXAMPLES ON) # This looks a little weird, but what we are doing # is to NOT build Kokkos but instead look for an # installed Kokkos - then build 
examples and tests # against that installed Kokkos - FIND_PACKAGE(Kokkos REQUIRED) + find_package(Kokkos REQUIRED) # Just grab the configuration from the installation - FOREACH(DEV ${Kokkos_DEVICES}) - SET(KOKKOS_ENABLE_${DEV} ON) - ENDFOREACH() - FOREACH(OPT ${Kokkos_OPTIONS}) - SET(KOKKOS_ENABLE_${OPT} ON) - ENDFOREACH() - FOREACH(TPL ${Kokkos_TPLS}) - SET(KOKKOS_ENABLE_${TPL} ON) - ENDFOREACH() - FOREACH(ARCH ${Kokkos_ARCH}) - SET(KOKKOS_ARCH_${ARCH} ON) - ENDFOREACH() - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) - IF (NOT KOKKOS_HAS_TRILINOS) - SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") - ENDIF() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES" - ${ARGN}) - KOKKOS_ADD_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ${PARSE_UNPARSED_ARGUMENTS} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - ) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_POSTPROCESS) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_POSTPROCESS() + foreach(DEV ${Kokkos_DEVICES}) + set(KOKKOS_ENABLE_${DEV} ON) + endforeach() + foreach(OPT ${Kokkos_OPTIONS}) + set(KOKKOS_ENABLE_${OPT} ON) + endforeach() + foreach(TPL ${Kokkos_TPLS}) + set(KOKKOS_ENABLE_${TPL} ON) + endforeach() + foreach(ARCH ${Kokkos_ARCH}) + set(KOKKOS_ARCH_${ARCH} ON) + endforeach() + else() + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 
"${Kokkos_SOURCE_DIR}/cmake/Modules/") + include(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) endif() -ENDMACRO() +endmacro() + +macro(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES" ${ARGN}) + # Don't do anything if the user disabled the test + if(NOT ${PACKAGE_NAME}_${ROOT_NAME}_DISABLE) + kokkos_add_executable( + ${ROOT_NAME} SOURCES ${PARSE_SOURCES} ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS ${KOKKOS_GTEST_LIB} + ) + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + endif() +endmacro() ## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based ## on enabled backends. @@ -242,265 +181,214 @@ ENDMACRO() ## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp -MACRO(KOKKOS_CONFIGURE_CORE) - MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") - CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) -ENDMACRO() +macro(KOKKOS_CONFIGURE_CORE) + message(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" + "${KOKKOS_ENABLED_DEVICES}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" + "${DEVICE_SETUP_LIST}" + ) + kokkos_config_header( + 
KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" + "${KOKKOS_ENABLED_DEVICES}" + ) + configure_file(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +endmacro() ## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination. ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. -MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) +macro(KOKKOS_INSTALL_ADDITIONAL_FILES) # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler - IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") - ELSE() - IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") - ENDIF() - ENDIF() - - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler - ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler - @ONLY) - - INSTALL(PROGRAMS - "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" - "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" - "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" - DESTINATION ${CMAKE_INSTALL_BINDIR}) - INSTALL(FILES - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" + if(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + set(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + else() + if(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + set(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + endif() + endif() + + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler @ONLY + ) + + install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" + 
"${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - DESTINATION ${KOKKOS_HEADER_DIR}) -ENDMACRO() - + DESTINATION ${KOKKOS_HEADER_DIR} + ) +endmacro() -FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "PLAIN_STYLE" - "" - "" - ${ARGN}) +function(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) + cmake_parse_arguments(PARSE "PLAIN_STYLE" "" "" ${ARGN}) - IF((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) + if((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) #I can use link options #check for CXX linkage using the simple 3.18 way - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_LINK_OPTIONS}> - ) - ELSE() + target_link_options(${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_LINK_OPTIONS}>) + else() #I can use link options #just assume CXX linkage - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() + target_link_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}) + endif() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_OPTIONS}> + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_OPTIONS}> ) - TARGET_COMPILE_DEFINITIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_DEFINITIONS}> + target_compile_definitions( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_DEFINITIONS}> ) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES} - ) + target_link_libraries(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES}) - IF (KOKKOS_ENABLE_CUDA) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> + if(KOKKOS_ENABLE_CUDA) + 
target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> ) - SET(NODEDUP_CUDAFE_OPTIONS) - FOREACH(OPT ${KOKKOS_CUDAFE_OPTIONS}) - LIST(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> + set(NODEDUP_CUDAFE_OPTIONS) + foreach(OPT ${KOKKOS_CUDAFE_OPTIONS}) + list(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_ENABLE_HIP) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> + if(KOKKOS_ENABLE_HIP) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> ) - ENDIF() - - LIST(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) - IF (XOPT_LENGTH GREATER 1) - MESSAGE(FATAL_ERROR "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12") - ENDIF() - IF(KOKKOS_XCOMPILER_OPTIONS) - SET(NODEDUP_XCOMPILER_OPTIONS) - FOREACH(OPT ${KOKKOS_XCOMPILER_OPTIONS}) + endif() + + list(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) + if(XOPT_LENGTH GREATER 1) + message( + FATAL_ERROR + "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12" + ) + endif() + if(KOKKOS_XCOMPILER_OPTIONS) + set(NODEDUP_XCOMPILER_OPTIONS) + foreach(OPT ${KOKKOS_XCOMPILER_OPTIONS}) #I have to do this for now because we can't guarantee 3.12 support #I really should do this with the shell option - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_XCOMPILER_OPTIONS}> + list(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) + list(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC 
$<$:${NODEDUP_XCOMPILER_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_CXX_STANDARD_FEATURE) + if(KOKKOS_CXX_STANDARD_FEATURE) #GREAT! I can do this the right way - TARGET_COMPILE_FEATURES(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) - IF (NOT KOKKOS_USE_CXX_EXTENSIONS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) - ENDIF() - ELSE() + target_compile_features(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) + if(NOT KOKKOS_USE_CXX_EXTENSIONS) + set_target_properties(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) + endif() + else() #OH, well, no choice but the wrong way - TARGET_COMPILE_OPTIONS(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES" - ${ARGN}) - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - FOREACH(source ${PARSE_SOURCES}) + target_compile_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) + endif() +endfunction() + +function(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES" ${ARGN}) + + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) + endif() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + foreach(source ${PARSE_SOURCES}) set_source_files_properties(${source} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - ENDFOREACH() + endforeach() - IF(PARSE_STATIC) - SET(LINK_TYPE STATIC) - ENDIF() + if(PARSE_STATIC) + set(LINK_TYPE STATIC) + endif() - IF(PARSE_SHARED) - SET(LINK_TYPE SHARED) - ENDIF() + if(PARSE_SHARED) + set(LINK_TYPE SHARED) + endif() # MSVC and other platforms want to have # the headers included as source files # for better dependency detection - ADD_LIBRARY( - ${LIBRARY_NAME} - ${LINK_TYPE} - ${PARSE_HEADERS} - ${PARSE_SOURCES} - ) + 
add_library(${LIBRARY_NAME} ${LINK_TYPE} ${PARSE_HEADERS} ${PARSE_SOURCES}) - IF(PARSE_SHARED OR BUILD_SHARED_LIBS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES - VERSION ${Kokkos_VERSION} - SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + if(PARSE_SHARED OR BUILD_SHARED_LIBS) + set_target_properties( + ${LIBRARY_NAME} PROPERTIES VERSION ${Kokkos_VERSION} SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} ) - ENDIF() + endif() - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) + kokkos_internal_add_library_install(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name #that matches the install Kokkos:: name - ADD_LIBRARY(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "ADD_BUILD_OPTIONS" - "" - "HEADERS" - ${ARGN} - ) - IF (KOKKOS_HAS_TRILINOS) - # We do not pass headers to trilinos. They would get installed - # to the default include folder, but we want headers installed - # preserving the directory structure, e.g. 
impl - # If headers got installed in both locations, it breaks some - # downstream packages - TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} - ADDED_LIB_TARGET_NAME_OUT ${LIBRARY_NAME}_TARGET_NAME ) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${${LIBRARY_NAME}_TARGET_NAME}) - ENDIF() - ELSE() - # Forward the headers, we want to know about all headers - # to make sure they appear correctly in IDEs - KOKKOS_INTERNAL_ADD_LIBRARY( - ${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME}) - ENDIF() - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) - ELSE() - ADD_LIBRARY(${NAME} INTERFACE) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - FOREACH(DIR ${ARGN}) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} $) - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(KOKKOS_LIB_COMPILE_OPTIONS TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_ADD_TEST_DIRECTORIES) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_TEST_DIRECTORIES(${ARGN}) - ELSE() - IF(KOKKOS_ENABLE_TESTS) - FOREACH(TEST_DIR ${ARGN}) - ADD_SUBDIRECTORY(${TEST_DIR}) - ENDFOREACH() - ENDIF() - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_EXAMPLE_DIRECTORIES) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXAMPLE_DIRECTORIES(${ARGN}) - else() - IF(KOKKOS_ENABLE_EXAMPLES) - FOREACH(EXAMPLE_DIR ${ARGN}) - ADD_SUBDIRECTORY(${EXAMPLE_DIR}) - ENDFOREACH() - ENDIF() + add_library(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) +endfunction() + +function(KOKKOS_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "ADD_BUILD_OPTIONS" "" "HEADERS" ${ARGN}) + # Forward the headers, we want to know 
about all headers + # to make sure they appear correctly in IDEs + kokkos_internal_add_library(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) + if(PARSE_ADD_BUILD_OPTIONS) + kokkos_set_library_properties(${LIBRARY_NAME}) + endif() +endfunction() + +function(KOKKOS_ADD_INTERFACE_LIBRARY NAME) + add_library(${NAME} INTERFACE) + kokkos_internal_add_library_install(${NAME}) +endfunction() + +function(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + foreach(DIR ${ARGN}) + target_include_directories(${TARGET} ${INCTYPE} $) + endforeach() +endfunction() + +function(KOKKOS_LIB_COMPILE_OPTIONS TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + target_compile_options(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) +endfunction() + +macro(KOKKOS_ADD_TEST_DIRECTORIES) + if(KOKKOS_ENABLE_TESTS) + foreach(TEST_DIR ${ARGN}) + add_subdirectory(${TEST_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_EXAMPLE_DIRECTORIES) + if(KOKKOS_ENABLE_EXAMPLES) + foreach(EXAMPLE_DIR ${ARGN}) + add_subdirectory(${EXAMPLE_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_BENCHMARK_DIRECTORIES) + if(KOKKOS_ENABLE_BENCHMARKS) + foreach(BENCHMARK_DIR ${ARGN}) + add_subdirectory(${BENCHMARK_DIR}) + endforeach() endif() -ENDMACRO() - -MACRO(KOKKOS_ADD_BENCHMARK_DIRECTORIES) - IF(KOKKOS_ENABLE_BENCHMARKS) - FOREACH(BENCHMARK_DIR ${ARGN}) - ADD_SUBDIRECTORY(${BENCHMARK_DIR}) - ENDFOREACH() - ENDIF() -ENDMACRO() +endmacro() diff --git a/lib/kokkos/cmake/msvc.cmake b/lib/kokkos/cmake/msvc.cmake index 85421bdbaaa..1de13585c73 100644 --- a/lib/kokkos/cmake/msvc.cmake +++ b/lib/kokkos/cmake/msvc.cmake @@ -1,11 +1,9 @@ - -FUNCTION(kokkos_set_msvc_flags full_standard int_standard) - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" 
PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - +function(kokkos_set_msvc_flags full_standard int_standard) + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + endif() +endfunction() diff --git a/lib/kokkos/cmake/pgi.cmake b/lib/kokkos/cmake/pgi.cmake index e98e8495588..45f59dcd10b 100644 --- a/lib/kokkos/cmake/pgi.cmake +++ b/lib/kokkos/cmake/pgi.cmake @@ -1,8 +1,6 @@ - function(kokkos_set_pgi_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake index 4e05d225348..52d8368d041 100644 --- a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. 
# @@ -26,8 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) - +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake index 3d5b03805d4..f51bce5d64d 100644 --- a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -15,29 +15,26 @@ # ************************************************************************ # @HEADER -SET(USE_THREADS FALSE) +set(USE_THREADS FALSE) -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. 
- IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake index 8560ec60f1b..b449f45135a 100644 --- a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +tribits_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/lib/kokkos/containers/CMakeLists.txt b/lib/kokkos/containers/CMakeLists.txt index 0857d7007b4..8ee8bb41a28 100644 --- a/lib/kokkos/containers/CMakeLists.txt +++ b/lib/kokkos/containers/CMakeLists.txt @@ -1,9 +1,9 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT KOKKOS_ENABLE_OPENACC) -KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) 
-KOKKOS_ADD_TEST_DIRECTORIES(performance_tests) -ENDIF() +if(NOT KOKKOS_ENABLE_OPENACC) + kokkos_add_test_directories(unit_tests) + kokkos_add_test_directories(performance_tests) +endif() diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index e325e45e85d..8d4d605b087 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -1,7 +1,6 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) string(TOUPPER ${Tag} DEVICE) @@ -10,14 +9,8 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) if(Kokkos_ENABLE_${DEVICE}) message(STATUS "Sources Test${Tag}.cpp") - set(SOURCES - TestMain.cpp - Test${Tag}.cpp - ) + set(SOURCES TestMain.cpp Test${Tag}.cpp) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - ContainersPerformanceTest_${Tag} - SOURCES ${SOURCES} - ) + kokkos_add_executable_and_test(ContainersPerformanceTest_${Tag} SOURCES ${SOURCES}) endif() endforeach() diff --git a/lib/kokkos/containers/performance_tests/TestScatterView.hpp b/lib/kokkos/containers/performance_tests/TestScatterView.hpp index a74f833b9f5..953b8bff6e5 100644 --- a/lib/kokkos/containers/performance_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/performance_tests/TestScatterView.hpp @@ -25,8 +25,8 @@ namespace Perf { template void test_scatter_view(int m, int n) { - Kokkos::View original_view("original_view", - n); + Kokkos::View original_view("original_view", + n); { auto scatter_view = Kokkos::Experimental::create_scatter_view< 
Kokkos::Experimental::ScatterSum, Duplication, Contribution>( @@ -40,8 +40,8 @@ void test_scatter_view(int m, int n) { { auto num_threads = unique_token.size(); std::cout << "num_threads " << num_threads << '\n'; - Kokkos::View - hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n); + Kokkos::View hand_coded_duplicate_view( + "hand_coded_duplicate", num_threads, n); auto f2 = KOKKOS_LAMBDA(int i) { auto thread_id = unique_token.acquire(); for (int j = 0; j < 10; ++j) { diff --git a/lib/kokkos/containers/src/CMakeLists.txt b/lib/kokkos/containers/src/CMakeLists.txt index b7d85ebf11d..b386fbe6750 100644 --- a/lib/kokkos/containers/src/CMakeLists.txt +++ b/lib/kokkos/containers/src/CMakeLists.txt @@ -1,33 +1,27 @@ #need these here for now -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -SET(KOKKOS_CONTAINERS_SRCS) -APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CONTAINER_HEADERS) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +set(KOKKOS_CONTAINERS_SRCS) +append_glob(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CONTAINER_HEADERS) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) - -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) -KOKKOS_ADD_LIBRARY( - kokkoscontainers - SOURCES ${KOKKOS_CONTAINERS_SRCS} - HEADERS ${KOKKOS_CONTAINERS_HEADERS} -) +kokkos_add_library(kokkoscontainers SOURCES ${KOKKOS_CONTAINERS_SRCS} 
HEADERS ${KOKKOS_CONTAINERS_HEADERS}) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) +kokkos_link_internal_library(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp index f50ab0a0f7e..409260f0218 100644 --- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -271,7 +271,7 @@ class Bitset { offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; - block = Impl::rotate_right(block, offset); + block = Impl::rotate_right(block, offset); return (((!(scan_direction & BIT_SCAN_REVERSE) ? 
Impl::bit_scan_forward(block) : Impl::int_log2(block)) + diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index a37a2bdcebd..6a2e6f73a15 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -275,14 +275,29 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : modified_flags(t_modified_flags("DualView::modified_flags")), - d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) { - // without UVM, host View mirrors - if constexpr (Kokkos::Impl::has_type::value) - h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); - else - h_view = Kokkos::create_mirror_view(d_view); + : modified_flags(t_modified_flags("DualView::modified_flags")) { + if constexpr (Impl::ViewCtorProp::sequential_host_init) { + h_view = t_host(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + static_assert(Impl::ViewCtorProp::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!Impl::ViewCtorProp::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); + + d_view = Kokkos::create_mirror_view_and_copy( + typename traits::memory_space{}, h_view); + } else { + d_view = t_dev(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + + // without UVM, host View mirrors + if constexpr (Kokkos::Impl::has_type::value) + h_view = + Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); + else + h_view = Kokkos::create_mirror_view(d_view); + } } //! 
Copy constructor (shallow copy) @@ -338,23 +353,21 @@ class DualView : public ViewTraits { // does the DualView have only one device struct impl_dualview_is_single_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; // does the given device match the device of t_dev? template struct impl_device_matches_tdev_device { - enum : bool { - value = std::is_same::value - }; + enum : bool { value = std::is_same_v }; }; // does the given device match the device of t_host? template struct impl_device_matches_thost_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -362,7 +375,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_thost_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -370,7 +383,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -378,8 +391,8 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_memory_space { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -389,11 +402,6 @@ class DualView : public ViewTraits { /// \brief Return a View on a specific device \c Device. /// - /// Please don't be afraid of the nested if_c expressions in the return - /// value's type. That just tells the method what the return type - /// should be: t_dev if the \c Device template parameter matches - /// this DualView's device type, else t_host. 
- /// /// For example, suppose you create a DualView on Cuda, like this: /// \code /// using dual_view_type = @@ -410,56 +418,47 @@ class DualView : public ViewTraits { /// typename dual_view_type::t_host hostView = DV.view (); /// \endcode template - KOKKOS_INLINE_FUNCTION const typename std::conditional_t< - impl_device_matches_tdev_device::value, t_dev, - typename std::conditional_t< - impl_device_matches_thost_device::value, t_host, - typename std::conditional_t< - impl_device_matches_thost_exec::value, t_host, - typename std::conditional_t< - impl_device_matches_tdev_exec::value, t_dev, - typename std::conditional_t< - impl_device_matches_tdev_memory_space::value, - t_dev, t_host>>>>> - view() const { - constexpr bool device_is_memspace = - std::is_same::value; - constexpr bool device_is_execspace = - std::is_same::value; - constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; - constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; - constexpr bool device_exec_is_t_host_exec = - std::is_same::value; - constexpr bool device_mem_is_t_host_mem = - std::is_same::value; - constexpr bool device_is_t_host_device = - std::is_same::value; - constexpr bool device_is_t_dev_device = - std::is_same::value; - - static_assert( - device_is_t_dev_device || device_is_t_host_device || - (device_is_memspace && - (device_mem_is_t_dev_mem || device_mem_is_t_host_mem)) || - (device_is_execspace && - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec)) || - ((!device_is_execspace && !device_is_memspace) && - ((device_mem_is_t_dev_mem || device_mem_is_t_host_mem) || - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))), - "Template parameter to .view() must exactly match one of the " - "DualView's device types or one of the execution or memory spaces"); - - return Impl::if_c::value, - t_dev, t_host>::select(d_view, h_view); + KOKKOS_FUNCTION auto view() const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } 
else { + static_assert(std::is_same_v, + "The template argument is a memory space but doesn't " + "match either of DualView's memory spaces!"); + return h_view; + } + } else { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } else { + static_assert(std::is_same_v, + "The template argument is an execution space but " + "doesn't match either of DualView's execution spaces!"); + return h_view; + } + } else { + static_assert(std::is_same_v, + "The template argument is neither a memory space, " + "execution space, or device!"); + if constexpr (std::is_same_v) + return d_view; + else { + static_assert(std::is_same_v, + "The template argument is a device but " + "doesn't match either of DualView's devices!"); + return h_view; + } + } + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } KOKKOS_INLINE_FUNCTION @@ -475,27 +474,27 @@ class DualView : public ViewTraits { template static int get_device_side() { constexpr bool device_is_memspace = - std::is_same::value; + std::is_same_v; constexpr bool device_is_execspace = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_host_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_host_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_host_device = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_dev_device = - std::is_same::value; + std::is_same_v; static_assert( device_is_t_dev_device || device_is_t_host_device || @@ -627,9 +626,9 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}); } @@ -637,9 +636,9 @@ class DualView : public ViewTraits { template void 
sync(const ExecutionSpace& exec, const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}, exec); } @@ -669,18 +668,18 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}); } template void sync(const ExecutionSpace& exec, const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}, exec); } @@ -943,12 +942,21 @@ class DualView : public ViewTraits { Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); if (sizeMismatch) { - ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if constexpr (alloc_prop_input::initialize) { - h_view = create_mirror_view(typename t_host::memory_space(), d_view); + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + ::Kokkos::realloc(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); } else { - h_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_host::memory_space(), d_view); + ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); + if constexpr (alloc_prop_input::initialize) { + h_view = create_mirror_view(typename t_host::memory_space(), d_view); + } else { + h_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_host::memory_space(), d_view); + } } } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { @@ -1062,9 +1070,22 @@ class DualView : public ViewTraits { } }; - constexpr bool has_execution_space = 
alloc_prop_input::has_execution_space; + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!alloc_prop_input::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); - if constexpr (has_execution_space) { + if (sizeMismatch) { + sync(); + ::Kokkos::resize(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); + } + return; + } else if constexpr (alloc_prop_input::has_execution_space) { using ExecSpace = typename alloc_prop_input::execution_space; const auto& exec_space = Impl::get_property(arg_prop); @@ -1182,15 +1203,15 @@ class DualView : public ViewTraits { } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> extent(const iType& r) const { return d_view.extent(r); } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> extent_int(const iType& r) const { return static_cast(d_view.extent(r)); } diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 5f7fcaf69e7..2f2f4433e7c 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -38,6 +38,23 @@ class DynRankView; // forward declare namespace Impl { +template +struct ViewDataTypeFromRank { + using type = typename ViewDataTypeFromRank::type*; +}; + +template +struct ViewDataTypeFromRank { + using type = T; +}; + +template +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( + DynRankView v, + std::enable_if_t::specialize, + void>>* = nullptr); + template struct DynRankDimTraits { enum : 
size_t { unspecified = KOKKOS_INVALID_INDEX }; @@ -91,54 +108,59 @@ struct DynRankDimTraits { } // Create the layout for the rank-7 view. + // Because the underlying View is rank-7, preserve "unspecified" for + // dimension 8. + // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.dimension[7] != unspecified ? layout.dimension[7] : 1); + Layout new_layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified); + new_layout.stride = layout.stride; + return new_layout; } // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value), Layout> + (std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.stride[0], - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.stride[1], - layout.dimension[2] != unspecified ? 
layout.dimension[2] : 1, - layout.stride[2], - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.stride[3], - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.stride[4], - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.stride[5], - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.stride[6], - layout.dimension[7] != unspecified ? layout.dimension[7] : 1, - layout.stride[7]); + return Layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.stride[0], + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.stride[1], + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.stride[2], + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.stride[3], + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.stride[4], + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.stride[5], + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.stride[6], + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified, + layout.stride[7]); } // Extra overload to match that for specialize types template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), typename Traits::array_layout> createLayout(const Kokkos::Impl::ViewCtorProp& /* prop */, const typename Traits::array_layout& layout) { @@ -164,9 +186,8 @@ struct DynRankDimTraits { // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value) && - std::is_integral::value, + (std::is_same_v || + std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? 
layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -182,8 +203,7 @@ reconstructLayout(const Layout& layout, iType dynrank) { // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value) && - std::is_integral::value, + (std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -284,40 +304,43 @@ namespace Impl { template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && - std::is_void::value && - (std::is_same::value || - ((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value)))), - Kokkos::Impl::ViewToDynRankViewTag>> { + std::enable_if_t< + (std::is_same_v && + std::is_void_v && + std::is_void_v && + (std::is_same_v || + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>)))), + Kokkos::Impl::ViewToDynRankViewTag>> { private: enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { is_assignable_layout = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; public: @@ -345,7 +368,7 @@ class ViewMapping< src.layout()); // Check this for integer input1 for padding, etc dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle::assign( src.m_map.m_impl_handle, src.m_track.m_tracker); - dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); + dst.m_track.m_tracker.assign(src.m_track.m_tracker, DstTraits::is_managed); dst.m_rank = Kokkos::View::rank(); } }; @@ -378,10 +401,11 @@ struct is_dyn_rank_view> : public 
std::true_type { template inline constexpr bool is_dyn_rank_view_v = is_dyn_rank_view::value; +// Inherit privately from View, this way we don't import anything funky +// for example the rank member vs the rank() function of DynRankView template -class DynRankView : public ViewTraits { - static_assert(!std::is_array::value && - !std::is_pointer::value, +class DynRankView : private View { + static_assert(!std::is_array_v && !std::is_pointer_v, "Cannot template DynRankView with array or pointer datatype - " "must be pod"); @@ -391,28 +415,66 @@ class DynRankView : public ViewTraits { template friend class Kokkos::Impl::ViewMapping; + size_t m_rank{}; + public: using drvtraits = ViewTraits; using view_type = View; - using traits = ViewTraits; - private: - using map_type = - Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; - - track_type m_track; - map_type m_map; - unsigned m_rank; + using drdtraits = Impl::DynRankDimTraits; public: - KOKKOS_INLINE_FUNCTION + // typedefs from ViewTraits, overriden + using data_type = typename drvtraits::data_type; + using const_data_type = typename drvtraits::const_data_type; + using non_const_data_type = typename drvtraits::non_const_data_type; + + // typedefs from ViewTraits not overriden + using value_type = typename view_type::value_type; + using const_value_type = typename view_type::const_value_type; + using non_const_value_type = typename view_type::non_const_value_type; + using traits = typename view_type::traits; + using array_layout = typename view_type::array_layout; + + using execution_space = typename view_type::execution_space; + using memory_space = typename view_type::memory_space; + using device_type = typename view_type::device_type; + + using memory_traits = typename view_type::memory_traits; + using host_mirror_space = typename view_type::host_mirror_space; + using size_type = typename view_type::size_type; + + using reference_type = typename view_type::reference_type; + using 
pointer_type = typename view_type::pointer_type; + + using scalar_array_type = value_type; + using const_scalar_array_type = const_value_type; + using non_const_scalar_array_type = non_const_value_type; + using specialize = typename view_type::specialize; + + // typedefs in View for mdspan compatibility + // cause issues with MSVC+CUDA + // using layout_type = typename view_type::layout_type; + using index_type = typename view_type::index_type; + using element_type = typename view_type::element_type; + using rank_type = typename view_type::rank_type; + using reference = reference_type; + using data_handle_type = pointer_type; + + KOKKOS_FUNCTION view_type& DownCast() const { return (view_type&)(*this); } - KOKKOS_INLINE_FUNCTION + + // FIXME: this function make NO sense, the above one already is marked const + // Maybe one would want to get back a view of const?? + KOKKOS_FUNCTION const view_type& ConstDownCast() const { return (const view_type&)(*this); } + // FIXME: deprecate DownCast in favor of to_view + // KOKKOS_FUNCTION + // view_type to_view() const { return *this; } + // Types below - at least the HostMirror requires the value_type, NOT the rank // 7 data_type of the traits @@ -436,113 +498,32 @@ class DynRankView : public ViewTraits { typename drvtraits::array_layout, typename drvtraits::host_mirror_space>; + using host_mirror_type = HostMirror; //---------------------------------------- // Domain rank and extents // enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the // enum? 
- template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() const; - //---------------------------------------- /* Deprecate all 'dimension' functions in favor of * ISO/C++ vocabulary 'extent'. */ - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) * - m_map.extent(3) * m_map.extent(4) * m_map.extent(5) * - m_map.extent(6) * m_map.extent(7); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return (m_map.data() != nullptr); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - //---------------------------------------- private: enum { is_layout_left = - std::is_same::value, + std::is_same_v, is_layout_right = - std::is_same::value, + std::is_same_v, - is_layout_stride = std::is_same::value, + is_layout_stride = + std::is_same_v, - is_default_map = std::is_void::value && + is_default_map = std::is_void_v && (is_layout_left || is_layout_right || is_layout_stride) }; @@ -570,476 +551,150 @@ class DynRankView : public ViewTraits { #endif public: - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION constexpr unsigned rank() const { return m_rank; } - // operators () - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type operator()() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding...) 
- template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // Phalanx is violating this, since they use the operator to access ALL - // elements in the allocation KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , - // this->rank(), m_track, m_map) ) - return data()[i0]; - } - - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding... AND a Trilinos/Sacado scalar type ) - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - !std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // auto map = impl_map(); - const size_t dim_scalar = m_map.dimension_scalar(); - const size_t bytes = this->span() / dim_scalar; - - using tmp_view_type = Kokkos::View< - DataType*, typename traits::array_layout, typename traits::device_type, - Kokkos::MemoryTraits>; - tmp_view_type rankone_view(this->data(), bytes, dim_scalar); - return rankone_view(i0); - } - - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value 
&& - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - 
std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& 
i3, const iType4& i4, const iType5& i5, - const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type access() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, 
m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, 
i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); + using view_type::data; + using view_type::extent; + using view_type::extent_int; // FIXME: not tested + using view_type::impl_map; // FIXME: not tested + using view_type::is_allocated; + using view_type::label; + using view_type::size; + using view_type::span; + using view_type::span_is_contiguous; // FIXME: not tested + using view_type::stride; // FIXME: not tested + using view_type::stride_0; // FIXME: not tested + using view_type::stride_1; // FIXME: not tested + using view_type::stride_2; // FIXME: not tested + using view_type::stride_3; // FIXME: not tested + using view_type::stride_4; // FIXME: not tested + using view_type::stride_5; // FIXME: not tested + using view_type::stride_6; // FIXME: not tested + using view_type::stride_7; // FIXME: not tested + using view_type::use_count; + + KOKKOS_FUNCTION reference_type + operator()(index_type i0 = 0, index_type i1 = 0, index_type i2 = 0, + index_type i3 = 0, index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); + } + +// This is an accomodation for Phalanx, that is usint the operator[] to access +// all elements in a linear fashion even when the rank is not 1 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { + if constexpr (std::is_same_v) { + return view_type::data()[i0]; + } 
else { + const size_t dim_scalar = view_type::impl_map().dimension_scalar(); + const size_t bytes = view_type::span() / dim_scalar; + + using tmp_view_type = + Kokkos::View>; + tmp_view_type rankone_view(view_type::data(), bytes, dim_scalar); + return rankone_view(i0); + } } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); +#else + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 1u) + Kokkos::abort("DynRankView operator[] can only be used for rank-1"); +#endif + return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); } +#endif - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5, const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + KOKKOS_FUNCTION reference_type access(index_type i0 = 0, index_type i1 = 0, + index_type i2 = 0, index_type i3 = 0, + index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - //---------------------------------------- // Standard constructor, destructor, and assignment operators... 
KOKKOS_DEFAULTED_FUNCTION ~DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView() : m_track(), m_map(), m_rank() {} // Default ctor + KOKKOS_DEFAULTED_FUNCTION DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} - - KOKKOS_INLINE_FUNCTION - DynRankView(DynRankView&& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + // Make this conditionally explicit? + template + KOKKOS_FUNCTION DynRankView(const DynRankView& rhs) + : view_type(rhs), m_rank(rhs.m_rank) {} - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(const DynRankView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; + template + KOKKOS_FUNCTION DynRankView& operator=(const DynRankView& rhs) { + view_type::operator=(rhs); + m_rank = rhs.m_rank; return *this; } - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(DynRankView&& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; - return *this; +#if 0 // TODO: this will later be swapped in depending on whether the new View + // impl is active + private: + template + KOKKOS_FUNCTION typename view_type::extents_type create_rank7_extents( + const Ext& ext) { + return typename view_type::extents_type( + ext.rank() > 0 ? ext.extent(0) : 1, ext.rank() > 1 ? ext.extent(1) : 1, + ext.rank() > 2 ? ext.extent(2) : 1, ext.rank() > 3 ? ext.extent(3) : 1, + ext.rank() > 4 ? ext.extent(4) : 1, ext.rank() > 5 ? ext.extent(5) : 1, + ext.rank() > 6 ? ext.extent(6) : 1); } - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. 
+ public: + // Copy/Assign View to DynRankView template - KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); + KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs, + size_t new_rank) + : view_type(rhs.data_handle(), drdtraits::createLayout(rhs.layout())), + m_rank(new_rank) { + if (new_rank > rhs.rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); } template - KOKKOS_INLINE_FUNCTION DynRankView& operator=( - const DynRankView& rhs) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); - m_track.assign(rhs.m_track, traits::is_managed); + KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + view_type::operator=(view_type( + rhs.data_handle(), + typename view_type::mapping_type(create_rank7_extents(rhs.extents())), + rhs.accessor())); m_rank = rhs.rank(); return *this; } - - // Copy/Assign View to DynRankView +#else template - KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs) - : m_track(), m_map(), m_rank(View::rank()) { + KOKKOS_FUNCTION DynRankView(const View& rhs, size_t new_rank) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping; static_assert(Mapping::is_assignable, - "Incompatible View to DynRankView copy construction"); + "Incompatible View to DynRankView copy assignment"); + if (new_rank > View::rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); 
Mapping::assign(*this, rhs); + m_rank = new_rank; } template - KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + KOKKOS_FUNCTION DynRankView& operator=(const View& rhs) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping { static_assert(Mapping::is_assignable, "Incompatible View to DynRankView copy assignment"); Mapping::assign(*this, rhs); + m_rank = View::rank(); return *this; } +#endif + + template + KOKKOS_FUNCTION DynRankView(const View& rhs) + : DynRankView(rhs, View::rank()) {} //---------------------------------------- // Allocation tracking properties - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.use_count(); } - - inline const std::string label() const { - return m_track.template get_label(); - } - //---------------------------------------- // Allocation according to allocation properties and array layout // unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that // rank deduction can properly take place + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), - m_map(), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing DynRankView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout), - Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - } + std::enable_if_t::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} - // Wrappers template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout)), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing DynRankView to wrap user memory must supply matching " - "pointer type"); - } + std::enable_if_t::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} //---------------------------------------- // Constructor(s) // Simple dimension-only layout + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, 
- const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} @@ -1188,16 +809,20 @@ class DynRankView : public ViewTraits { //---------------------------------------- // Memory span required to wrap these dimensions. 
+ // FIXME: this function needs to be tested static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, + [[maybe_unused]] const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + // FIXME: check that arg_N7 is not set by user (in debug mode) + return view_type::required_allocation_size(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_INVALID_INDEX, + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, const size_t arg_N2 = KOKKOS_INVALID_INDEX, const size_t arg_N3 = KOKKOS_INVALID_INDEX, @@ -1205,55 +830,38 @@ class DynRankView : public ViewTraits { const size_t arg_N5 = KOKKOS_INVALID_INDEX, const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), arg_N0, - arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} + : DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, typename traits::array_layout& arg_layout) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), - arg_layout) {} + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + typename traits::array_layout& arg_layout) + : 
DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_layout) {} //---------------------------------------- // Shared scratch memory constructor - static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - const size_t num_passed_args = - (arg_N0 != KOKKOS_INVALID_INDEX) + (arg_N1 != KOKKOS_INVALID_INDEX) + - (arg_N2 != KOKKOS_INVALID_INDEX) + (arg_N3 != KOKKOS_INVALID_INDEX) + - (arg_N4 != KOKKOS_INVALID_INDEX) + (arg_N5 != KOKKOS_INVALID_INDEX) + - (arg_N6 != KOKKOS_INVALID_INDEX) + (arg_N7 != KOKKOS_INVALID_INDEX); - - if (std::is_void::value && - num_passed_args != traits::rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - {} - - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + // Note: We must pass 7 valid args since view_type is rank 7 + static inline size_t shmem_size( + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + return view_type::shmem_size(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, + arg_N6, arg_N7); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const typename traits::array_layout& arg_layout) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(arg_layout) // is this correct? 
- )))), - arg_layout) {} + : view_type(arg_space, drdtraits::createLayout(arg_layout)), + m_rank(drdtraits::computeRank(arg_layout)) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, @@ -1264,21 +872,38 @@ class DynRankView : public ViewTraits { const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, - arg_N6, arg_N7)))))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) {} + : DynRankView(arg_space, typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, + arg_N5, arg_N6, arg_N7)) {} + + KOKKOS_FUNCTION constexpr auto layout() const { + switch (rank()) { + case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); + case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); + case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); + case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); + case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); + case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); + case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); + case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); + default: + KOKKOS_IF_ON_HOST( + Kokkos::abort( + std::string( + "Calling DynRankView::layout on DRV of unexpected rank " + + std::to_string(rank())) + .c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "Calling DynRankView::layout on DRV of unexpected rank");) + } + // control flow should never reach here + return view_type::layout(); + } }; template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank( - 
const DynRankView& DRV) { +KOKKOS_FUNCTION constexpr unsigned rank(const DynRankView& DRV) { return DRV.rank(); } // needed for transition to common constexpr method in view and dynrankview // to return rank @@ -1293,181 +918,46 @@ struct DynRankSubviewTag {}; } // namespace Impl -namespace Impl { - -template -class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value)), - Kokkos::Impl::DynRankSubviewTag>, - SrcTraits, Args...> { - private: - enum { - RZ = false, - R0 = bool(is_integral_extent<0, Args...>::value), - R1 = bool(is_integral_extent<1, Args...>::value), - R2 = bool(is_integral_extent<2, Args...>::value), - R3 = bool(is_integral_extent<3, Args...>::value), - R4 = bool(is_integral_extent<4, Args...>::value), - R5 = bool(is_integral_extent<5, Args...>::value), - R6 = bool(is_integral_extent<6, Args...>::value) - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) - }; - - using array_layout = Kokkos::LayoutStride; - - using value_type = typename SrcTraits::value_type; - - using data_type = value_type*******; - - public: - using traits_type = Kokkos::ViewTraits; - - using type = - Kokkos::View; - - template - struct apply { - static_assert(Kokkos::is_memory_traits::value); - - using traits_type = - Kokkos::ViewTraits; - - using type = Kokkos::View; - }; - - using dimension = typename SrcTraits::dimension; - - template - struct ExtentGenerator { - KOKKOS_INLINE_FUNCTION - static SubviewExtents<7, rank> generator( - const dimension& dim, Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), - Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), - Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6()) { - return SubviewExtents<7, rank>(dim, arg0, arg1, arg2, arg3, arg4, arg5, - arg6); - } - }; - - using ret_type = Kokkos::DynRankView; - - template - KOKKOS_INLINE_FUNCTION static ret_type subview( - const unsigned src_rank, 
Kokkos::DynRankView const& src, - Args... args) { - using DstType = ViewMapping; - - using DstDimType = std::conditional_t< - (rank == 0), ViewDimension<>, - std::conditional_t< - (rank == 1), ViewDimension<0>, - std::conditional_t< - (rank == 2), ViewDimension<0, 0>, - std::conditional_t< - (rank == 3), ViewDimension<0, 0, 0>, - std::conditional_t< - (rank == 4), ViewDimension<0, 0, 0, 0>, - std::conditional_t< - (rank == 5), ViewDimension<0, 0, 0, 0, 0>, - std::conditional_t< - (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>, - ViewDimension<0, 0, 0, 0, 0, 0, 0>>>>>>>>; - - using dst_offset_type = ViewOffset; - using dst_handle_type = typename DstType::handle_type; - - ret_type dst; - - const SubviewExtents<7, rank> extents = ExtentGenerator::generator( - src.m_map.m_impl_offset.m_dim, args...); - - dst_offset_type tempdst(src.m_map.m_impl_offset, extents); - - dst.m_track = src.m_track; - - dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0; - dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1; - dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2; - dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3; - dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4; - dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5; - dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6; - - dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0; - dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1; - dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2; - dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3; - dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4; - dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5; - dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6; - - dst.m_map.m_impl_handle = - dst_handle_type(src.m_map.m_impl_handle + - src.m_map.m_impl_offset( - extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), extents.domain_offset(5), - 
extents.domain_offset(6))); - - dst.m_rank = - (src_rank > 0 ? unsigned(R0) : 0) + (src_rank > 1 ? unsigned(R1) : 0) + - (src_rank > 2 ? unsigned(R2) : 0) + (src_rank > 3 ? unsigned(R3) : 0) + - (src_rank > 4 ? unsigned(R4) : 0) + (src_rank > 5 ? unsigned(R5) : 0) + - (src_rank > 6 ? unsigned(R6) : 0); - - return dst; - } -}; - -} // namespace Impl - template using Subdynrankview = typename Kokkos::Impl::ViewMapping::ret_type; -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subdynrankview(const Kokkos::DynRankView& src, Args... args) { - if (src.rank() > sizeof...(Args)) // allow sizeof...(Args) >= src.rank(), - // ignore the remaining args - { - Kokkos::abort( - "subdynrankview: num of args must be >= rank of the source " - "DynRankView"); - } - - using metafcn = - Kokkos::Impl::ViewMapping, Args...>; - - return metafcn::subview(src.rank(), src, args...); +template +KOKKOS_INLINE_FUNCTION auto subdynrankview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + auto sub = subview(drv.DownCast(), arg0, arg1, arg2, arg3, arg4, arg5, arg6); + using sub_t = decltype(sub); + size_t new_rank = (drv.rank() > 0 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 1 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 2 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 3 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 4 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 5 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 6 && !std::is_integral_v ? 1 : 0); + + using return_type = + DynRankView; + return static_cast( + DynRankView( + sub, new_rank)); } - -// Wrapper to allow subview function name -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subview(const Kokkos::DynRankView& src, Args... 
args) { - return subdynrankview(src, args...); +template +KOKKOS_INLINE_FUNCTION auto subview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + return subdynrankview(drv, arg0, arg1, arg2, arg3, arg4, arg5, arg6); } } // namespace Kokkos @@ -1482,12 +972,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && @@ -1638,11 +1128,11 @@ namespace Impl { underlying memory, to facilitate implementation of deep_copy() and other routines that are defined on View */ template -KOKKOS_FUNCTION auto as_view_of_rank_n( +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( DynRankView v, - typename std::enable_if::specialize, void>::value>::type* = - nullptr) { + std::enable_if_t< + std::is_same_v::specialize, void>>*) { if (v.rank() != N) { KOKKOS_IF_ON_HOST( const std::string message = @@ -1653,7 +1143,7 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( Kokkos::abort("Converting DynRankView to a View of mis-matched rank!");) } - auto layout = v.impl_map().layout(); + auto layout = v.DownCast().layout(); if constexpr (std::is_same_v || std::is_same_v || @@ -1691,43 +1181,16 @@ void apply_to_view_of_static_rank(Function&& f, DynRankView a) { } // namespace Impl -template -KOKKOS_INLINE_FUNCTION constexpr auto DynRankView::layout() const -> - typename traits::array_layout { - switch (rank()) { - case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); - case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); - 
case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); - case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); - case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); - case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); - case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); - case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); - default: - KOKKOS_IF_ON_HOST( - Kokkos::abort( - std::string( - "Calling DynRankView::layout on DRV of unexpected rank " + - std::to_string(rank())) - .c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "Calling DynRankView::layout on DRV of unexpected rank");) - } - // control flow should never reach here - return m_map.layout(); -} - /** \brief Deep copy a value from Host memory into a view. */ template inline void deep_copy( const ExecSpace& e, const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); Impl::apply_to_view_of_static_rank( @@ -1738,8 +1201,8 @@ template inline void deep_copy( const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Impl::apply_to_view_of_static_rank([=](auto view) { deep_copy(view, value); }, dst); } @@ -1750,8 +1213,8 @@ inline void deep_copy( const ExecSpace& e, typename ViewTraits::non_const_value_type& dst, const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(e, dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1759,8 +1222,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const DynRankView& 
src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1773,15 +1236,13 @@ inline void deep_copy( template inline void deep_copy( const ExecSpace& exec_space, const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1826,15 +1287,13 @@ inline void deep_copy( template inline void deep_copy( const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1894,7 +1353,7 @@ struct MirrorDRViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1909,26 +1368,6 @@ struct MirrorDRViewType { std::conditional_t; }; -template -struct MirrorDRVType { - // The incoming view_type - using src_view_type = typename Kokkos::DynRankView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is 
the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::DynRankView; -}; - } // namespace Impl namespace Impl { @@ -1945,10 +1384,9 @@ inline auto create_mirror(const DynRankView& src, arg_prop, std::string(src.label()).append("_mirror")); if constexpr (Impl::ViewCtorProp::has_memory_space) { - using dst_type = typename Impl::MirrorDRVType< + using dst_type = typename Impl::MirrorDRViewType< typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - + P...>::dest_view_type; return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); } else { @@ -1989,7 +1427,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(const Space&, const Kokkos::DynRankView& src) { +inline auto create_mirror(const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } @@ -1999,8 +1438,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::DynRankView& src) { +inline auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -2026,12 +1465,12 @@ inline auto create_mirror_view( [[maybe_unused]] const typename Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename DynRankView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename DynRankView< - T, 
P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename DynRankView::data_type, + typename DynRankView::HostMirror::data_type>) { return typename DynRankView::HostMirror(src); } else { return Kokkos::Impl::choose_create_mirror(src, arg_prop); @@ -2102,7 +1541,7 @@ inline auto create_mirror_view( // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::DynRankView& src) { diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index a4b74e246e0..caae3f791f0 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -40,10 +40,10 @@ struct ChunkedArrayManager { using pointer_type = ValueType*; using track_type = Kokkos::Impl::SharedAllocationTracker; - ChunkedArrayManager() = default; - ChunkedArrayManager(ChunkedArrayManager const&) = default; - ChunkedArrayManager(ChunkedArrayManager&&) = default; - ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; + ChunkedArrayManager() = default; + ChunkedArrayManager(ChunkedArrayManager const&) = default; + ChunkedArrayManager(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; template @@ -129,10 +129,10 @@ struct ChunkedArrayManager { /// allocation template struct Destroy { - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; Destroy& operator=(const Destroy&) = default; Destroy(std::string label, value_type** arg_chunk, @@ 
-250,7 +250,7 @@ class DynamicView : public Kokkos::ViewTraits { // It is assumed that the value_type is trivially copyable; // when this is not the case, potential problems can occur. - static_assert(std::is_void::value, + static_assert(std::is_void_v, "DynamicView only implemented for non-specialized View type"); private: @@ -363,7 +363,7 @@ class DynamicView : public Kokkos::ViewTraits { enum { reference_type_is_lvalue_reference = - std::is_lvalue_reference::value + std::is_lvalue_reference_v }; KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { @@ -463,11 +463,11 @@ class DynamicView : public Kokkos::ViewTraits { //---------------------------------------------------------------------- - ~DynamicView() = default; - DynamicView() = default; - DynamicView(DynamicView&&) = default; - DynamicView(const DynamicView&) = default; - DynamicView& operator=(DynamicView&&) = default; + ~DynamicView() = default; + DynamicView() = default; + DynamicView(DynamicView&&) = default; + DynamicView(const DynamicView&) = default; + DynamicView& operator=(DynamicView&&) = default; DynamicView& operator=(const DynamicView&) = default; template @@ -572,7 +572,7 @@ struct MirrorDynamicViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -665,9 +665,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorDynamicViewType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::DynamicView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -693,14 +693,14 @@ inline auto create_mirror_view( const Kokkos::Experimental::DynamicView& src, [[maybe_unused]] const 
Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::DynamicView::HostMirror(src); } else { @@ -835,21 +835,17 @@ inline void deep_copy(const View& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. 
+ Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } template @@ -861,21 +857,17 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. 
+ Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } namespace Impl { @@ -964,7 +956,7 @@ struct ViewCopy, // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::Experimental::DynamicView& src) { diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 3adc70b1904..cf23c25b86b 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -50,9 +50,9 @@ inline constexpr bool is_offset_view_v = is_offset_view::value; #define KOKKOS_INVALID_INDEX_RANGE \ { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET } -template ::value && - std::is_signed::value, - iType> = 0> +template && std::is_signed_v, + iType> = 0> using IndexRange = Kokkos::Array; using index_list_type = std::initializer_list; @@ -118,11 +118,11 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( (enum {LEN = 1024}; char buffer[LEN]; const std::string label = tracker.template get_label(); int n = snprintf(buffer, LEN, - "OffsetView bounds error of view labeled %s (", - label.c_str()); + "OffsetView bounds error of view labeled %s (", + label.c_str()); offsetview_error_operator_bounds<0>(buffer + n, LEN - n, map, begins, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE( (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) @@ -180,44 +180,40 @@ void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank, } // namespace Impl template -class OffsetView : public ViewTraits { - public: - using traits = ViewTraits; - +class OffsetView : public View { private: template friend class OffsetView; - template - friend class View; // FIXME delete this line - template - friend class Kokkos::Impl::ViewMapping; - using map_type = 
Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; + using base_t = View; public: - enum { Rank = map_type::Rank }; - using begins_type = Kokkos::Array; + // typedefs to reduce typing base_t:: further down + using traits = typename base_t::traits; + // FIXME: should be base_t::index_type after refactor + using index_type = typename base_t::memory_space::size_type; + using pointer_type = typename base_t::pointer_type; + + using begins_type = Kokkos::Array; template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t begin(const iType local_dimension) const { - return local_dimension < Rank ? m_begins[local_dimension] - : KOKKOS_INVALID_OFFSET; + return static_cast(local_dimension) < base_t::rank() + ? m_begins[local_dimension] + : KOKKOS_INVALID_OFFSET; } KOKKOS_FUNCTION begins_type begins() const { return m_begins; } template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t end(const iType local_dimension) const { - return begin(local_dimension) + m_map.extent(local_dimension); + return begin(local_dimension) + base_t::extent(local_dimension); } private: - track_type m_track; - map_type m_map; begins_type m_begins; public: @@ -245,529 +241,60 @@ class OffsetView : public ViewTraits { typename traits::array_layout, typename traits::host_mirror_space>; - //---------------------------------------- - // Domain rank and extents - - /** \brief rank() to be implemented - */ - // KOKKOS_FUNCTION - // static - // constexpr unsigned rank() { return map_type::Rank; } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_FUNCTION constexpr typename traits::array_layout layout() const { - return m_map.layout(); - } - - KOKKOS_FUNCTION constexpr 
size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } - KOKKOS_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } - KOKKOS_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } - KOKKOS_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } - KOKKOS_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } - KOKKOS_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } - KOKKOS_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } - KOKKOS_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); + template + KOKKOS_FUNCTION typename base_t::reference_type offset_operator( + std::integer_sequence, OtherIndexTypes... indices) const { + return base_t::operator()((indices - m_begins[I])...); } - template - KOKKOS_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_FUNCTION constexpr pointer_type data() const { return m_map.data(); } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_FUNCTION - const Kokkos::Impl::ViewMapping& implementation_map() const { - return m_map; - } - - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ - ARG; - -#else - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); - + template +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)) #endif - public: - //------------------------------ - // Rank 0 operator() - - 
KOKKOS_FORCEINLINE_FUNCTION - reference_type operator()() const { return m_map.reference(); } - //------------------------------ - // Rank 1 operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - 
KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (2 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.reference(j0, j1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - if constexpr (is_layout_left) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; - else - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; - } else if constexpr (is_layout_right) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; - else - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + - j1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined(KOKKOS_COMPILER_INTEL) - __builtin_unreachable(); + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator[]( + const 
OtherIndexType& idx) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)); #endif + return base_t::operator[](idx - m_begins[0]); } - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.reference(j0, j1, j2); - } - - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, 
const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.reference(j0, j1, j2, j3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.reference(j0, j1, j2, j3, j4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = 
i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.reference(j0, j1, j2, j3, j4, j5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, 
m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map - .m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6, j7)]; + template +#ifndef KOKKOS_ENABLE_CXX17 + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())) +#endif + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator()( + OtherIndexTypes... indices) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert( + (std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && ...) 
&& + (sizeof...(OtherIndexTypes) == base_t::rank())); +#endif + return offset_operator(std::make_index_sequence(), + indices...); } - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6, j7); - } + template + KOKKOS_FUNCTION constexpr typename base_t::reference_type access( + OtherIndexTypes... args) const = delete; -#undef KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY + //---------------------------------------- //---------------------------------------- // Standard destructor, constructors, and assignment operators - KOKKOS_DEFAULTED_FUNCTION - ~OffsetView() = default; - KOKKOS_FUNCTION - OffsetView() : m_track(), m_map() { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = KOKKOS_INVALID_OFFSET; - } - - KOKKOS_FUNCTION - OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(rhs.m_map), - m_begins(rhs.m_begins) {} - - KOKKOS_FUNCTION - OffsetView(OffsetView&& rhs) - : m_track(std::move(rhs.m_track)), - m_map(std::move(rhs.m_map)), - m_begins(std::move(rhs.m_begins)) {} - - KOKKOS_FUNCTION - OffsetView& operator=(const OffsetView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_begins = rhs.m_begins; - return *this; - } - - KOKKOS_FUNCTION - OffsetView& operator=(OffsetView&& rhs) { - m_track = std::move(rhs.m_track); - m_map = std::move(rhs.m_map); 
- m_begins = std::move(rhs.m_begins); - return *this; + OffsetView() : base_t() { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = KOKKOS_INVALID_OFFSET; } // interoperability with View @@ -778,20 +305,10 @@ class OffsetView : public ViewTraits { public: KOKKOS_FUNCTION - view_type view() const { - view_type v(m_track, m_map); - return v; - } + view_type view() const { return *this; } template - KOKKOS_FUNCTION OffsetView(const View& aview) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - + KOKKOS_FUNCTION OffsetView(const View& aview) : base_t(aview) { for (size_t i = 0; i < View::rank(); ++i) { m_begins[i] = 0; } @@ -800,19 +317,14 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const index_list_type& minIndices) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(aview) { + KOKKOS_IF_ON_HOST( + (Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, aview.label());)) + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -820,27 +332,13 @@ class OffsetView : public 
ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const begins_type& beg) - : m_track(aview.impl_track()), m_map(), m_begins(beg) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - } + : base_t(aview), m_begins(beg) {} // may assign unmanaged from managed. template KOKKOS_FUNCTION OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(), - m_begins(rhs.m_begins) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); // swb what about assign? - } + : base_t(rhs.view()), m_begins(rhs.m_begins) {} private: enum class subtraction_failure { @@ -879,7 +377,7 @@ class OffsetView : public ViewTraits { static subtraction_failure runtime_check_begins_ends_host(const B& begins, const E& ends) { std::string message; - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) message += "begins.size() " "(" + @@ -887,19 +385,19 @@ class OffsetView : public ViewTraits { ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) message += "ends.size() " "(" + - std::to_string(begins.size()) + + std::to_string(ends.size()) + ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; @@ -941,7 +439,7 @@ class OffsetView : public ViewTraits { message = "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView\n" + message; - Kokkos::Impl::throw_runtime_exception(message); + Kokkos::abort(message.c_str()); } return subtraction_failure::none; @@ -951,11 +449,11 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION 
static subtraction_failure runtime_check_begins_ends_device( const B& begins, const E& ends) { - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: begins has bad Rank"); - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: ends has bad Rank"); @@ -993,20 +491,25 @@ class OffsetView : public ViewTraits { // Precondition: begins.size() == ends.size() == m_begins.size() == Rank template KOKKOS_FUNCTION OffsetView(const pointer_type& p, const B& begins_, - const E& ends_, - subtraction_failure) - : m_track() // no tracking - , - m_map(Kokkos::Impl::ViewCtorProp(p), - typename traits::array_layout( - Rank > 0 ? at(ends_, 0) - at(begins_, 0) : 0, - Rank > 1 ? at(ends_, 1) - at(begins_, 1) : 0, - Rank > 2 ? at(ends_, 2) - at(begins_, 2) : 0, - Rank > 3 ? at(ends_, 3) - at(begins_, 3) : 0, - Rank > 4 ? at(ends_, 4) - at(begins_, 4) : 0, - Rank > 5 ? at(ends_, 5) - at(begins_, 5) : 0, - Rank > 6 ? at(ends_, 6) - at(begins_, 6) : 0, - Rank > 7 ? at(ends_, 7) - at(begins_, 7) : 0)) { + const E& ends_, subtraction_failure) + : base_t(Kokkos::view_wrap(p), + typename traits::array_layout( + base_t::rank() > 0 ? at(ends_, 0) - at(begins_, 0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 1 ? at(ends_, 1) - at(begins_, 1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 2 ? at(ends_, 2) - at(begins_, 2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 3 ? at(ends_, 3) - at(begins_, 3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 4 ? at(ends_, 4) - at(begins_, 4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 5 ? at(ends_, 5) - at(begins_, 5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 6 ? at(ends_, 6) - at(begins_, 6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 7 ? 
at(ends_, 7) - at(begins_, 7) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG)) { for (size_t i = 0; i != m_begins.size(); ++i) { m_begins[i] = at(begins_, i); }; @@ -1040,15 +543,6 @@ class OffsetView : public ViewTraits { : OffsetView(p, begins_, ends_, runtime_check_begins_ends(begins_, ends_)) {} - //---------------------------------------- - // Allocation tracking properties - KOKKOS_FUNCTION - int use_count() const { return m_track.use_count(); } - - const std::string label() const { - return m_track.template get_label(); - } - // Choosing std::pair as type for the arguments allows constructing an // OffsetView using list initialization syntax, e.g., // OffsetView dummy("dummy", {-1, 3}, {-2,2}); @@ -1070,18 +564,34 @@ class OffsetView : public ViewTraits { const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE ) - : OffsetView( - Kokkos::Impl::ViewCtorProp(arg_label), - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(Kokkos::Impl::ViewCtorProp(arg_label), + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG - 1 + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit OffsetView( @@ -1094,18 +604,34 @@ class OffsetView : public ViewTraits { const std::pair range5 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range6 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE) - : OffsetView( - arg_prop, - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(arg_prop, + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit KOKKOS_FUNCTION OffsetView( @@ -1113,9 +639,14 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { + : base_t(arg_prop, arg_layout) { + KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, + base_t::label());)) + + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -1132,42 +663,9 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track(), - m_map() - - { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i]; - - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Kokkos::Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "OffsetView allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing OffsetView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, - Kokkos::Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(arg_prop, arg_layout) { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = minIndices.begin()[i]; } }; @@ -1177,7 +675,7 @@ class OffsetView : public ViewTraits { */ template KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { - return V.Rank; + return V.rank(); } // Temporary until added to view //---------------------------------------------------------------------------- @@ -1185,8 +683,8 @@ KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { namespace Impl { template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> -shift_input(const T arg, const int64_t offset) { +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> shift_input( + const T arg, const int64_t offset) { return arg - offset; } @@ -1197,13 +695,13 @@ Kokkos::ALL_t shift_input(const Kokkos::ALL_t arg, const int64_t /*offset*/) { template KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, Kokkos::pair> + std::enable_if_t, Kokkos::pair> shift_input(const Kokkos::pair arg, const int64_t offset) { return Kokkos::make_pair(arg.first - offset, arg.second - offset); } template -inline std::enable_if_t::value, std::pair> -shift_input(const std::pair arg, const int64_t offset) { +inline std::enable_if_t, std::pair> shift_input( + const std::pair arg, const int64_t offset) { return std::make_pair(arg.first - 
offset, arg.second - offset); } @@ -1212,7 +710,7 @@ KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin( const size_t i, Kokkos::Array& subviewBegins, std::enable_if_t shiftedArg, const Arg arg, const A viewBegins, size_t& counter) { - if (!std::is_integral::value) { + if (!std::is_integral_v) { subviewBegins[counter] = shiftedArg == arg ? viewBegins[i] : 0; counter++; } @@ -1621,7 +1119,7 @@ KOKKOS_INLINE_FUNCTION ViewTraits, Args...>::type>::type subview(const OffsetView& src, Args... args) { static_assert( - OffsetView::Rank == sizeof...(Args), + OffsetView::rank() == sizeof...(Args), "subview requires one argument for each source OffsetView rank"); return Kokkos::Experimental::Impl::subview_offset(src, args...); @@ -1641,12 +1139,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1672,12 +1170,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1704,11 +1202,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename 
ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); auto dstView = dst.view(); @@ -1719,11 +1217,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1733,11 +1231,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const View& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1748,11 +1246,11 @@ template inline void deep_copy( const View& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); Kokkos::deep_copy(dst, value.view()); @@ -1770,7 +1268,7 @@ struct MirrorOffsetViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1786,27 +1284,6 @@ struct MirrorOffsetViewType { std::conditional_t; }; -template 
-struct MirrorOffsetType { - // The incoming view_type - using src_view_type = typename Kokkos::Experimental::OffsetView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it.) - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = - Kokkos::Experimental::OffsetView; -}; - } // namespace Impl namespace Impl { @@ -1825,10 +1302,12 @@ inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + return typename Kokkos::Impl::MirrorOffsetViewType< + Space, T, P...>::dest_view_type(prop_copy, src.layout(), + {src.begin(0), src.begin(1), + src.begin(2), src.begin(3), + src.begin(4), src.begin(5), + src.begin(6), src.begin(7)}); } else { return typename Kokkos::Experimental::OffsetView::HostMirror( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); @@ -1877,9 +1356,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorOffsetType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::OffsetView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } @@ -1905,14 +1384,14 
@@ inline auto create_mirror_view( const Kokkos::Experimental::OffsetView& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::OffsetView::HostMirror(src); } else { diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index 9d04cf6acd0..52af567c61d 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -184,16 +184,16 @@ struct DefaultContribution -struct DefaultDuplication { +struct DefaultDuplication { using type = Kokkos::Experimental::ScatterNonDuplicated; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; @@ -532,32 +532,56 @@ void args_to_array(size_t* array, int pos, T dim0, Dims... dims) { subview where the index specified is the largest-stride one. */ template struct Slice { - using next = Slice; - using value_type = typename next::value_type; - - static value_type get(V const& src, const size_t i, Args... args) { + using next = Slice; + static auto get(V const& src, const size_t i, Args... args) { return next::get(src, i, Kokkos::ALL, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... 
args) { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, i, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, args..., i); } }; +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; +#endif + template struct ReduceDuplicates; @@ -905,7 +929,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(arg); @@ -1028,10 +1052,7 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1233,8 +1254,8 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1460,7 +1478,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && 
view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(thread_id, arg); @@ -1470,9 +1488,9 @@ class ScatterAccess::array_layout, typename ViewTraits::device_type, Op, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits::execution_space, typename std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>>::type, diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 8ce868cac21..ec1b8905c76 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -190,7 +190,7 @@ struct GraphRowViewConst { const typename GraphType::entries_type& colidx_in, const ordinal_type& stride, const ordinal_type& count, const OffsetType& idx, - const std::enable_if_t::value, int>& = 0) + const std::enable_if_t, int>& = 0) : colidx_(&colidx_in(idx)), stride_(stride), length(count) {} /// \brief Number of entries in the row. diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index c3a8b67df8d..4f47051a5c1 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -34,7 +34,7 @@ #include #include -#include +#include #include @@ -746,7 +746,7 @@ class UnorderedMap { /// 'const value_type' via Cuda texture fetch must return by value. 
template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_void::value, // !is_set + !std::is_void_v, // !is_set std::conditional_t> value_at(size_type i) const { KOKKOS_EXPECTS(i < capacity()); @@ -808,8 +808,8 @@ class UnorderedMap { // Re-allocate the views of the calling UnorderedMap according to src // capacity, and deep copy the src data. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> create_copy_view( UnorderedMap const &src) { if (m_hash_lists.data() != src.m_hash_lists.data()) { @@ -821,8 +821,8 @@ class UnorderedMap { // Allocate views of the calling UnorderedMap with the same capacity as the // src. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> allocate_view( UnorderedMap const &src) { insertable_map_type tmp; @@ -852,8 +852,8 @@ class UnorderedMap { // Deep copy view data from src. This requires that the src capacity is // identical to the capacity of the calling UnorderedMap. 
template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> deep_copy_view( UnorderedMap const &src) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp index 88109fb0ba5..83ccfbf6305 100644 --- a/lib/kokkos/containers/src/Kokkos_Vector.hpp +++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp @@ -172,9 +172,8 @@ class KOKKOS_DEPRECATED vector private: template - struct impl_is_input_iterator - : /* TODO replace this */ std::bool_constant< - !std::is_convertible::value> {}; + struct impl_is_input_iterator : /* TODO replace this */ std::bool_constant< + !std::is_convertible_v> {}; public: // TODO: can use detection idiom to generate better error message here later diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index e69e46bb6a8..6255a86c461 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -1,8 +1,7 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) string(TOUPPER ${Tag} DEVICE) @@ -12,57 +11,49 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) set(UnitTestSources UnitTestMain.cpp) set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) file(MAKE_DIRECTORY ${dir}) - foreach(Name - 
Bitset - DualView - DynamicView - DynViewAPI_generic - DynViewAPI_rank12345 - DynViewAPI_rank67 - ErrorReporter - OffsetView - ScatterView - StaticCrsGraph - WithoutInitializing - UnorderedMap - Vector - ViewCtorPropEmbeddedDim - ) + foreach( + Name + Bitset + DualView + DynamicView + DynViewAPI_generic + DynViewAPI_rank12345 + DynViewAPI_rank67 + DynRankView_TeamScratch + ErrorReporter + OffsetView + ScatterView + StaticCrsGraph + WithoutInitializing + UnorderedMap + Vector + ViewCtorPropEmbeddedDim + ) if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 AND Name STREQUAL "Vector") continue() # skip Kokkos::vector test if deprecated code 4 is not enabled endif() # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. set(file ${dir}/Test${Tag}_${Name}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include \n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include \n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() #fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj if(KOKKOS_ENABLE_CUDA AND WIN32) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. 
extractor: bad opc 0 if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) + kokkos_add_executable_and_test(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() -SET(COMPILE_ONLY_SOURCES - TestCreateMirror.cpp - TestDualViewParameterPack.cpp - TestIsViewTrait.cpp -) -KOKKOS_ADD_EXECUTABLE( - ContainersTestCompileOnly - SOURCES - TestCompileMain.cpp - ${COMPILE_ONLY_SOURCES} +set(COMPILE_ONLY_SOURCES TestCreateMirror.cpp TestDualViewParameterPack.cpp TestIsViewTrait.cpp + TestDynRankViewTypedefs.cpp ) +kokkos_add_executable(ContainersTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES}) diff --git a/lib/kokkos/containers/unit_tests/TestBitset.hpp b/lib/kokkos/containers/unit_tests/TestBitset.hpp index 9923453f72c..91dc1710e5f 100644 --- a/lib/kokkos/containers/unit_tests/TestBitset.hpp +++ b/lib/kokkos/containers/unit_tests/TestBitset.hpp @@ -39,7 +39,7 @@ struct TestBitset { TestBitset(bitset_type const& bitset) : m_bitset(bitset) {} - unsigned testit(unsigned collisions) { + unsigned testit(unsigned long long collisions) { execution_space().fence(); unsigned count = 0; diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index 2512cb5c491..5d03e6202a8 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -71,7 +71,7 @@ struct test_dualview_copy_construction_and_assignment { using SrcViewType = Kokkos::DualView; using DstViewType = - Kokkos::DualView; + Kokkos::DualView; SrcViewType a("A", n, m); @@ -520,58 +520,26 @@ namespace { * that we keep the semantics of UVM DualViews intact. 
*/ // modify if we have other UVM enabled backends -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) // OR other UVM builds -#define UVM_ENABLED_BUILD -#endif - -#ifdef UVM_ENABLED_BUILD -template -struct UVMSpaceFor; -#endif - -#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA -template <> -struct UVMSpaceFor { - using type = Kokkos::CudaUVMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_SYCL // specific to SYCL -template <> -struct UVMSpaceFor { - using type = Kokkos::Experimental::SYCLSharedUSMSpace; -}; -#endif -#ifdef KOKKOS_ENABLE_HIP // specific to HIP -template <> -struct UVMSpaceFor { - using type = Kokkos::HIPManagedSpace; -}; -#endif - -#ifdef UVM_ENABLED_BUILD -template <> -struct UVMSpaceFor { - using type = typename UVMSpaceFor::type; -}; +#ifdef KOKKOS_HAS_SHARED_SPACE +template +using TestSharedSpace = Kokkos::SharedSpace; #else -template -struct UVMSpaceFor { - using type = typename ExecSpace::memory_space; -}; +template +using TestSharedSpace = typename ExecutionSpace::memory_space; #endif using ExecSpace = Kokkos::DefaultExecutionSpace; -using MemSpace = typename UVMSpaceFor::type; +using MemSpace = TestSharedSpace; using DeviceType = Kokkos::Device; using DualViewType = Kokkos::DualView; -using d_device = DeviceType; -using h_device = Kokkos::Device< - Kokkos::DefaultHostExecutionSpace, - typename UVMSpaceFor::type>; +using ConstDualViewType = + Kokkos::DualView; +using d_device = DeviceType; +using h_device = + Kokkos::Device>; TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { DualViewType dv("myView", 100); @@ -635,14 +603,69 @@ TEST(TEST_CATEGORY, dualview_template_views_return_correct_executionspace_views) { DualViewType dv("myView", 100); dv.clear_sync_state(); - using hvt = decltype(dv.view()); - using dvt = decltype(dv.view()); + using hvt = decltype(dv.view()); + using dvt = decltype(dv.view()); ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), 
dvt::device_type::execution_space::name()); ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), hvt::device_type::execution_space::name()); } +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_views_from_const_dual_view) { + DualViewType dv("myView", 100); + ConstDualViewType const_dv = dv; + dv.clear_sync_state(); + ASSERT_EQ(dv.view(), + const_dv.view()); + ASSERT_EQ(dv.view(), + const_dv.view()); +} + +// User-defined types with a View data member, only host-constructible +template +class S { + V v_; + + public: + template + S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + S() : v_("v", 10) {} +}; + +template +auto initialize_view_of_views() { + Kokkos::DualView dv_v( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 3u); + + V v("v", 2); + V w("w", 2); + dv_v.h_view(0) = v; + dv_v.h_view(1) = w; + + dv_v.modify_host(); + dv_v.sync_device(); + + return dv_v; +} + +TEST(TEST_CATEGORY, dualview_sequential_host_init) { + auto dv_v = initialize_view_of_views>(); + dv_v.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv_v.d_view.size(), 2u); + ASSERT_EQ(dv_v.h_view.size(), 2u); + + initialize_view_of_views>>(); + + Kokkos::DualView dv( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 1u); + dv.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv.d_view.size(), 2u); + ASSERT_EQ(dv.h_view.size(), 2u); + dv.realloc(Kokkos::view_alloc(Kokkos::SequentialHostInit), 3u); + ASSERT_EQ(dv.d_view.size(), 3u); + ASSERT_EQ(dv.h_view.size(), 3u); +} } // anonymous namespace } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp b/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp new file mode 100644 index 00000000000..95117a22e6e --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp @@ -0,0 +1,260 @@ +//@HEADER +// ************************************************************************ +// 
+// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +// clang-format off +template +struct data_analysis { + using data_type = DataType; + using const_data_type = const DataType; + using runtime_data_type = DataType; + using runtime_const_data_type = const DataType; + using non_const_data_type = std::remove_const_t; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type*; + using const_data_type = typename data_analysis::const_data_type*; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type*; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type[N]; + using const_data_type = typename data_analysis::const_data_type[N]; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type[N]; +}; + +template +constexpr bool test_view_typedefs_impl() { + // ======================== + // inherited from ViewTraits + // ======================== + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + + // FIXME: these should be deprecated and for proper testing (I.e. 
where this is different from data_type) + // we would need ensemble types which use the hidden View dimension facility of View (i.e. which make + // "specialize" not void) + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + static_assert(std::is_same_v); + + // FIXME: value_type definition conflicts with mdspan value_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + + // FIXME: should maybe be deprecated + static_assert(std::is_same_v); + + // FIXME: should be deprecated and is some complicated impl type + // static_assert(!std::is_void_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should be deprecated in favor of reference + static_assert(std::is_same_v); + // FIXME: should be deprecated in favor of data_handle_type + static_assert(std::is_same_v); + + // ========================================= + // in Legacy View: some helper View variants + // ========================================= + + // FIXME: in contrast to View, hooks_policy is not propagated + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + +/* FIXME: these don't exist in DynRankView, should they? + using uniform_layout_type = std::conditional_t), + Kokkos::LayoutLeft, Layout>; + + // Uhm uniformtype removes all memorytraits? 
+ static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + + using anonymous_device_type = Kokkos::Device; + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); +*/ + + // ================================== + // mdspan compatibility + // ================================== + + // FIXME: This typedef caused some weird issue with MSVC+NVCC + // static_assert(std::is_same_v); + // FIXME: Not supported yet + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + + static_assert(std::is_same_v); + // FIXME: should be remove_const_t + static_assert(std::is_same_v); + // FIXME: should be extents_type::index_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should come from accessor_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + return true; +} + +// Helper function to unpack data type and other args from the View, and pass them on +template +struct ViewParams {}; + +template +constexpr bool test_view_typedefs(ViewParams) { + return test_view_typedefs_impl, Kokkos::ViewTraits, + T, L, S, M, HostMirrorSpace, ValueType, ReferenceType>(); +} + + +constexpr bool is_host_exec = std::is_same_v; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) || defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +constexpr bool has_unified_mem_space = true; +#else +constexpr bool has_unified_mem_space = 
false; +#endif + +// The test take explicit template arguments for: LayoutType, Space, MemoryTraits, HostMirrorSpace, ValueType, ReferenceType +// The ViewParams is just a type pack for the View template arguments + +// Kokkos::View +namespace TestInt { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestIntDefaultExecutionSpace { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, it is HostSpace (note difference from View ...) + using host_mirror_space = std::conditional_t>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestFloatPPHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::HostSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View> +namespace TestFloatPPDeviceDefaultHostExecHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::Device; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams>{})); +} + +// Kokkos::View> +namespace TestIntAtomic { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + 
static_assert(test_view_typedefs>>>( + ViewParams>{})); +} +// clang-format on +} // namespace diff --git a/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp b/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp new file mode 100644 index 00000000000..e5f8860de76 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp @@ -0,0 +1,72 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +void test_dyn_rank_view_team_scratch() { + using execution_space = TEST_EXECSPACE; + using memory_space = execution_space::scratch_memory_space; + using drv_type = Kokkos::DynRankView; + using policy_type = Kokkos::TeamPolicy; + using team_type = policy_type::member_type; + + int N0 = 10, N1 = 4, N2 = 3; + size_t shmem_size = drv_type::shmem_size(N0, N1, N2); + ASSERT_GE(shmem_size, N0 * N1 * N2 * sizeof(int)); + + Kokkos::View> + errors("errors"); + auto policy = policy_type(1, Kokkos::AUTO) + .set_scratch_size(0, Kokkos::PerTeam(shmem_size)); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(const team_type& team) { + drv_type scr(team.team_scratch(0), N0, N1, N2); + // Control that the code ran at all + if (scr.rank() != 3) errors() |= 1u; + if (scr.extent_int(0) != N0) errors() |= 2u; + if (scr.extent_int(1) != N1) errors() |= 4u; + if (scr.extent_int(2) != N2) errors() |= 8u; + Kokkos::parallel_for( + Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { scr(i, j, k) = i * 100 + j * 10 
+ k; }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { + if (scr(i, j, k) != i * 100 + j * 10 + k) + errors() |= 16u; + }); + errors() |= 256u; + }); + unsigned h_errors = 0; + Kokkos::deep_copy(h_errors, errors); + + ASSERT_EQ((h_errors & 1u), 0u) << "Rank mismatch"; + ASSERT_EQ((h_errors & 2u), 0u) << "extent 0 mismatch"; + ASSERT_EQ((h_errors & 4u), 0u) << "extent 1 mismatch"; + ASSERT_EQ((h_errors & 8u), 0u) << "extent 2 mismatch"; + ASSERT_EQ((h_errors & 16u), 0u) << "data access incorrect"; + ASSERT_EQ(h_errors, 256u); +} + +TEST(TEST_CATEGORY, dyn_rank_view_team_scratch) { + test_dyn_rank_view_team_scratch(); +} + +} // namespace diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 4ecb6cf25cc..930c76c32c4 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -792,9 +792,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -817,9 +816,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -846,9 +844,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -879,8 +876,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 
1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -915,8 +911,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -943,8 +938,6 @@ class TestDynViewAPI { dView0 d("d"); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - // Rank 0 Kokkos::resize(d); @@ -1121,8 +1114,6 @@ class TestDynViewAPI { Kokkos::deep_copy(error_flag_host, error_flag); ASSERT_EQ(error_flag_host(), 0); #endif // MDRangePolict Rank < 7 - -#endif // defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) } static void run_test_scalar() { diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index c8f8fed3b8b..94ccea86eb9 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -71,7 +71,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -85,7 +84,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 
4x larger than previous size // the first 1/4 should remain the same @@ -93,7 +91,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -108,7 +105,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -123,7 +119,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -137,7 +132,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 4x larger than previous size // the first 1/4 should remain the same @@ -145,7 +139,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -160,7 +153,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -175,7 +167,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -189,14 +180,12 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // remove the final 3/4 entries i.e. 
first 1/4 remain unsigned da_resize = arg_total_size / 8; da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -210,7 +199,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Reproducer to demonstrate compile-time error of deep_copy @@ -229,7 +217,6 @@ struct TestDynamicView { device_dynamic_view.resize_serial(da_size); // Use parallel_for to populate device_dynamic_view and verify values -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); @@ -243,7 +230,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // Use an on-device View as intermediate to deep_copy the // device_dynamic_view to host, zero out the device_dynamic_view, @@ -251,13 +237,11 @@ struct TestDynamicView { Kokkos::deep_copy(device_view, device_dynamic_view); Kokkos::deep_copy(host_view, device_view); Kokkos::deep_copy(device_view, host_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); -#endif Kokkos::deep_copy(device_dynamic_view, device_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + value_type new_result_sum = 0.0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, da_size), @@ -267,21 +251,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif - - // Try to deep_copy device_dynamic_view directly to/from host. - // host-to-device currently fails to compile because DP and SP are - // swapped in the deep_copy implementation. 
- // Once that's fixed, both deep_copy's will fail at runtime because the - // destination execution space cannot access the source memory space. - // Check if the memory spaces are different before testing the deep_copy. - if (!Kokkos::SpaceAccessibility::accessible) { - ASSERT_THROW(Kokkos::deep_copy(host_view, device_dynamic_view), - std::runtime_error); - ASSERT_THROW(Kokkos::deep_copy(device_dynamic_view, host_view), - std::runtime_error); - } } } }; diff --git a/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp index 0003a29468c..4ebab889c78 100644 --- a/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp +++ b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp @@ -149,7 +149,6 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase { } }; -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase { @@ -178,7 +177,6 @@ struct ErrorReporterDriverUseLambda driver_base::check_expectations(reporter_capacity, test_size); } }; -#endif #ifdef KOKKOS_ENABLE_OPENMP struct ErrorReporterDriverNativeOpenMP @@ -205,8 +203,7 @@ struct ErrorReporterDriverNativeOpenMP // FIXME_MSVC MSVC just gets confused when using the base class in the // KOKKOS_CLASS_LAMBDA -#if !defined(KOKKOS_COMPILER_MSVC) && \ - (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +#ifndef KOKKOS_COMPILER_MSVC TEST(TEST_CATEGORY, ErrorReporterViaLambda) { TestErrorReporter>(); } diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index c133922e3de..706b40fff38 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -56,7 +56,18 @@ void test_offsetview_construction() { offset_view_type ov("firstOV", range0, range1); ASSERT_EQ("firstOV", ov.label()); - ASSERT_EQ(2, ov.Rank); + +#ifdef 
KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + ASSERT_EQ(2u, ov.Rank); +#endif +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + + ASSERT_EQ(2u, ov.rank()); ASSERT_EQ(ov.begin(0), -1); ASSERT_EQ(ov.end(0), 4); @@ -67,7 +78,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(0), 5u); ASSERT_EQ(ov.extent(1), 5u); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -149,7 +159,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif { offset_view_type ovCopy(ov); @@ -184,7 +193,6 @@ void test_offsetview_construction() { range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, point3_type{{extent0, extent1, extent2}}); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -207,7 +215,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif } view_type viewFromOV = ov.view(); @@ -232,7 +239,6 @@ void test_offsetview_construction() { view_type aView("aView", ov.extent(0), ov.extent(1)); Kokkos::deep_copy(aView, ov); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -242,7 +248,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif } { // test view to offsetview deep copy @@ -251,7 +256,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, 99); Kokkos::deep_copy(ov, aView); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -261,7 +265,6 @@ void 
test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif } } @@ -329,46 +332,131 @@ void test_offsetview_unmanaged_construction() { ASSERT_EQ(bb, ib); ASSERT_EQ(bb, ii); } +} + +template +void test_offsetview_unmanaged_construction_death() { + // Preallocated memory (Only need a valid address for this test) + Scalar s; + + // Regular expression syntax on Windows is a pain. `.` does not match `\n`. + // Feel free to make it work if you have time to spare. +#ifdef _WIN32 +#define SKIP_REGEX_ON_WINDOWS(REGEX) "" +#else +#define SKIP_REGEX_ON_WINDOWS(REGEX) REGEX +#endif { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must be positive - ASSERT_NO_THROW(offset_view_type(&s, {0}, {1})); - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0})); - ASSERT_THROW(offset_view_type(&s, {0}, {-1}), std::runtime_error); + (void)offset_view_type(&s, {0}, {1}); + (void)offset_view_type(&s, {0}, {0}); + ASSERT_DEATH( + offset_view_type(&s, {0}, {-1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(-1\\) - begins\\[0\\] \\(0\\)\\) must be " + "non-negative")); } { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must not overflow - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0x7fffffffffffffffl})); - ASSERT_THROW(offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW( + (void)offset_view_type(&s, {0}, {0x7fffffffffffffffl}); + ASSERT_DEATH( + offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-1\\)\\) " + "overflows")); + ASSERT_DEATH( offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), - 
std::runtime_error); + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); + ASSERT_DEATH( + offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(0\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); } { using offset_view_type = Kokkos::Experimental::OffsetView; - // Should throw when the rank of begins and/or ends doesn't match that of - // OffsetView - ASSERT_THROW(offset_view_type(&s, {0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1}), std::runtime_error); - ASSERT_NO_THROW(offset_view_type(&s, {0, 0}, {1, 1})); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), - std::runtime_error); + // Should throw when the rank of begins and/or ends doesn't match that + // of OffsetView + ASSERT_DEATH( + offset_view_type(&s, {0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for 
unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + (void)offset_view_type(&s, {0, 0}, {1, 1}); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); } +#undef SKIP_REGEX_ON_WINDOWS } template @@ -377,8 +465,8 @@ void test_offsetview_subview() { Kokkos::Experimental::OffsetView sliceMe("offsetToSlice", {-10, 20}); { - auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0); - ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken."; + auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0); + ASSERT_EQ(offsetSubview.rank(), 0u) << "subview of offset is broken."; } } { // test subview 2 @@ -387,13 +475,13 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), -2); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is 
broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -406,30 +494,29 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::make_pair(-30, -21)); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; ASSERT_EQ(offsetSubview.begin(0), -20); ASSERT_EQ(offsetSubview.end(0), 31); ASSERT_EQ(offsetSubview.begin(1), 0); ASSERT_EQ(offsetSubview.end(1), 9); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -455,25 +542,24 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif } // slice 2 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 
1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -486,73 +572,72 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } // slice 2 auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2a.rank(), 2u) << "subview of offset is broken."; { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of 
offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } // slice 3 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T init, BinaryOperation op) { @@ -586,6 +671,7 @@ void 
test_offsetview_offsets_rank1() { KOKKOS_LAMBDA(const int ii, int& lerrors) { offset_view_type ov(v, {ii}); lerrors += (ov(3) != element({3 - ii})); + lerrors += (ov[3] != element({3 - ii})); }, errors); @@ -655,7 +741,6 @@ void test_offsetview_offsets_rank3() { ASSERT_EQ(0, errors); } -#endif TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); @@ -665,11 +750,15 @@ TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); } +TEST(TEST_CATEGORY_DEATH, offsetview_unmanaged_construction) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_offsetview_unmanaged_construction_death(); +} + TEST(TEST_CATEGORY, offsetview_subview) { test_offsetview_subview(); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) TEST(TEST_CATEGORY, offsetview_offsets_rank1) { test_offsetview_offsets_rank1(); } @@ -681,7 +770,6 @@ TEST(TEST_CATEGORY, offsetview_offsets_rank2) { TEST(TEST_CATEGORY, offsetview_offsets_rank3) { test_offsetview_offsets_rank3(); } -#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index 733f43122ce..72c1afbe96a 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -33,11 +33,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -134,11 +134,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -235,11 +235,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - 
Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -335,11 +335,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -714,7 +714,7 @@ void test_scatter_view(int64_t n) { test_sv_config.run_test(n); } #ifdef KOKKOS_ENABLE_SERIAL - if (!std::is_same::value) { + if (!std::is_same_v) { #endif test_scatter_view_config::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); } } /* namespace TestStaticCrsGraph */ diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp index 4a7e826ecbe..fc7435a75e5 100644 --- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -460,7 +460,7 @@ struct UnorderedMapInsert { //! Insert multiple values. template - void insert(Args &&... 
args) const { + void insert(Args &&...args) const { static_assert(sizeof...(Args) > 1, "Prefer the single value version"); constexpr size_t size = sizeof...(Args); Kokkos::Array values{ @@ -534,8 +534,6 @@ TEST(TEST_CATEGORY, UnorderedMap_shallow_copyable_on_device) { ASSERT_EQ(1u, test_map_copy.m_map.size()); } -#if !defined(KOKKOS_ENABLE_CUDA) || \ - (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_LAMBDA)) void test_unordered_map_device_capture() { TestMapCopy::map_type map; @@ -549,7 +547,6 @@ void test_unordered_map_device_capture() { TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) { test_unordered_map_device_capture(); } -#endif /** * @test This test ensures that an @ref UnorderedMap can be built diff --git a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp index 0246f11ddfe..2edddcce34f 100644 --- a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp +++ b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp @@ -48,7 +48,7 @@ struct TestViewCtorProp_EmbeddedDim { void operator()(const int i) const { v(i) = i; } }; - static void test_vcpt(const int N0, const int N1) { + static void test_vcpt(const size_t N0, const size_t N1) { // Create two views to test { using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; @@ -78,16 +78,16 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); #if 0 // debug output - for ( int i = 0; i < N0*N1; ++i ) { - printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + for ( size_t i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%zu) = %lf\n ", i, hcv1(i) ); } printf( " Common value type view: %s \n", typeid( CVT() ).name() ); printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); - if ( std::is_same< CommonViewValueType, double 
>::value == true ) { + if ( std::is_same_v< CommonViewValueType, double > == true ) { printf("Proper common value_type\n"); } else { @@ -115,7 +115,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } @@ -148,7 +148,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } { @@ -169,7 +169,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } diff --git a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index e8558628dc8..2932898554c 100644 --- a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -44,6 +44,12 @@ Kokkos::CudaSpace>) \ GTEST_SKIP() << "skipping since unified memory requires additional " \ "fences"; +#elif defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v) \ + GTEST_SKIP() << "skipping since unified memory requires additional " \ + "fences"; #else #define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE #endif @@ -51,8 +57,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 5, 6, 7, - 8); + Kokkos::DualView bla("bla", 5, 6, 7, 8); auto success = validate_absence( [&]() { @@ -82,8 +87,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); - 
Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -112,8 +116,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -245,7 +248,7 @@ TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif @@ -280,7 +283,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_scatterview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 4, 5, 6, 7); auto success = validate_absence( @@ -312,7 +315,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -343,7 +346,7 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( 
@@ -384,13 +387,12 @@ TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif #if defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the HPX backend always fences with async " "dispatch disabled"; #endif diff --git a/lib/kokkos/core/CMakeLists.txt b/lib/kokkos/core/CMakeLists.txt index 0917928001a..21f05f62724 100644 --- a/lib/kokkos/core/CMakeLists.txt +++ b/lib/kokkos/core/CMakeLists.txt @@ -1,22 +1,14 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() -FUNCTION(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) - IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() - ENDIF() +function(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) + if(NOT Kokkos_ENABLE_BENCHMARKS) + return() + endif() - IF(KOKKOS_HAS_TRILINOS) - message( - STATUS - "Benchmarks are not supported when building as part of Trilinos" - ) - RETURN() - ENDIF() + add_subdirectory(${DIR_NAME}) +endfunction() - ADD_SUBDIRECTORY(${DIR_NAME}) -ENDFUNCTION() - -KOKKOS_ADD_TEST_DIRECTORIES(unit_test) -KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test) +kokkos_add_test_directories(unit_test) +kokkos_add_benchmark_directory(perf_test) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index e0dba03e1ec..0cb2c804d38 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -1,50 +1,36 @@ # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. 
# FIXME_OPENACC - temporarily disabled due to unimplemented features -IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - RETURN() -ENDIF() -IF (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - RETURN() -ENDIF() +if((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + return() +endif() +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + return() +endif() # all PerformanceTest_* executables are part of regular tests # TODO: finish converting these into benchmarks (in progress) -IF(KOKKOS_ENABLE_TESTS) - IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE ( - PerformanceTest_SharedSpace - SOURCES test_sharedSpace.cpp - ) - ENDIF() - - KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - - IF(NOT Kokkos_ENABLE_OPENMPTARGET) - # FIXME OPENMPTARGET needs tasking - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_TaskDag - SOURCES test_taskdag.cpp - CATEGORIES PERFORMANCE - ) - ENDIF() -ENDIF() - -IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() -ENDIF() - -IF (KOKKOS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") -ENDIF() +if(KOKKOS_ENABLE_TESTS) + if(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) + kokkos_add_executable(PerformanceTest_SharedSpace SOURCES test_sharedSpace.cpp) + endif() + + kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + + kokkos_add_executable_and_test(PerformanceTest_TaskDag SOURCES test_taskdag.cpp CATEGORIES PERFORMANCE) +endif() + +if(NOT Kokkos_ENABLE_BENCHMARKS) + return() +endif() # Find or download google/benchmark library find_package(benchmark QUIET 1.5.6) -IF(benchmark_FOUND) - MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") -ELSE() +if(benchmark_FOUND) + message(STATUS "Using google benchmark found 
in ${benchmark_DIR}") +else() message(STATUS "No installed google benchmark found, fetching from GitHub") include(FetchContent) - SET(BENCHMARK_ENABLE_TESTING OFF) + set(BENCHMARK_ENABLE_TESTING OFF) list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") FetchContent_Declare( @@ -57,143 +43,93 @@ ELSE() list(POP_BACK CMAKE_MESSAGE_INDENT) # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") - ENDIF() + if(CMAKE_CXX_CLANG_TIDY) + set_target_properties(benchmark PROPERTIES CXX_CLANG_TIDY "") + endif() target_compile_options(benchmark PRIVATE -w) target_compile_options(benchmark_main PRIVATE -w) -ENDIF() +endif() +function(KOKKOS_ADD_BENCHMARK NAME) + cmake_parse_arguments(BENCHMARK "" "" "SOURCES" ${ARGN}) + if(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) + message(WARNING "Unexpected arguments when adding a benchmark: " ${BENCHMARK_UNPARSED_ARGUMENTS}) + endif() -FUNCTION(KOKKOS_ADD_BENCHMARK NAME) - CMAKE_PARSE_ARGUMENTS( - BENCHMARK - "" - "" - "SOURCES" - ${ARGN} - ) - IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - MESSAGE( - WARNING - "Unexpected arguments when adding a benchmark: " - ${BENCHMARK_UNPARSED_ARGUMENTS} - ) - ENDIF() - - SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) - LIST(APPEND BENCHMARK_SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp - ) + set(BENCHMARK_NAME Kokkos_${NAME}) + list(APPEND BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp) - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - ${BENCHMARK_SOURCES} - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version - ) - TARGET_INCLUDE_DIRECTORIES( - ${BENCHMARK_NAME} - SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include - ) + add_executable(${BENCHMARK_NAME} ${BENCHMARK_SOURCES}) + target_link_libraries(${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version) + target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE 
${benchmark_SOURCE_DIR}/include) - FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) - SET_SOURCE_FILES_PROPERTIES( - ${SOURCE_FILE} - PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE} - ) - ENDFOREACH() - - STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - SET( - BENCHMARK_ARGS - --benchmark_counters_tabular=true - --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json - ) + foreach(SOURCE_FILE ${BENCHMARK_SOURCES}) + set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + endforeach() - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} - ) -ENDFUNCTION() - -SET( - BENCHMARK_SOURCES - PerfTestGramSchmidt.cpp - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - PerfTestHexGrad.cpp - PerfTest_MallocFree.cpp - PerfTest_ViewAllocate.cpp - PerfTest_ViewCopy_a123.cpp - PerfTest_ViewCopy_b123.cpp - PerfTest_ViewCopy_c123.cpp - PerfTest_ViewCopy_d123.cpp - PerfTest_ViewCopy_a45.cpp - PerfTest_ViewCopy_b45.cpp - PerfTest_ViewCopy_c45.cpp - PerfTest_ViewCopy_d45.cpp - PerfTest_ViewCopy_a6.cpp - PerfTest_ViewCopy_b6.cpp - PerfTest_ViewCopy_c6.cpp - PerfTest_ViewCopy_d6.cpp - PerfTest_ViewCopy_a7.cpp - PerfTest_ViewCopy_b7.cpp - PerfTest_ViewCopy_c7.cpp - PerfTest_ViewCopy_d7.cpp - PerfTest_ViewCopy_a8.cpp - PerfTest_ViewCopy_b8.cpp - PerfTest_ViewCopy_c8.cpp - PerfTest_ViewCopy_d8.cpp - PerfTest_ViewCopy_Raw.cpp - PerfTest_ViewFill_123.cpp - PerfTest_ViewFill_45.cpp - PerfTest_ViewFill_6.cpp - PerfTest_ViewFill_7.cpp - PerfTest_ViewFill_8.cpp - PerfTest_ViewFill_Raw.cpp - PerfTest_ViewResize_123.cpp - PerfTest_ViewResize_45.cpp - PerfTest_ViewResize_6.cpp - PerfTest_ViewResize_7.cpp - PerfTest_ViewResize_8.cpp - PerfTest_ViewResize_Raw.cpp -) + string(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) + set(BENCHMARK_ARGS --benchmark_counters_tabular=true --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json) + + add_test(NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} 
${BENCHMARK_ARGS}) +endfunction() -IF(Kokkos_ENABLE_OPENMPTARGET) -# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - LIST(REMOVE_ITEM BENCHMARK_SOURCES +set(BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Benchmark - SOURCES ${BENCHMARK_SOURCES} + PerfTestHexGrad.cpp + PerfTest_MallocFree.cpp + PerfTest_ViewAllocate.cpp + PerfTest_ViewCopy_a123.cpp + PerfTest_ViewCopy_b123.cpp + PerfTest_ViewCopy_c123.cpp + PerfTest_ViewCopy_d123.cpp + PerfTest_ViewCopy_a45.cpp + PerfTest_ViewCopy_b45.cpp + PerfTest_ViewCopy_c45.cpp + PerfTest_ViewCopy_d45.cpp + PerfTest_ViewCopy_a6.cpp + PerfTest_ViewCopy_b6.cpp + PerfTest_ViewCopy_c6.cpp + PerfTest_ViewCopy_d6.cpp + PerfTest_ViewCopy_a7.cpp + PerfTest_ViewCopy_b7.cpp + PerfTest_ViewCopy_c7.cpp + PerfTest_ViewCopy_d7.cpp + PerfTest_ViewCopy_a8.cpp + PerfTest_ViewCopy_b8.cpp + PerfTest_ViewCopy_c8.cpp + PerfTest_ViewCopy_d8.cpp + PerfTest_ViewCopy_Raw.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp + PerfTest_ViewFill_Raw.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp + PerfTest_ViewResize_Raw.cpp ) -IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) - KOKKOS_ADD_BENCHMARK( - Benchmark_Atomic_MinMax - SOURCES test_atomic_minmax_simple.cpp +if(Kokkos_ENABLE_OPENMPTARGET) + # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + list(REMOVE_ITEM BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp ) -ENDIF() +endif() + +kokkos_add_benchmark(PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES}) + +kokkos_add_benchmark(Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp) # FIXME_NVHPC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL 
NVHPC) - KOKKOS_ADD_BENCHMARK( - PerformanceTest_Mempool - SOURCES test_mempool.cpp - ) -ENDIF() +if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + kokkos_add_benchmark(PerformanceTest_Mempool SOURCES test_mempool.cpp) +endif() -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Atomic - SOURCES test_atomic.cpp -) +kokkos_add_benchmark(PerformanceTest_Atomic SOURCES test_atomic.cpp) diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp index 98cb246c71e..1ebe750f216 100644 --- a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp +++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp @@ -34,10 +34,10 @@ struct HexGrad { enum { NSpace = 3, NNode = 8 }; using elem_coord_type = - Kokkos::View; + Kokkos::View; using elem_grad_type = - Kokkos::View; + Kokkos::View; elem_coord_type coords; elem_grad_type grad_op; diff --git a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp index 2110f38a916..03340a5d6de 100644 --- a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp @@ -21,7 +21,6 @@ #include #include -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA namespace Test { template std::pair custom_reduction_test(int N, int R) { @@ -130,4 +129,3 @@ BENCHMARK(CustomReduction) ->UseManualTime(); } // namespace Test -#endif diff --git a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index d2a3d0b823a..aa23ddbb607 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -56,8 +56,7 @@ bool is_overlapping(const Kokkos::HIP&) { #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> -bool is_overlapping( - const Kokkos::Experimental::SYCL&) { +bool is_overlapping(const Kokkos::SYCL&) { return true; } #endif diff --git 
a/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 67a8d7e5554..e4db40e128c 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) @@ -38,6 +37,5 @@ BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) ->UseManualTime(); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp index c11074d9154..57bba83a9c1 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) @@ -28,6 +27,5 @@ BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp index 2d1bcbb3cab..ab469cb647c 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewResize_NoInit_Raw) ->ArgName("N") ->Arg(N) @@ -30,6 +29,5 @@ BENCHMARK(ViewResize_NoInit_Raw) ->Arg(N) ->UseManualTime() ->Iterations(R); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/test_mempool.cpp b/lib/kokkos/core/perf_test/test_mempool.cpp index 9905740afb4..bdfe59b0b3b 100644 --- a/lib/kokkos/core/perf_test/test_mempool.cpp +++ b/lib/kokkos/core/perf_test/test_mempool.cpp @@ -198,7 +198,7 @@ static void Mempool_Fill(benchmark::State& state) { int fill_level = 
get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, @@ -225,7 +225,7 @@ static void Mempool_Alloc_Dealloc(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, diff --git a/lib/kokkos/core/perf_test/test_sharedSpace.cpp b/lib/kokkos/core/perf_test/test_sharedSpace.cpp index 4f140c9409a..3c06770e286 100644 --- a/lib/kokkos/core/perf_test/test_sharedSpace.cpp +++ b/lib/kokkos/core/perf_test/test_sharedSpace.cpp @@ -103,7 +103,7 @@ size_t getDeviceMemorySize() { #elif defined KOKKOS_ENABLE_HIP return Kokkos::HIP{}.hip_device_prop().totalGlobalMem; #elif defined KOKKOS_ENABLE_SYCL - auto device = Kokkos::Experimental::SYCL{}.sycl_queue().get_device(); + auto device = Kokkos::SYCL{}.sycl_queue().get_device(); return device.get_info(); #else #error \ diff --git a/lib/kokkos/core/perf_test/test_taskdag.cpp b/lib/kokkos/core/perf_test/test_taskdag.cpp index fccaab64ddf..347d9748b5a 100644 --- a/lib/kokkos/core/perf_test/test_taskdag.cpp +++ b/lib/kokkos/core/perf_test/test_taskdag.cpp @@ -32,6 +32,11 @@ int main() { return 0; } #include +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + using ExecSpace = Kokkos::DefaultExecutionSpace; inline long eval_fib(long n) { @@ -223,4 +228,8 @@ int main(int argc, char* argv[]) { return 0; } +#ifdef 
KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + #endif diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index b84677e61b6..72663739a14 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -1,118 +1,125 @@ -KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} - ${KOKKOS_TOP_BUILD_DIR} -) -IF (NOT desul_FOUND) - IF(KOKKOS_ENABLE_CUDA) - SET(DESUL_ATOMICS_ENABLE_CUDA ON) - ENDIF() - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP) - SET(DESUL_ATOMICS_ENABLE_HIP ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_SYCL) - SET(DESUL_ATOMICS_ENABLE_SYCL ON) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) - ENDIF() - ENDIF() - IF(KOKKOS_ENABLE_OPENMPTARGET) - SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP - ENDIF() - IF(KOKKOS_ENABLE_OPENACC) - SET(DESUL_ATOMICS_ENABLE_OPENACC ON) - ENDIF() - CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp - ) - KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ${KOKKOS_TOP_BUILD_DIR}) +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + set(DESUL_ATOMICS_ENABLE_CUDA ON) + endif() + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_HIP) + set(DESUL_ATOMICS_ENABLE_HIP ON) + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + 
set(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_SYCL) + set(DESUL_ATOMICS_ENABLE_SYCL ON) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + endif() + endif() + if(KOKKOS_ENABLE_OPENMPTARGET) + set(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP + endif() + if(KOKKOS_ENABLE_OPENACC) + # FIXME_OPENACC FIXME_CLACC - Below condition will be removed if Clacc can compile atomics. + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + set(DESUL_ATOMICS_ENABLE_OPENACC ON) + endif() + endif() + configure_file( + ${KOKKOS_SOURCE_DIR}/tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp ) -ENDIF() + kokkos_include_directories(${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/" +install( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.hpp" PATTERN "*.h" ) -SET(KOKKOS_CORE_SRCS) -APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CORE_HEADERS) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) - -IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENMP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENMPTARGET) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENACC) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) 
- APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_THREADS) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_HPX) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_SERIAL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) -ENDIF() - -IF (NOT desul_FOUND) - IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_CUDA.cpp) - ELSEIF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_HIP.cpp) - ELSEIF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_SYCL.cpp) - ENDIF() - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/*/*/*.inc*) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) - - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul" - 
"${CMAKE_CURRENT_BINARY_DIR}/desul" +set(KOKKOS_CORE_SRCS) +append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CORE_HEADERS) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) + +if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/Kokkos_Cuda_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENMP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/Kokkos_OpenMP_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENACC) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) +endif() + +if(KOKKOS_ENABLE_THREADS) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) +endif() + +if(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) +endif() + +if(KOKKOS_ENABLE_HPX) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/Kokkos_HPX_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) +endif() + +if(KOKKOS_ENABLE_SERIAL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/Kokkos_Serial_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) +endif() + +if(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +endif() + +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_CUDA.cpp) + elseif(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_HIP.cpp) + elseif(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_SYCL.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/*/*/*.inc*) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) + + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul" "${CMAKE_CURRENT_BINARY_DIR}/desul" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.inc" @@ -120,33 +127,26 @@ IF (NOT desul_FOUND) PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal desul_atomics copy") -ELSE() - MESSAGE(STATUS "Using external desul_atomics install found at:") - MESSAGE(STATUS " " ${desul_DIR}) -ENDIF() - + message(STATUS "Using internal desul_atomics copy") +else() + message(STATUS "Using external desul_atomics install found at:") + message(STATUS " " ${desul_DIR}) +endif() -KOKKOS_ADD_LIBRARY( - kokkoscore - SOURCES 
${KOKKOS_CORE_SRCS} - HEADERS ${KOKKOS_CORE_HEADERS} +kokkos_add_library( + kokkoscore SOURCES ${KOKKOS_CORE_SRCS} HEADERS ${KOKKOS_CORE_HEADERS} ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -IF (NOT desul_FOUND) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include - ) -ENDIF() +if(NOT desul_FOUND) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -IF (Kokkos_ENABLE_IMPL_MDSPAN) - MESSAGE(STATUS "Experimental mdspan support is enabled") +if(Kokkos_ENABLE_IMPL_MDSPAN) + message(STATUS "Experimental mdspan support is enabled") # Some compilers now include mdspan... we just flag on their version # for now until we can get some compiler detection support @@ -154,62 +154,56 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) check_include_file_cxx(experimental/mdspan KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN) check_include_file_cxx(mdspan KOKKOS_COMPILER_SUPPORTS_MDSPAN) - if (Kokkos_ENABLE_MDSPAN_EXTERNAL) - MESSAGE(STATUS "Using external mdspan") + if(Kokkos_ENABLE_MDSPAN_EXTERNAL) + message(STATUS "Using external mdspan") target_link_libraries(kokkoscore PUBLIC std::mdspan) elseif(KOKKOS_COMPILER_SUPPORTS_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied mdspan") elseif(KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied experimental/mdspan") else() - KOKKOS_LIB_INCLUDE_DIRECTORIES( - kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include - ) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include) - APPEND_GLOB(KOKKOS_CORE_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/__p0009_bits/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/mdspan) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/__p0009_bits/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/mdspan) - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/" + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "mdspan" PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal mdspan directory ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include") + message(STATUS "Using internal mdspan directory ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include") endif() -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) +kokkos_link_tpl(kokkoscore PUBLIC HWLOC) +kokkos_link_tpl(kokkoscore PUBLIC CUDA) +kokkos_link_tpl(kokkoscore PUBLIC HPX) +kokkos_link_tpl(kokkoscore PUBLIC LIBDL) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread -IF (NOT WIN32) - KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) -ENDIF() +if(NOT WIN32) + kokkos_link_tpl(kokkoscore PUBLIC THREADS) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_link_tpl(kokkoscore PUBLIC ROCM) +endif() # FIXME: We need a proper solution to figure out whether to enable # libatomic # Most compilers only require libatomic for 128-bit CAS # I (CT) had removed 128bit CAS from desul to not need libatomic. 
-IF (KOKKOS_ENABLE_OPENMPTARGET) +if(KOKKOS_ENABLE_OPENMPTARGET) target_link_libraries(kokkoscore PUBLIC atomic) -ENDIF() +endif() -IF (desul_FOUND) +if(desul_FOUND) target_link_libraries(kokkoscore PUBLIC desul_atomics) -ENDIF() +endif() -# FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency so we -# just append the flags in cmake/kokkos_tpls.cmake instead of linking with the -# OpenMP target. -IF(Kokkos_ENABLE_OPENMP AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_OPENMP) target_link_libraries(kokkoscore PUBLIC OpenMP::OpenMP_CXX) -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH) +kokkos_link_tpl(kokkoscore PUBLIC LIBQUADMATH) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index fd86976d3ba..07c35e6611f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -35,7 +35,6 @@ static_assert(false, #include // CUDA_SAFE_CALL #include -#include #include #include #include diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 6ae24022c8f..8bcd6525c96 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -201,7 +201,14 @@ void *impl_allocate_common(const int device_id, } } #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - if (arg_alloc_size >= memory_threshold_g) { + // FIXME_KEPLER Everything after Kepler should support cudaMallocAsync + int device_supports_cuda_malloc_async; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceGetAttribute(&device_supports_cuda_malloc_async, + cudaDevAttrMemoryPoolsSupported, device_id)); + + if (arg_alloc_size >= memory_threshold_g && + device_supports_cuda_malloc_async == 1) { error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); if (error_code == cudaSuccess) { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp 
b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index e1d062d72d5..1ccf38a4a15 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -73,9 +73,9 @@ class CudaSpace { CudaSpace(int device_id, cudaStream_t stream); public: - CudaSpace(CudaSpace&& rhs) = default; - CudaSpace(const CudaSpace& rhs) = default; - CudaSpace& operator=(CudaSpace&& rhs) = default; + CudaSpace(CudaSpace&& rhs) = default; + CudaSpace(const CudaSpace& rhs) = default; + CudaSpace& operator=(CudaSpace&& rhs) = default; CudaSpace& operator=(const CudaSpace& rhs) = default; ~CudaSpace() = default; @@ -174,9 +174,9 @@ class CudaUVMSpace { CudaUVMSpace(int device_id, cudaStream_t stream); public: - CudaUVMSpace(CudaUVMSpace&& rhs) = default; - CudaUVMSpace(const CudaUVMSpace& rhs) = default; - CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(const CudaUVMSpace& rhs) = default; + CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; CudaUVMSpace& operator=(const CudaUVMSpace& rhs) = default; ~CudaUVMSpace() = default; @@ -266,9 +266,9 @@ class CudaHostPinnedSpace { CudaHostPinnedSpace(int device_id, cudaStream_t stream); public: - CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; - CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; - CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; + CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace& operator=(const CudaHostPinnedSpace& rhs) = default; ~CudaHostPinnedSpace() = default; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index 5a821ab64a3..058b1f538d5 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -51,7 +51,8 @@ class GraphNodeKernelImpl m_graph_node_ptr = nullptr; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... - mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; public: using Policy = PolicyType; @@ -61,25 +62,20 @@ class GraphNodeKernelImpl - GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, Cuda const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) // This is super ugly, but it works for now and is the most minimal change // to the codebase for now... - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} // FIXME @graph Forward through the instance once that works in the backends template GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::CudaSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", ex, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -90,13 +86,21 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const CudaSpace& mem) const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - 
GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast(mem.allocate(alloc_label.c_str(), sizeof(base_t))), + [alloc_label, mem](base_t* ptr) { + mem.deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr) - return m_driver_storage; + return m_driver_storage.get(); } + + auto get_driver_storage() const { return m_driver_storage; } }; struct CudaGraphNodeAggregateKernel { @@ -128,7 +132,8 @@ struct get_graph_node_kernel_type // {{{1 template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const CudaSpace& mem, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = @@ -136,7 +141,7 @@ auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(mem); } template diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index 625d8c317a1..8e800e756d2 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -51,7 +51,14 @@ struct GraphImpl { using node_details_t = GraphNodeBackendSpecificDetails; - void _instantiate_graph() { + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. 
+ std::vector> m_driver_storage; + + public: + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; @@ -60,10 +67,10 @@ struct GraphImpl { ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph, &error_node, error_log, error_log_size))); + KOKKOS_ENSURES(m_graph_exec); // TODO @graphs print out errors } - public: using root_node_impl_t = GraphNodeImpl; @@ -74,11 +81,11 @@ struct GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl() { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to @@ -129,6 +136,8 @@ struct GraphImpl { kernel.set_cuda_graph_node_ptr(&cuda_node); kernel.execute(); KOKKOS_ENSURES(bool(cuda_node)); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } template @@ -158,13 +167,13 @@ struct GraphImpl { &cuda_node, 1))); } - void submit() { + void submit(const execution_space& exec) { if (!bool(m_graph_exec)) { - _instantiate_graph(); + instantiate(); } KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() - ->cuda_graph_launch_wrapper(m_graph_exec))); + (exec.impl_internal_space_instance()->cuda_graph_launch_wrapper( + m_graph_exec))); } execution_space const& get_execution_space() const noexcept { @@ -197,6 +206,9 @@ struct GraphImpl { m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_kernel_impl_t{}); } + + cudaGraph_t cuda_graph() { return m_graph; } + cudaGraphExec_t 
cuda_graph_exec() { return m_graph_exec; } }; } // end namespace Impl diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 158c8acdda6..ec5768a7f0f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -//#include -//#include -//#include -//#include +// #include +// #include +// #include +// #include #include #include #include @@ -687,16 +687,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << " KOKKOS_ENABLE_CUDA: yes\n"; os << "Cuda Options:\n"; - os << " KOKKOS_ENABLE_CUDA_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA - os << "yes\n"; -#else - os << "no\n"; -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - os << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; - os << "yes\n"; -#endif os << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE os << "yes\n"; @@ -708,12 +698,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "yes\n"; #else os << "no\n"; -#endif - os << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - os << "yes\n"; -#else - os << "no\n"; #endif os << " KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC: "; #ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index b0dadb45f72..2d00e735cb9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -209,8 +209,8 @@ inline void configure_shmem_preference(const int cuda_device, // Use multiples of 8kB const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; size_t carveout = shmem_per_block == 0 - ? 0 - : 100 * + ? 
0 + : 100 * (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) / min_shmem_size_per_sm) * @@ -491,7 +491,10 @@ struct CudaParallelLaunchKernelInvoker< cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel( + CudaSpace::impl_create(cuda_instance->m_cudaDev, + cuda_instance->m_stream), + driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl @@ -714,7 +717,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::launch_kernel((Args &&) args...); + base_t::launch_kernel((Args&&)args...); } }; @@ -728,7 +731,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::create_parallel_launch_graph_node((Args &&) args...); + base_t::create_parallel_launch_graph_node((Args&&)args...); } }; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 63038984004..c50ff430345 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -95,11 +95,39 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxthreads = m_rp.space().cuda_device_prop().maxThreadsDim; + [[maybe_unused]] const auto maxThreadsPerBlock = + m_rp.space().cuda_device_prop().maxThreadsPerBlock; + // make sure the Z dimension (it is less than x,y limits) isn't exceeded + const auto clampZ = [&](const int input) { + return (input > maxthreads[2] ? 
maxthreads[2] : input); + }; + // make sure the block dimensions don't exceed the max number of threads + // allowed + const auto check_block_sizes = [&]([[maybe_unused]] const dim3& block) { + KOKKOS_ASSERT(block.x > 0 && + block.x <= static_cast(maxthreads[0])); + KOKKOS_ASSERT(block.y > 0 && + block.y <= static_cast(maxthreads[1])); + KOKKOS_ASSERT(block.z > 0 && + block.z <= static_cast(maxthreads[2])); + KOKKOS_ASSERT(block.x * block.y * block.z <= + static_cast(maxThreadsPerBlock)); + }; + // make sure the grid dimensions don't exceed the max number of blocks + // allowed + const auto check_grid_sizes = [&]([[maybe_unused]] const dim3& grid) { + KOKKOS_ASSERT(grid.x > 0 && + grid.x <= static_cast(maxblocks[0])); + KOKKOS_ASSERT(grid.y > 0 && + grid.y <= static_cast(maxblocks[1])); + KOKKOS_ASSERT(grid.z > 0 && + grid.z <= static_cast(maxblocks[2])); + }; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -108,13 +136,12 @@ class ParallelFor, Kokkos::Cuda> { (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, maxblocks[1]), 1); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], clampZ(m_rp.m_tile[2])); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -125,15 +152,16 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, maxblocks[2])); + // ensure we don't exceed the capability of the device + check_grid_sizes(grid); CudaParallelLaunch( 
*this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 4) { // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], - m_rp.m_tile[3]); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + clampZ(m_rp.m_tile[3])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -143,14 +171,15 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 5) { // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - KOKKOS_ASSERT(block.z > 0); + m_rp.m_tile[2] * m_rp.m_tile[3], clampZ(m_rp.m_tile[4])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -159,6 +188,7 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 6) { @@ -166,7 +196,8 @@ class ParallelFor, Kokkos::Cuda> { // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); + clampZ(m_rp.m_tile[4] * m_rp.m_tile[5])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -174,6 +205,7 @@ class ParallelFor, Kokkos::Cuda> { maxblocks[1]), std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, 
m_rp.space().impl_internal_space_instance()); } else { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 334834938a1..8251fcb248d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -48,7 +48,7 @@ class ParallelFor, Kokkos::Cuda> { const FunctorType m_functor; const Policy m_policy; - ParallelFor() = delete; + ParallelFor() = delete; ParallelFor& operator=(const ParallelFor&) = delete; template diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 71e77518210..a2955e3ab61 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -539,9 +539,14 @@ class ParallelFor, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? 
m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -631,7 +636,7 @@ class ParallelReduce word_count(m_functor_reducer.get_reducer().value_size() / sizeof(word_size_type)); - reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = + m_functor_reducer.get_reducer().init(reinterpret_cast( + kokkos_impl_cuda_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league const int int_league_size = (int)m_league_size; @@ -895,11 +901,16 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 86d6d91bbee..5090e84c38c 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -31,6 +31,9 @@ //---------------------------------------------------------------------------- +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #if defined(__CUDA_ARCH__) #define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) 
\ { \ @@ -584,9 +587,9 @@ class TaskExec { private: enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize }; - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; + TaskExec(TaskExec&&) = delete; + TaskExec(TaskExec const&) = delete; + TaskExec& operator=(TaskExec&&) = delete; TaskExec& operator=(TaskExec const&) = delete; friend class Kokkos::Impl::TaskQueue< @@ -1224,5 +1227,7 @@ KOKKOS_INLINE_FUNCTION void single( #undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() + #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index c2b5f1fa789..aec692c2c36 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -184,24 +184,37 @@ class CudaTeamMember { * ( 1 == blockDim.z ) */ template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer) const noexcept { team_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE(( + typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( - (typename Impl::FunctorAnalysis< - 
Impl::FunctorPatternInterface::REDUCE, TeamPolicy, - ReducerType, typename ReducerType::value_type>::Reducer - wrapped_reducer(reducer); - cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y); - reducer.reference() = value;)) + (cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);)) } //-------------------------------------------------------------------------- @@ -260,23 +273,42 @@ class CudaTeamMember { //---------------------------------------- template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer) { vector_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE( + (typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_vector_reduce(wrapped_reducer, value); + reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer_v> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( (if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; unsigned mask = blockDim.x == 32 @@ -287,7 +319,7 @@ class CudaTeamMember { for (int i = blockDim.x; (i >>= 1);) { Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask); if ((int)threadIdx.x < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ 
-297,7 +329,7 @@ class CudaTeamMember { // and thus different threads could have different results. Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask); - value = tmp2; reducer.reference() = tmp2;)) + value = tmp2;)) } //---------------------------------------- @@ -487,14 +519,21 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { KOKKOS_IF_ON_DEVICE( - (typename ReducerType::value_type value; + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - reducer.init(value); + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value);)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); reducer.reference() = value;)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -518,16 +557,25 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; + KOKKOS_IF_ON_DEVICE( - (ValueType val; Kokkos::Sum reducer(val); + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value{}; + wrapped_reducer.init(&value); for (iType i = 
loop_boundaries.start + threadIdx.y; - i < loop_boundaries.end; i += blockDim.y) { closure(i, val); } + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference();)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } template @@ -548,16 +596,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value; - reducer.init(value); + KOKKOS_IF_ON_DEVICE( + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, value); } + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value;)) - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value);)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -573,18 +632,27 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< 
(void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE((ValueType val; Kokkos::Sum reducer(val); - reducer.init(reducer.reference()); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, val); } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference();)) + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) } //---------------------------------------------------------------------------- @@ -632,13 +700,22 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< Closure const& closure, ReducerType const& reducer) { KOKKOS_IF_ON_DEVICE(( - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.x) { closure(i, reducer.reference()); } + i < 
loop_boundaries.end; i += blockDim.x) { closure(i, value); } - Impl::CudaTeamMember::vector_reduce(reducer); + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value; )) // Avoid bogus warning about reducer value being uninitialized with combined @@ -667,15 +744,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE( - (result = ValueType(); - for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; i += blockDim.x) { closure(i, result); } + KOKKOS_IF_ON_DEVICE(( + + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - Impl::CudaTeamMember::vector_reduce(Kokkos::Sum(result)); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - )) + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index a3f4f2f4ccc..9e0c5819f71 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -125,8 +125,8 @@ struct in_place_shfl_op { struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ 
-136,28 +136,28 @@ struct in_place_shfl_fn : in_place_shfl_op { }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { return __shfl_up_sync(mask, val, lane, width); } }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -168,7 +168,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... 
args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index 517c592af72..0ac2d4052d2 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -23,15 +23,12 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, - const View& dst) { +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, void* dst, size_t cnt) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() - ->cuda_memset_async_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); + ->cuda_memset_async_wrapper(dst, 0, cnt))); } }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp index aced2083ffb..8de3a8758fa 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -27,6 +27,8 @@ #include +#include + namespace Kokkos { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -49,34 +51,44 @@ void HIP::impl_initialize(InitializationSettings const& settings) { Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( hipGetDeviceProperties(&Impl::HIPInternal::m_deviceProp, hip_device_id)); - const auto& hipProp = Impl::HIPInternal::m_deviceProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(hip_device_id)); - // number of multiprocessors - Impl::HIPInternal::m_multiProcCount = hipProp.multiProcessorCount; + // Check that we are running on the expected architecture. We print a warning + // instead of erroring out because AMD does not guarantee that gcnArchName + // will always contain the gfx flag. 
+ if (Kokkos::show_warnings()) { + if (std::string_view arch_name = + Impl::HIPInternal::m_deviceProp.gcnArchName; + arch_name.find(KOKKOS_ARCH_AMD_GPU) != 0) { + std::cerr + << "Kokkos::HIP::initialize WARNING: running kernels compiled for " + << KOKKOS_ARCH_AMD_GPU << " on " << arch_name << " device.\n"; + } + } - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. - Impl::HIPInternal::m_maxWarpCount = - hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; - if (Impl::HIPTraits::WarpSize < Impl::HIPInternal::m_maxWarpCount) { - Impl::HIPInternal::m_maxWarpCount = Impl::HIPTraits::WarpSize; + // Print a warning if the user did not select the right GFX942 architecture +#ifdef KOKKOS_ARCH_AMD_GFX942 + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 1)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300X " + "(discrete GPU) on a MI300A (APU).\n"; + } +#endif +#ifdef KOKKOS_ARCH_AMD_GFX942_APU + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 0)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300A " + "(APU) on a MI300X (discrete GPU).\n"; } +#endif - //---------------------------------- - // Maximum number of blocks - Impl::HIPInternal::m_maxBlock[0] = hipProp.maxGridSize[0]; - Impl::HIPInternal::m_maxBlock[1] = hipProp.maxGridSize[1]; - Impl::HIPInternal::m_maxBlock[2] = hipProp.maxGridSize[2]; - - // theoretically, we can get 40 WF's / CU, but only can sustain 32 see - // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 - Impl::HIPInternal::m_maxWavesPerCU = 32; - Impl::HIPInternal::m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; - Impl::HIPInternal::m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + // theoretically on GFX 9XX GPUs, we can get 40 WF's / CU, but only can + // sustain 32 see + // 
https://github.com/ROCm/clr/blob/4d0b815d06751735e6a50fa46e913fdf85f751f0/hipamd/src/hip_platform.cpp#L362-L366 + const int maxWavesPerCU = + Impl::HIPInternal::m_deviceProp.major <= 9 ? 32 : 64; Impl::HIPInternal::m_maxThreadsPerSM = - Impl::HIPInternal::m_maxWavesPerCU * Impl::HIPTraits::WarpSize; + maxWavesPerCU * Impl::HIPTraits::WarpSize; // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME @@ -146,10 +158,6 @@ void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const { #else os << "no\n"; #endif -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY - os << " KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY: "; - os << "yes\n"; -#endif os << "\nRuntime Configuration:\n"; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 1f084c41e50..90e5cf73559 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -113,8 +113,9 @@ unsigned hip_internal_get_block_size(const HIPInternal *hip_instance, const unsigned min_waves_per_eu = LaunchBounds::minBperSM ? 
LaunchBounds::minBperSM : 1; const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize; - const unsigned shmem_per_sm = hip_instance->m_shmemPerSM; - unsigned block_size = tperb_reg; + const unsigned shmem_per_sm = + hip_instance->m_deviceProp.maxSharedMemoryPerMultiProcessor; + unsigned block_size = tperb_reg; do { unsigned total_shmem = f(block_size); // find how many threads we can fit with this blocksize based on LDS usage diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 5f0df72df17..584cc63d958 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -44,22 +44,17 @@ class GraphNodeKernelImpl // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::HIP const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, HIP const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) 
{} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} template GraphNodeKernelImpl(Kokkos::HIP const& exec_space, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", exec_space, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_hip_graph_ptr(hipGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -73,18 +68,29 @@ class GraphNodeKernelImpl hipGraph_t const* get_hip_graph_ptr() const { return m_graph_ptr; } - Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const HIP& exec) const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast( + HIPSpace().allocate(exec, alloc_label.c_str(), sizeof(base_t))), + // FIXME_HIP Custom deletor should use same 'exec' as for allocation. 
+ [alloc_label](base_t* ptr) { + HIPSpace().deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; + return m_driver_storage.get(); } + auto get_driver_storage() const { return m_driver_storage; } + private: Kokkos::ObservingRawPtr m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; }; struct HIPGraphNodeAggregateKernel { @@ -114,13 +120,14 @@ struct get_graph_node_kernel_type Kokkos::ParallelReduceTag>> {}; template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const HIP& exec, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = static_cast(kernel); - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(exec); } template diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index a0989fe6711..4f97214ca68 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -42,11 +42,11 @@ class GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. 
- GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); @@ -60,7 +60,7 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::HIP& exec); Kokkos::HIP const& get_execution_space() const noexcept; @@ -69,18 +69,28 @@ class GraphImpl { template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; hipGraphNode_t error_node = nullptr; char error_log[error_log_size]; KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphInstantiate( &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); + KOKKOS_ENSURES(m_graph_exec); } + hipGraph_t hip_graph() { return m_graph; } + hipGraphExec_t hip_graph_exec() { return m_graph_exec; } + + private: Kokkos::HIP m_execution_space; hipGraph_t m_graph = nullptr; hipGraphExec_t m_graph_exec = nullptr; + + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. 
+ std::vector> m_driver_storage; }; inline GraphImpl::~GraphImpl() { @@ -123,6 +133,8 @@ inline void GraphImpl::add_node( kernel.set_hip_graph_node_ptr(&node); kernel.execute(); KOKKOS_ENSURES(node); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } // Requires PredecessorRef is a specialization of GraphNodeRef that has @@ -145,16 +157,15 @@ inline void GraphImpl::add_predecessor( hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::HIP& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - KOKKOS_IMPL_HIP_SAFE_CALL( - hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphLaunch(m_graph_exec, exec.hip_stream())); } -inline Kokkos::HIP const& GraphImpl::get_execution_space() const - noexcept { +inline Kokkos::HIP const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index e0b25c69399..54e8c315e3f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -77,7 +77,8 @@ std::size_t scratch_count(const std::size_t size) { //---------------------------------------------------------------------------- int HIPInternal::concurrency() { - static int const concurrency = m_maxThreadsPerSM * m_multiProcCount; + static int const concurrency = + m_maxThreadsPerSM * m_deviceProp.multiProcessorCount; return concurrency; } @@ -97,6 +98,13 @@ void HIPInternal::print_configuration(std::ostream &s) const { << "undefined\n"; #endif + s << "macro KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC: "; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + s << "yes\n"; +#else + s << "no\n"; +#endif + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; 
KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); @@ -177,8 +185,16 @@ void HIPInternal::initialize(hipStream_t stream) { // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + unsigned int maxWarpCount = + m_deviceProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; + if (Impl::HIPTraits::WarpSize < maxWarpCount) { + maxWarpCount = Impl::HIPTraits::WarpSize; + } + const unsigned reduce_block_count = - m_maxWarpCount * Impl::HIPTraits::WarpSize; + maxWarpCount * Impl::HIPTraits::WarpSize; (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); @@ -353,14 +369,8 @@ void HIPInternal::finalize() { m_num_scratch_locks = 0; } -int HIPInternal::m_hipDev = -1; -unsigned HIPInternal::m_multiProcCount = 0; -unsigned HIPInternal::m_maxWarpCount = 0; -std::array HIPInternal::m_maxBlock = {0, 0, 0}; -unsigned HIPInternal::m_maxWavesPerCU = 0; -int HIPInternal::m_shmemPerSM = 0; -int HIPInternal::m_maxShmemPerBlock = 0; -int HIPInternal::m_maxThreadsPerSM = 0; +int HIPInternal::m_hipDev = -1; +int HIPInternal::m_maxThreadsPerSM = 0; hipDeviceProp_t HIPInternal::m_deviceProp; @@ -372,15 +382,7 @@ std::mutex HIPInternal::constantMemMutex; //---------------------------------------------------------------------------- Kokkos::HIP::size_type hip_internal_multiprocessor_count() { - return HIPInternal::singleton().m_multiProcCount; -} - -Kokkos::HIP::size_type hip_internal_maximum_warp_count() { - return HIPInternal::singleton().m_maxWarpCount; -} - -std::array hip_internal_maximum_grid_count() { - return HIPInternal::singleton().m_maxBlock; + return HIPInternal::singleton().m_deviceProp.multiProcessorCount; } Kokkos::HIP::size_type *hip_internal_scratch_space(const HIP &instance, diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp 
b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 437a84253f0..d8043dc23d7 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -31,7 +31,7 @@ namespace Impl { struct HIPTraits { #if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ - defined(KOKKOS_ARCH_AMD_GFX942) + defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ @@ -52,8 +52,6 @@ struct HIPTraits { //---------------------------------------------------------------------------- -HIP::size_type hip_internal_maximum_warp_count(); -std::array hip_internal_maximum_grid_count(); HIP::size_type hip_internal_multiprocessor_count(); HIP::size_type *hip_internal_scratch_space(const HIP &instance, @@ -72,12 +70,6 @@ class HIPInternal { using size_type = ::Kokkos::HIP::size_type; static int m_hipDev; - static unsigned m_multiProcCount; - static unsigned m_maxWarpCount; - static std::array m_maxBlock; - static unsigned m_maxWavesPerCU; - static int m_shmemPerSM; - static int m_maxShmemPerBlock; static int m_maxThreadsPerSM; static hipDeviceProp_t m_deviceProp; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 7cd0afcf47f..e243eb07e78 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -25,11 +25,7 @@ #include #include -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) -#define KOKKOS_IMPL_HIP_GRAPH_ENABLED -#endif - -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH #include #include #endif @@ -173,15 +169,15 @@ struct DeduceHIPLaunchMechanism { static constexpr HIPLaunchMechanism 
launch_mechanism = ((property & force_global_launch) == force_global_launch) ? HIPLaunchMechanism::GlobalMemory - : ((property & light_weight) == light_weight) - ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit - ? HIPLaunchMechanism::LocalMemory - : HIPLaunchMechanism::GlobalMemory) - : (((property & heavy_weight) == heavy_weight) - ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage - ? HIPLaunchMechanism::ConstantMemory - : HIPLaunchMechanism::GlobalMemory) - : (default_launch_mechanism)); + : ((property & light_weight) == light_weight) + ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::GlobalMemory) + : (((property & heavy_weight) == heavy_weight) + ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::GlobalMemory) + : (default_launch_mechanism)); }; template m_stream, ManageStream::no), driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl // which is guaranteed to be alive until the graph instance itself is // destroyed, where there should be a fence ensuring that the allocation // associated with this kernel on the device side isn't deleted. 
- hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, - hip_instance->m_stream); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), + hipMemcpyDefault, hip_instance->m_stream)); void const *args[] = {&driver_ptr}; @@ -551,11 +549,11 @@ struct HIPParallelLaunch< LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, + const dim3 &block, const unsigned int shmem, const HIPInternal *hip_instance, const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { + if (hip_instance->m_deviceProp.sharedMemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( "HIPParallelLaunch FAILED: shared memory request is too large"); } @@ -585,7 +583,7 @@ void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker, HIP> { const Policy m_policy; public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { @@ -57,7 +57,7 @@ class ParallelFor, HIP> { inline void execute() const { using ClosureType = ParallelFor; if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); + auto const maxblocks = m_policy.space().hip_device_prop().maxGridSize; if (Policy::rank == 2) { dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); dim3 const grid( diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp index 9355c1c75fb..3985dc60f06 
100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -53,8 +53,8 @@ class ParallelFor, Kokkos::HIP> { public: using functor_type = FunctorType; - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp index bf0c2193383..83e890bce99 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -71,8 +71,8 @@ class ParallelFor, HIP> { } public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; __device__ inline void operator()() const { @@ -120,9 +120,14 @@ class ParallelFor, HIP> { m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? 
m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -149,8 +154,9 @@ class ParallelFor, HIP> { static_cast(m_league_size)))); } - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + unsigned int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp index 0c24e5cc62a..fb4ff937cdf 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -46,6 +46,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::HIP::size_type), + std::conditional_t, + Kokkos::HIP::size_type>; using reducer_type = ReducerType; using size_type = HIP::size_type; @@ -72,7 +88,7 @@ class ParallelReduce const - word_count(reducer.value_size() / sizeof(size_type)); + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(word_size_type)); - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = reducer.init(reinterpret_cast( + kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league iterate_through_league(threadid, value); // Reduce with final value at blockDim.y - 1 location. bool do_final_reduce = (m_league_size == 0); if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); + do_final_reduce = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); if (do_final_reduce) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; + word_size_type* const shared = + kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? 
reinterpret_cast(m_result_ptr) + : m_scratch_space; if (threadIdx.y == 0) { reducer.final(reinterpret_cast(shared)); @@ -227,7 +244,8 @@ class ParallelReduce(m_scratch_space), result, m_scratch_flags, blockDim.y)) { unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { @@ -249,8 +267,9 @@ class ParallelReduce(hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count)); m_scratch_flags = hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); @@ -306,11 +325,15 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction @@ -356,7 +379,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " "L0 scratch memory")); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index 83f829fddae..0b679218092 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -23,7 +23,7 @@ #include #include -#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPSpace); #else diff --git 
a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index 1ca7bd5cd0e..a464609108c 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -20,7 +20,7 @@ #include #include -#if defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPSpace); #else KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index 4035bb01213..feee44ccaf1 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -100,7 +100,7 @@ template __device__ inline bool hip_inter_block_shuffle_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, FunctorType const& reducer, - HIP::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, HIP::size_type* const m_scratch_flags, int const max_active_thread = blockDim.y) { @@ -115,9 +115,8 @@ __device__ inline bool hip_inter_block_shuffle_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = - reinterpret_cast(m_scratch_space) + blockIdx.x; - *global = value; + pointer_type global = m_scratch_space + blockIdx.x; + *global = value; __threadfence(); } @@ -140,8 +139,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction( last_block = true; value = neutral; - pointer_type const global = - reinterpret_cast(m_scratch_space); + pointer_type const global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = blockDim.x * blockDim.y < 
warp_size diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 67635fc1c4c..47f07b31abf 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -51,28 +51,54 @@ static std::atomic is_first_hip_managed_allocation(true); namespace Kokkos { -HIPSpace::HIPSpace() : m_device(HIP().hip_device()) {} +HIPSpace::HIPSpace() + : m_device(HIP().hip_device()), m_stream(HIP().hip_stream()) {} HIPHostPinnedSpace::HIPHostPinnedSpace() {} HIPManagedSpace::HIPManagedSpace() : m_device(HIP().hip_device()) {} +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY +void* HIPSpace::allocate(const HIP& exec_space, + const size_t arg_alloc_size) const { + return allocate(exec_space, "[unlabeled]", arg_alloc_size); +} + +void* HIPSpace::allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(exec_space.hip_stream(), arg_label, arg_alloc_size, + arg_logical_size, true); +} +#endif + void* HIPSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } -void* HIPSpace::allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +void* HIPSpace::allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(m_stream, arg_label, arg_alloc_size, arg_logical_size, + false); } + void* HIPSpace::impl_allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { + [[maybe_unused]] const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, const size_t arg_logical_size, + [[maybe_unused]] const bool stream_sync_only) const { void* ptr = nullptr; +#ifdef 
KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + auto const error_code = hipMallocAsync(&ptr, arg_alloc_size, stream); + if (stream_sync_only) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(stream)); + } else { + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); + } +#else auto const error_code = hipMalloc(&ptr, arg_alloc_size); +#endif + if (error_code != hipSuccess) { // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here @@ -80,6 +106,8 @@ void* HIPSpace::impl_allocate( Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { + const Kokkos::Tools::SpaceHandle arg_handle = + Kokkos::Tools::make_space_handle(name()); const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); @@ -219,7 +247,12 @@ void HIPSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + KOKKOS_IMPL_HIP_SAFE_CALL(hipFreeAsync(arg_alloc_ptr, m_stream)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); +#else KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); +#endif } void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr, diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index e1b4768b877..2380772cacf 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -58,14 +58,14 @@ class HIPSpace { /*--------------------------------*/ HIPSpace(); - HIPSpace(HIPSpace&& rhs) = default; - HIPSpace(const HIPSpace& rhs) = default; - HIPSpace& operator=(HIPSpace&& rhs) = default; + HIPSpace(HIPSpace&& rhs) = default; + HIPSpace(const HIPSpace& rhs) = default; + HIPSpace& operator=(HIPSpace&& rhs) = default; HIPSpace& operator=(const HIPSpace& rhs) = default; 
~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifdef KOKKOS_IMPL_HIP_UNIFIED_MEMORY template void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { return allocate(arg_alloc_size); @@ -77,15 +77,10 @@ class HIPSpace { return allocate(arg_label, arg_alloc_size, arg_logical_size); } #else - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const size_t arg_alloc_size) const { - return allocate(arg_alloc_size); - } - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return allocate(arg_label, arg_alloc_size, arg_logical_size); - } + void* allocate(const HIP& exec_space, const size_t arg_alloc_size) const; + void* allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; #endif void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, @@ -98,10 +93,10 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; + void* impl_allocate(const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size, + bool stream_sync_only) const; void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -114,6 +109,7 @@ class HIPSpace { private: int m_device; ///< Which HIP device + hipStream_t m_stream; }; template <> @@ -140,9 +136,9 @@ class HIPHostPinnedSpace { /*--------------------------------*/ HIPHostPinnedSpace(); - HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; - 
HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; - HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; + HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; HIPHostPinnedSpace& operator=(const HIPHostPinnedSpace& rhs) = default; ~HIPHostPinnedSpace() = default; @@ -213,9 +209,9 @@ class HIPManagedSpace { /*--------------------------------*/ HIPManagedSpace(); - HIPManagedSpace(HIPManagedSpace&& rhs) = default; - HIPManagedSpace(const HIPManagedSpace& rhs) = default; - HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(const HIPManagedSpace& rhs) = default; + HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; HIPManagedSpace& operator=(const HIPManagedSpace& rhs) = default; ~HIPManagedSpace() = default; @@ -280,7 +276,7 @@ static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); template <> struct MemorySpaceAccess { enum : bool { assignable = false }; -#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) enum : bool{accessible = false}; #else enum : bool { accessible = true }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index fb466d8a721..1724b4361db 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -183,7 +183,7 @@ class HIPTeamMember { typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy, ReducerType, typename ReducerType::value_type>::Reducer wrapped_reducer(reducer); - hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value; #else (void)reducer; @@ -191,6 +191,19 @@ class HIPTeamMember { #endif } + template + KOKKOS_INLINE_FUNCTION 
std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { +#ifdef __HIP_DEVICE_COMPILE__ + hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); +#else + (void)wrapped_reducer; + (void)value; +#endif + } + //-------------------------------------------------------------------------- /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. @@ -261,17 +274,37 @@ class HIPTeamMember { KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { +#ifdef __HIP_DEVICE_COMPILE__ + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; +#else + (void)reducer; + (void)value; +#endif + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { #ifdef __HIP_DEVICE_COMPILE__ if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = blockDim.x; (i >>= 1);) { in_place_shfl_down(tmp2, tmp, i, blockDim.x); if (static_cast(threadIdx.x) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -281,10 +314,9 @@ class HIPTeamMember { // and thus different threads could have different results. 
in_place_shfl(tmp2, tmp, 0, blockDim.x); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; #else - (void)reducer; + (void)wrapped_reducer; (void)value; #endif } @@ -479,15 +511,26 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -508,24 +551,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum reducer(val); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; - i += 
blockDim.y) { - closure(i, val); - } + for (iType i = loop_boundaries.start + threadIdx.y; + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } /** \brief Inter-thread parallel exclusive prefix sum. @@ -620,16 +663,26 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; i < loop_boundaries.end; i += blockDim.y * blockDim.x) { closure(i, value); } - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -642,25 +695,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType 
val; - Kokkos::Sum reducer(val); - - reducer.init(reducer.reference()); - - for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; i += blockDim.y * blockDim.x) { - closure(i, val); - } - - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- @@ -706,14 +761,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i 
= loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; i += blockDim.x) { - closure(i, reducer.reference()); + closure(i, value); } - Impl::HIPTeamMember::vector_reduce(reducer); + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -737,20 +804,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - result = ValueType(); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; - i += blockDim.x) { - closure(i, result); - } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - Impl::HIPTeamMember::vector_reduce(Kokkos::Sum(result)); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp index 67e1181125c..f21c65f16dd 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp 
@@ -222,7 +222,8 @@ class TeamPolicyInternal m_tune_team_size(bool(team_size_request <= 0)), m_tune_vector_length(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + const int max_grid_size_x = m_space.hip_device_prop().maxGridSize[0]; + if (league_size_ >= max_grid_size_x) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on HIP execution " "space."); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp index 30774c898b6..f5b1d321e8c 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp @@ -40,8 +40,8 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { using shfl_type = int; union conv_type { Scalar orig; @@ -65,16 +65,16 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( reinterpret_cast(in), lane_or_delta, width); } template __device__ inline std::enable_if_t - operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( *reinterpret_cast(&in), lane_or_delta, width); } @@ -82,8 +82,8 @@ struct in_place_shfl_op { // sizeof(Scalar) > sizeof(double) case template __device__ inline std::enable_if_t<(sizeof(Scalar) > sizeof(double))> - 
operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, const Scalar& val, int lane_or_delta, + int width) const noexcept { using shuffle_as_t = int; constexpr int N = sizeof(Scalar) / sizeof(shuffle_as_t); @@ -108,7 +108,7 @@ struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { @@ -123,7 +123,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { @@ -138,7 +138,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp new file mode 100644 index 00000000000..34d5ecf1a65 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp @@ -0,0 +1,36 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#endif + +#include +#include + +namespace Kokkos { +namespace Impl { + +// alternative to hipMemsetAsync, which sets the first `cnt` bytes of `dst` to 0 +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt) { + Kokkos::parallel_for( + "Kokkos::ZeroMemset via parallel_for", + Kokkos::RangePolicy(exec_space, 0, cnt), + KOKKOS_LAMBDA(size_t i) { static_cast(dst)[i] = 0; }); +} + +} // namespace Impl +} // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 4bca29868f7..18708cf8c56 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -23,12 +23,21 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst) { - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( - dst.data(), 0, dst.size() * sizeof(typename View::value_type), - exec_space.hip_stream())); +// hipMemsetAsync sets the first `cnt` bytes of `dst` to the provided value +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt); + +template <> +struct ZeroMemset { + ZeroMemset(const HIP& exec_space, void* dst, size_t cnt) { + // in ROCm <= 6.2.0, hipMemsetAsync on a host-allocated pointer + // returns an invalid value error, but accessing the data via a + // GPU kernel works. 
+#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + zero_with_hip_kernel(exec_space, dst, cnt); +#else + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemsetAsync(dst, 0, cnt, exec_space.hip_stream())); +#endif } }; diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp index 245dc128ca8..7d499337908 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -32,12 +32,10 @@ static_assert(false, #include #include #include -#include #include #include #include #include -#include #include #include @@ -75,12 +73,12 @@ class hpx_thread_buffer { } public: - hpx_thread_buffer() = default; - ~hpx_thread_buffer() = default; - hpx_thread_buffer(const hpx_thread_buffer &) = delete; - hpx_thread_buffer(hpx_thread_buffer &&) = delete; + hpx_thread_buffer() = default; + ~hpx_thread_buffer() = default; + hpx_thread_buffer(const hpx_thread_buffer &) = delete; + hpx_thread_buffer(hpx_thread_buffer &&) = delete; hpx_thread_buffer &operator=(const hpx_thread_buffer &) = delete; - hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; + hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; void resize(const std::size_t num_threads, const std::size_t size_per_thread, const std::size_t extra_space = 0) noexcept; @@ -140,10 +138,10 @@ class HPX { hpx::execution::experimental::unique_any_sender<> &&sender) : m_instance_id(instance_id), m_sender{std::move(sender)} {} - instance_data(const instance_data &) = delete; - instance_data(instance_data &&) = delete; + instance_data(const instance_data &) = delete; + instance_data(instance_data &&) = delete; instance_data &operator=(const instance_data &) = delete; - instance_data &operator=(instance_data) = delete; + instance_data &operator=(instance_data) = delete; uint32_t m_instance_id{HPX::impl_default_instance_id()}; hpx::execution::experimental::unique_any_sender<> m_sender{ @@ -196,7 +194,7 @@ class HPX { HPX(HPX &&other) = default; HPX(const HPX &other) = default; - HPX 
&operator=(HPX &&) = default; + HPX &operator=(HPX &&) = default; HPX &operator=(const HPX &) = default; void print_configuration(std::ostream &os, bool /*verbose*/ = false) const; @@ -214,9 +212,9 @@ class HPX { struct impl_in_parallel_scope { impl_in_parallel_scope() noexcept; ~impl_in_parallel_scope() noexcept; - impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; - impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; - impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; + impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; impl_in_parallel_scope &operator=(impl_in_parallel_scope const &) = delete; }; @@ -249,13 +247,15 @@ class HPX { impl_instance_fence(name); } - static bool is_asynchronous(HPX const & = HPX()) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool is_asynchronous(HPX const & = HPX()) noexcept { #if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) return true; #else return false; #endif } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); @@ -281,8 +281,8 @@ class HPX { return impl_get_instance_data().m_buffer; } - hpx::execution::experimental::unique_any_sender<> &impl_get_sender() const - noexcept { + hpx::execution::experimental::unique_any_sender<> &impl_get_sender() + const noexcept { return impl_get_instance_data().m_sender; } @@ -447,6 +447,20 @@ class HPX { } }; +template +std::vector partition_space(HPX const &, Args... 
args) { + std::vector instances(sizeof...(args)); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + +template +std::vector partition_space(HPX const &, std::vector const &weights) { + std::vector instances(weights.size()); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + extern template void HPX::impl_bulk_plain_erased( bool, bool, std::function &&, int const, hpx::threads::thread_stacksize stacksize) const; @@ -1772,11 +1786,24 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } /** \brief Intra-thread vector parallel_for. 
Executes lambda(iType i) for each @@ -1810,14 +1837,26 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + wrapped_reducer.final(&value); + result = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type 
wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -1995,7 +2060,9 @@ KOKKOS_INLINE_FUNCTION void single( } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #endif /* #if defined( KOKKOS_ENABLE_HPX ) */ #endif /* #ifndef KOKKOS_HPX_HPP */ diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp index 28c75b2515a..d775b7fac3b 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -25,6 +25,8 @@ #include +#include + #include #include @@ -33,6 +35,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -256,6 +263,10 @@ extern template class TaskQueue< } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 297b1fadee9..92dc506c5e9 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -30,6 +30,7 @@ static_assert(false, #include #include #include +#include namespace Kokkos { @@ -60,13 +61,13 @@ namespace Impl { // NOTE 
the comparison below is encapsulated to silent warnings about pointless // comparison of unsigned integer with zero template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T) { return false; } template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T arg) { return arg < T{}; } @@ -75,7 +76,7 @@ is_less_than_value_initialized_variable(T arg) { template constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = - (std::is_signed::value != std::is_signed::value); + (std::is_signed_v != std::is_signed_v); auto const ret = static_cast(arg); if (static_cast(ret) != arg || (is_different_signedness && @@ -183,7 +184,7 @@ struct MDRangePolicy template friend struct MDRangePolicy; - static_assert(!std::is_void::value, + static_assert(!std::is_void_v, "Kokkos Error: MD iteration pattern not defined"); using iteration_pattern = typename traits::iteration_pattern; @@ -238,9 +239,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) : MDRangePolicy( @@ -257,9 +258,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const typename traits::execution_space& work_space, const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) @@ -291,14 +292,14 @@ struct MDRangePolicy } template ::value>> + typename = std::enable_if_t>> MDRangePolicy(Kokkos::Array const& lower, Kokkos::Array const& upper, Kokkos::Array const& tile = Kokkos::Array{}) : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} template ::value>> + 
typename = std::enable_if_t>> MDRangePolicy(const typename traits::execution_space& work_space, Kokkos::Array const& lower, Kokkos::Array const& upper, @@ -330,7 +331,44 @@ struct MDRangePolicy } bool impl_tune_tile_size() const { return m_tune_tile_size; } + tile_type tile_size_recommended() const { + tile_type rec_tile_sizes = {}; + + for (std::size_t i = 0; i < rec_tile_sizes.size(); ++i) { + rec_tile_sizes[i] = tile_size_recommended(i); + } + return rec_tile_sizes; + } + + int max_total_tile_size() const { + return Impl::get_tile_size_properties(m_space).max_total_tile_size; + } + private: + int tile_size_recommended(const int tile_rank) const { + auto properties = Impl::get_tile_size_properties(m_space); + int last_rank = (inner_direction == Iterate::Right) ? rank - 1 : 0; + int rank_acc = + (inner_direction == Iterate::Right) ? tile_rank + 1 : tile_rank - 1; + int rec_tile_size = (std::pow(properties.default_tile_size, rank_acc) < + properties.max_total_tile_size) + ? properties.default_tile_size + : 1; + + if (tile_rank == last_rank) { + rec_tile_size = tile_size_last_rank( + properties, m_upper[last_rank] - m_lower[last_rank]); + } + return rec_tile_size; + } + + int tile_size_last_rank(const Impl::TileSizeProperties properties, + const index_type length) const { + return properties.default_largest_tile_size == 0 + ? std::max(length, 1) + : properties.default_largest_tile_size; + } + void init_helper(Impl::TileSizeProperties properties) { m_prod_tile_dims = 1; int increment = 1; @@ -341,6 +379,7 @@ struct MDRangePolicy rank_start = rank - 1; rank_end = -1; } + for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; @@ -368,9 +407,7 @@ struct MDRangePolicy m_tile[i] = 1; } } else { - m_tile[i] = properties.default_largest_tile_size == 0 - ? 
std::max(length, 1) - : properties.default_largest_tile_size; + m_tile[i] = tile_size_last_rank(properties, length); } } m_tile_end[i] = @@ -389,58 +426,55 @@ struct MDRangePolicy }; template -MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; +MDRangePolicy(const LT (&)[N], const UT (&)[N]) -> MDRangePolicy>; template MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], - const TT (&)[TN]) - ->MDRangePolicy>; + const TT (&)[TN]) -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template -MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; +MDRangePolicy(Array const&, Array const&) -> MDRangePolicy>; template MDRangePolicy(Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 9f5deed5d66..62f527aa025 100644 --- a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ 
b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -41,10 +41,10 @@ class AnonymousSpace { using device_type = Kokkos::Device; /**\brief Default memory space instance */ - AnonymousSpace() = default; - AnonymousSpace(AnonymousSpace &&rhs) = default; - AnonymousSpace(const AnonymousSpace &rhs) = default; - AnonymousSpace &operator=(AnonymousSpace &&) = default; + AnonymousSpace() = default; + AnonymousSpace(AnonymousSpace &&rhs) = default; + AnonymousSpace(const AnonymousSpace &rhs) = default; + AnonymousSpace &operator=(AnonymousSpace &&) = default; AnonymousSpace &operator=(const AnonymousSpace &) = default; ~AnonymousSpace() = default; diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp index 4d905fbc553..493536b53be 100644 --- a/lib/kokkos/core/src/Kokkos_Array.hpp +++ b/lib/kokkos/core/src/Kokkos_Array.hpp @@ -35,7 +35,7 @@ namespace Kokkos { #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK namespace Impl { -template ::value> +template > struct ArrayBoundsCheck; template @@ -195,8 +195,10 @@ struct Array { return *reinterpret_cast(-1); } - KOKKOS_INLINE_FUNCTION pointer data() { return nullptr; } - KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr pointer data() { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { + return nullptr; + } friend KOKKOS_FUNCTION constexpr bool operator==(Array const&, Array const&) noexcept { @@ -365,7 +367,7 @@ struct KOKKOS_DEPRECATED #endif template -Array(T, Us...)->Array; +Array(T, Us...) 
-> Array; namespace Impl { @@ -377,7 +379,7 @@ KOKKOS_FUNCTION constexpr Array, N> to_array_impl( template KOKKOS_FUNCTION constexpr Array, N> to_array_impl( - T(&&a)[N], std::index_sequence) { + T (&&a)[N], std::index_sequence) { return {{std::move(a[I])...}}; } @@ -389,7 +391,7 @@ KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { } template -KOKKOS_FUNCTION constexpr auto to_array(T(&&a)[N]) { +KOKKOS_FUNCTION constexpr auto to_array(T (&&a)[N]) { return Impl::to_array_impl(std::move(a), std::make_index_sequence{}); } @@ -435,6 +437,32 @@ KOKKOS_FUNCTION constexpr T const&& get(Array const&& a) noexcept { } // namespace Kokkos // +// +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr T const* begin(Array const& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T* begin(Array& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T const* end(Array const& a) noexcept { + return a.data() + a.size(); +} + +template +KOKKOS_FUNCTION constexpr T* end(Array& a) noexcept { + return a.data() + a.size(); +} + +} // namespace Kokkos +// + #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp index 6fc903f2743..ba611360922 100644 --- a/lib/kokkos/core/src/Kokkos_Atomic.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp @@ -47,7 +47,6 @@ #include #include -#include #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp deleted file mode 100644 index bf57dcae650..00000000000 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ /dev/null @@ -1,196 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#include -#include - -#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() -#else -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() -#endif - -// clang-format off -namespace Kokkos { - -template KOKKOS_INLINE_FUNCTION -T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast(dest), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max 
(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - 
-template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch 
(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), 
KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) 
desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// Exchange - -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) { - return desul::atomic_compare_exchange_strong(const_cast(dest),expected, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) { - return desul::atomic_compare_exchange(const_cast(dest),compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -} -#undef KOKKOS_DESUL_MEM_SCOPE - -// clang-format on -#endif diff --git 
a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index 26db69ac1f1..40f51c5a334 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -24,14 +24,16 @@ static_assert(false, #include #include +#include // identity_type #include -// clang-format off namespace Kokkos { -// FIXME: These functions don't have any use/test in unit tests ... -// ========================================================== -inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline const char* atomic_query_version() { + return "KOKKOS_DESUL_ATOMICS"; +} +#endif #if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ !defined(__CUDA_ARCH__) @@ -53,197 +55,120 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() #endif -template KOKKOS_INLINE_FUNCTION -T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { atomic_store(dest,val); } +namespace Impl { +template +using not_deduced_atomic_t = + std::add_const_t>>; + +template +using enable_if_atomic_t = + std::enable_if_t && !std::is_const_v, + std::remove_volatile_t>; +} // namespace Impl -KOKKOS_INLINE_FUNCTION -void memory_fence() { - desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); -} +// clang-format off -KOKKOS_INLINE_FUNCTION -void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), 
KOKKOS_DESUL_MEM_SCOPE); } +// fences +KOKKOS_INLINE_FUNCTION void memory_fence() { desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void load_fence() { desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void store_fence() { desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } -KOKKOS_INLINE_FUNCTION -void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } +// load/store +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_load (T const* ptr) { return desul::atomic_load (const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_store(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_store(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_store() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_assign(T* ptr, Impl::not_deduced_atomic_t val) { atomic_store(ptr, val); } +#endif // atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t 
val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return 
desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_add(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_sub(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_max(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_min(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mul(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_div(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mod(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION 
Impl::enable_if_atomic_t atomic_fetch_and(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_or (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_xor(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_nand(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_nand(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_lshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_rshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_inc(T* ptr) { return desul::atomic_fetch_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_dec(T* ptr) { return desul::atomic_fetch_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } // atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (T* const dest, 
desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(T* const dest, 
desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_add_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_sub_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_max_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_min_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_mul_fetch(const_cast*>(ptr), val, 
desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_div_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_mod_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_and_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or_fetch (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_or_fetch (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_xor_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_lshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_rshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc_fetch(T* ptr) { return desul::atomic_inc_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), 
KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec_fetch(T* ptr) { return desul::atomic_dec_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } // atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (dest, val, 
desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION 
Impl::enable_if_atomic_t atomic_and(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or (T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc(T* ptr) { desul::atomic_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec(T* ptr) { desul::atomic_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_inc() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_increment(T* ptr) { atomic_inc(ptr); } +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_dec() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_decrement(T* ptr) { atomic_dec(ptr); } +#endif -template KOKKOS_INLINE_FUNCTION 
-void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +// exchange +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_exchange (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_exchange (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return desul::atomic_compare_exchange(const_cast*>(ptr), expected, desired, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_compare_exchange() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange_strong(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return expected == atomic_compare_exchange(ptr, expected, desired); } +#endif -// Exchange +// clang-format on +} // namespace Kokkos -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +namespace Kokkos::Impl { -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t expected, desul::Impl::dont_deduce_this_parameter_t desired) { - T expected_ref = expected; - return desul::atomic_compare_exchange_strong(dest, expected_ref, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); +template +KOKKOS_FUNCTION bool atomic_compare_exchange_strong(T* const dest, T& expected, + const T desired, + MemOrderSuccess succ, + MemOrderFailure fail) { + return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, + fail, KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_INLINE_FUNCTION -T 
atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t compare, desul::Impl::dont_deduce_this_parameter_t desired) { - return desul::atomic_compare_exchange(dest, compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); +template +KOKKOS_FUNCTION T atomic_load(const T* const src, MemoryOrder order) { + return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); } -namespace Impl { - template KOKKOS_INLINE_FUNCTION - bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) { - return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - T atomic_load(const T* const src, MemoryOrder order) { - return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - void atomic_store(T* const src, const T val, MemoryOrder order) { - return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); - } -} // namespace Impl +template +KOKKOS_FUNCTION void atomic_store(T* const src, const T val, + MemoryOrder order) { + return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); +} -} // namespace Kokkos +} // namespace Kokkos::Impl #undef KOKKOS_DESUL_MEM_SCOPE -// clang-format on #endif diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 7dd2a9ddbb7..8233c30b243 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -70,9 +70,8 @@ class complex& operator=(const complex&) noexcept = default; /// \brief Conversion constructor from compatible RType - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_INLINE_FUNCTION complex(const complex& other) noexcept // Intentionally do the conversions implicitly here so that users don't // get any warnings about narrowing, etc., that they would expect to get @@ 
-265,9 +264,8 @@ class #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 //! Copy constructor from volatile. - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex(const volatile complex& src) noexcept // Intentionally do the conversions implicitly here so that users don't @@ -296,7 +294,7 @@ class // vl = r; // vl = cr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator=( const Complex& src) volatile noexcept { re_ = src.re_; @@ -319,7 +317,7 @@ class // vl = vr; // vl = cvr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile complex& operator=( const volatile Complex& src) volatile noexcept { re_ = src.re_; @@ -341,7 +339,7 @@ class // l = cvr; // template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( const volatile Complex& src) noexcept { re_ = src.re_; @@ -539,7 +537,7 @@ inline bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -551,7 +549,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -590,7 +588,7 @@ inline bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - 
std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -602,7 +600,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -778,16 +776,14 @@ KOKKOS_INLINE_FUNCTION complex pow(const complex& x, return x == T() ? T() : exp(y * log(x)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow( const T& x, const complex& y) { using type = Impl::promote_2_t; return pow(type(x), complex(y)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow(const complex& x, const U& y) { using type = Impl::promote_2_t; diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index df78a644a03..0bfb9eb5fa4 100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -41,8 +41,7 @@ struct Dynamic {}; // Schedule Wrapper Type template struct Schedule { - static_assert(std::is_same::value || - std::is_same::value, + static_assert(std::is_same_v || std::is_same_v, "Kokkos: Invalid Schedule<> type."); using schedule_type = Schedule; using type = T; @@ -51,7 +50,7 @@ struct Schedule { // Specify Iteration Index Type template struct IndexType { - static_assert(std::is_integral::value, "Kokkos: Invalid IndexType<>."); + static_assert(std::is_integral_v, "Kokkos: Invalid IndexType<>."); using index_type = IndexType; using type = T; }; @@ -139,8 +138,8 @@ namespace Kokkos { \ public: \ static constexpr bool value = \ - std::is_base_of, T>::value || \ - std::is_base_of, T>::value; \ + std::is_base_of_v, T> || \ + 
std::is_base_of_v, T>; \ constexpr operator bool() const noexcept { return value; } \ }; \ template \ @@ -292,44 +291,6 @@ struct is_space { using execution_space = typename is_exe::space; using memory_space = typename is_mem::space; - - // For backward compatibility, deprecated in favor of - // Kokkos::Impl::HostMirror::host_mirror_space - - private: - // The actual definitions for host_memory_space and host_execution_spaces are - // in do_not_use_host_memory_space and do_not_use_host_execution_space to be - // able to use them within this class without deprecation warnings. - using do_not_use_host_memory_space = std::conditional_t< - std::is_same::value -#if defined(KOKKOS_ENABLE_CUDA) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_HIP) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_SYCL) - || std::is_same::value || - std::is_same::value -#endif - , - memory_space, Kokkos::HostSpace>; - - using do_not_use_host_execution_space = std::conditional_t< -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_HIP) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_SYCL) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - std::is_same::value || -#endif - false, - Kokkos::DefaultHostExecutionSpace, execution_space>; }; } // namespace Kokkos @@ -357,7 +318,7 @@ struct MemorySpaceAccess { * 2. All execution spaces that can access DstMemorySpace can also access * SrcMemorySpace. */ - enum { assignable = std::is_same::value }; + enum { assignable = std::is_same_v }; /**\brief For all DstExecSpace::memory_space == DstMemorySpace * DstExecSpace can access SrcMemorySpace. @@ -442,7 +403,7 @@ struct SpaceAccessibility { // If same memory space or not accessible use the AccessSpace // else construct a device with execution space and memory space. 
using space = std::conditional_t< - std::is_same::value || + std::is_same_v || !exe_access::accessible, AccessSpace, Kokkos::Device>; diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index e856b192471..7da59aa4e41 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -561,21 +561,20 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -649,21 +648,20 @@ void view_copy(const DstType& dst, const SrcType& src) { int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1350,22 +1348,20 @@ inline void contiguous_fill( } // Default implementation for execution spaces that don't provide a definition -template +template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { - using ValueType = typename ViewType::value_type; - 
alignas(alignof(ValueType)) unsigned char - zero_initialized_storage[sizeof(ValueType)] = {}; - contiguous_fill(exec_space, dst, - *reinterpret_cast(zero_initialized_storage)); + ZeroMemset(const ExecutionSpace& exec_space, void* dst, size_t cnt) { + contiguous_fill( + exec_space, + Kokkos::View( + static_cast(dst), cnt), + std::byte{}); } }; template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1375,20 +1371,20 @@ contiguous_fill_or_memset( && !std::is_same_v #endif ) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset>(exec_space, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. 
+ ZeroMemset( + exec_space, dst.data(), + dst.size() * sizeof(typename ViewTraits::value_type)); else contiguous_fill(exec_space, dst, value); } template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1397,9 +1393,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1411,11 +1405,12 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset(exec, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. 
+ ZeroMemset( + exec, dst.data(), dst.size() * sizeof(typename ViewType::value_type)); else #endif contiguous_fill(exec, dst, value); @@ -1423,9 +1418,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1441,8 +1434,8 @@ template inline void deep_copy( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; @@ -1464,8 +1457,8 @@ inline void deep_copy( } Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); // If contiguous we can simply do a 1D flat loop or use memset @@ -1482,21 +1475,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? 
ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1539,8 +1531,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; @@ -1576,8 +1568,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using dst_type = View; @@ -1587,8 +1579,8 @@ inline void deep_copy( using dst_memory_space = typename dst_type::memory_space; using src_memory_space = typename src_type::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -1628,8 +1620,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; @@ -1641,8 +1633,8 @@ inline void deep_copy( using dst_value_type = typename dst_type::value_type; using src_value_type = typename src_type::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), 
@@ -1772,10 +1764,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2191,8 +2183,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const TeamType& team, const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Kokkos::parallel_for(Kokkos::TeamVectorRange(team, dst.span()), [&](const int& i) { dst.data()[i] = value; }); } @@ -2201,8 +2193,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { for (size_t i = 0; i < dst.span(); ++i) { dst.data()[i] = value; } @@ -2568,13 +2560,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2594,21 +2586,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if 
(std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -2649,13 +2640,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && !Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2696,8 +2687,8 @@ inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, std::enable_if_t::value && - std::is_same::specialize, - void>::value>* = nullptr) { + std::is_same_v::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; static_assert(src_traits::rank == 0, @@ -2734,8 +2725,8 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using src_traits = ViewTraits; @@ -2743,8 +2734,8 @@ inline void deep_copy( using src_memory_space = typename src_traits::memory_space; using dst_memory_space = typename dst_traits::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching 
non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2784,15 +2775,15 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; using src_type = View; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -2922,10 +2913,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2994,11 +2985,11 @@ bool size_mismatch(const ViewType& view, unsigned int max_extent, /** \brief Resize a view with copying old data to new data at the corresponding * indices. 
*/ template -inline typename std::enable_if< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value>::type +inline std::enable_if_t< + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, @@ -3048,10 +3039,10 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3066,10 +3057,10 @@ resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3085,10 +3076,10 @@ template inline std::enable_if_t< (Impl::is_view_ctor_property::value || Kokkos::is_execution_space::value) && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> resize(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ 
-3103,12 +3094,12 @@ resize(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3149,12 +3140,12 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3218,10 +3209,10 @@ inline void resize(Kokkos::View& v, /** \brief Resize a view with discarding old data. 
*/ template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, const size_t n6, const size_t n7, @@ -3264,10 +3255,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3283,10 +3274,10 @@ realloc(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3302,10 +3293,10 @@ realloc(Kokkos::View& v, template inline std::enable_if_t< Impl::is_view_ctor_property::value && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> realloc(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3320,12 +3311,12 @@ realloc(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - 
Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3365,12 +3356,12 @@ impl_realloc(Kokkos::View& v, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3435,7 +3426,7 @@ struct MirrorViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -3450,26 +3441,6 @@ struct MirrorViewType { std::conditional_t; }; -template -struct MirrorType { - // The incoming view_type - using src_view_type = typename Kokkos::View; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. 
- using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::View; -}; - // collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { @@ -3503,7 +3474,7 @@ inline auto create_mirror(const Kokkos::View& src, if constexpr (Impl::ViewCtorProp::has_memory_space) { using memory_space = typename decltype(prop_copy)::memory_space; using dst_type = - typename Impl::MirrorType::view_type; + typename Impl::MirrorViewType::dest_view_type; return dst_type(prop_copy, src.layout()); } else { using dst_type = typename View::HostMirror; @@ -3636,12 +3607,12 @@ inline auto create_mirror_view( const Kokkos::View& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::View< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename Kokkos::View::data_type, + typename Kokkos::View::HostMirror::data_type>) { check_view_ctor_args_create_mirror(); return typename Kokkos::View::HostMirror(src); } else { @@ -3785,8 +3756,7 @@ create_mirror_view_and_copy( const Space&, const Kokkos::View& src, std::string const& name = "", std::enable_if_t< - std::is_void::specialize>::value>* = - nullptr) { + std::is_void_v::specialize>>* = nullptr) { return create_mirror_view_and_copy( Kokkos::view_alloc(typename Space::memory_space{}, name), src); } diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 1f146563be2..9588d289a9c 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -63,7 +63,9 @@ #include #include #include 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include @@ -248,9 +250,9 @@ class KOKKOS_ATTRIBUTE_NODISCARD ScopeGuard { } ScopeGuard& operator=(const ScopeGuard&) = delete; - ScopeGuard& operator=(ScopeGuard&&) = delete; - ScopeGuard(const ScopeGuard&) = delete; - ScopeGuard(ScopeGuard&&) = delete; + ScopeGuard& operator=(ScopeGuard&&) = delete; + ScopeGuard(const ScopeGuard&) = delete; + ScopeGuard(ScopeGuard&&) = delete; }; } // namespace Kokkos @@ -281,7 +283,7 @@ std::vector partition_space(ExecSpace const& space, "Kokkos Error: partition_space expects an Execution Space as " "first argument"); static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); std::vector instances(weights.size()); diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 7edb35f00eb..5dbe5714293 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -106,8 +106,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = HIP; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) -using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = - Experimental::SYCL; +using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = SYCL; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Experimental::OpenACC; @@ -122,7 +121,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial; #else #error \ - "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::Experimental::SYCL, 
Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." + "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." #endif #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) @@ -162,7 +161,7 @@ using SharedSpace = CudaUVMSpace; using SharedSpace = HIPManagedSpace; #define KOKKOS_HAS_SHARED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) -using SharedSpace = Experimental::SYCLSharedUSMSpace; +using SharedSpace = SYCLSharedUSMSpace; #define KOKKOS_HAS_SHARED_SPACE // if only host compile point to HostSpace #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) @@ -184,7 +183,7 @@ using SharedHostPinnedSpace = CudaHostPinnedSpace; using SharedHostPinnedSpace = HIPHostPinnedSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) - using SharedHostPinnedSpace = Experimental::SYCLHostUSMSpace; + using SharedHostPinnedSpace = SYCLHostUSMSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) using SharedHostPinnedSpace = HostSpace; diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index 92931b58495..69223b64128 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -84,12 +84,12 @@ class Crs { /* * Default Constructors, operators and destructor */ - KOKKOS_DEFAULTED_FUNCTION Crs() = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; KOKKOS_DEFAULTED_FUNCTION Crs& 
operator=(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; - KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; /** \brief Assign to a view of the rhs array. * If the old view is the last view @@ -148,7 +148,7 @@ class GetCrsTransposeCounts { public: KOKKOS_INLINE_FUNCTION - void operator()(index_type i) const { atomic_increment(&out[in.entries(i)]); } + void operator()(index_type i) const { atomic_inc(&out[in.entries(i)]); } GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out) : in(arg_in), out(arg_out) { using policy_type = RangePolicy; @@ -345,7 +345,7 @@ struct CountAndFill : public CountAndFillBase { closure.execute(); } auto nentries = Kokkos::get_crs_row_map_from_counts(this->m_crs.row_map, - this->m_counts); + this->m_counts); this->m_counts = counts_type(); this->m_crs.entries = entries_type("entries", nentries); { diff --git a/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp index ae28805a42e..8af10b2a409 100644 --- a/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp +++ b/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp @@ -54,8 +54,8 @@ struct detector>, Op, Args...> { } // namespace Impl struct nonesuch : private Impl::nonesuch_base { - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; void operator=(nonesuch const&) = delete; }; @@ -81,7 +81,7 @@ inline constexpr bool is_detected_v = is_detected::value; template class Op, class... Args> inline constexpr bool is_detected_exact_v = - is_detected_exact::value; + is_detected_exact::value; // NOLINT template class Op, class... 
Args> inline constexpr bool is_detected_convertible_v = diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index b8d7f77deb3..dd7ce5ce21f 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -27,7 +27,10 @@ static_assert(false, #include #include #include +#include +#ifndef KOKKOS_ENABLE_IMPL_TYPEINFO #include +#endif #include //---------------------------------------------------------------------------- @@ -197,8 +200,7 @@ class RangePolicy : public Impl::PolicyTraits { /** \brief finalize chunk_size if it was set to AUTO*/ inline void set_auto_chunk_size() { #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v) { + if (std::is_same_v) { // chunk_size <=1 lets the compiler choose the workgroup size when // launching kernels m_granularity = 1; @@ -248,46 +250,49 @@ class RangePolicy : public Impl::PolicyTraits { // To be replaced with std::in_range (c++20) template - static void check_conversion_safety(const IndexType bound) { + static void check_conversion_safety([[maybe_unused]] const IndexType bound) { + // Checking that the round-trip conversion preserves input index value + if constexpr (std::is_convertible_v) { #if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) - std::string msg = - "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " - "is performed on a bound (" + - std::to_string(bound) + - "), which may " - "not preserve its original value.\n"; - bool warn = false; - - if constexpr (std::is_signed_v != - std::is_signed_v) { - // check signed to unsigned - if constexpr (std::is_signed_v) - warn |= (bound < static_cast( - std::numeric_limits::min())); - - // check unsigned to signed - if constexpr (std::is_signed_v) - warn |= (bound > static_cast( - std::numeric_limits::max())); - } + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" 
+ + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr (std::is_arithmetic_v && + (std::is_signed_v != + std::is_signed_v)) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } - // check narrowing - warn |= (static_cast(static_cast(bound)) != bound); + // check narrowing + warn |= + (static_cast(static_cast(bound)) != bound); - if (warn) { + if (warn) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 - Kokkos::abort(msg.c_str()); + Kokkos::abort(msg.c_str()); #endif #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS - Kokkos::Impl::log_warning(msg); + Kokkos::Impl::log_warning(msg); #endif - } -#else - (void)bound; + } #endif + } } public: @@ -333,20 +338,20 @@ class RangePolicy : public Impl::PolicyTraits { }; }; -RangePolicy()->RangePolicy<>; +RangePolicy() -> RangePolicy<>; -RangePolicy(int64_t, int64_t)->RangePolicy<>; -RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; +RangePolicy(int64_t, int64_t) -> RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&) -> RangePolicy<>; -RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t) -> RangePolicy<>; RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) - ->RangePolicy<>; + -> RangePolicy<>; template >> -RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t) -> RangePolicy; template >> -RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&) -> RangePolicy; } // namespace Kokkos @@ -515,24 +520,24 @@ struct PerThreadValue { template struct ExtractVectorLength { static inline iType value( - 
std::enable_if_t::value, iType> val, Args...) { + std::enable_if_t, iType> val, Args...) { return val; } - static inline std::enable_if_t::value, int> value( - std::enable_if_t::value, iType>, Args...) { + static inline std::enable_if_t, int> value( + std::enable_if_t, iType>, Args...) { return 1; } }; template -inline std::enable_if_t::value, iType> -extract_vector_length(iType val, Args...) { +inline std::enable_if_t, iType> extract_vector_length( + iType val, Args...) { return val; } template -inline std::enable_if_t::value, int> -extract_vector_length(iType, Args...) { +inline std::enable_if_t, int> extract_vector_length( + iType, Args...) { return 1; } @@ -577,7 +582,7 @@ struct ScratchRequest { } }; -// Throws a runtime exception if level is not `0` or `1` +// Causes abnormal program termination if level is not `0` or `1` void team_policy_check_valid_storage_level_argument(int level); /** \brief Execution policy for parallel work over a league of teams of @@ -721,55 +726,54 @@ class TeamPolicy // Execution space not provided deduces to TeamPolicy<> -TeamPolicy()->TeamPolicy<>; +TeamPolicy() -> TeamPolicy<>; -TeamPolicy(int, int)->TeamPolicy<>; -TeamPolicy(int, int, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, int, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, int) -> TeamPolicy<>; +TeamPolicy(int, int, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&) -> TeamPolicy<>; // DefaultExecutionSpace deduces to TeamPolicy<> -TeamPolicy(DefaultExecutionSpace const&, int, int)->TeamPolicy<>; -TeamPolicy(DefaultExecutionSpace const&, int, int, int)->TeamPolicy<>; 
+TeamPolicy(DefaultExecutionSpace const&, int, int) -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, - Kokkos::AUTO_t const&) - ->TeamPolicy<>; + Kokkos::AUTO_t const&) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; // ES != DefaultExecutionSpace deduces to TeamPolicy template >> -TeamPolicy(ES const&, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int) -> TeamPolicy; template >> TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) - ->TeamPolicy; + -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&) -> TeamPolicy; namespace Impl { @@ -1041,7 +1045,7 @@ struct TeamThreadMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...) - ->TeamThreadMDRange, TeamHandle>; + -> TeamThreadMDRange, TeamHandle>; template struct ThreadVectorMDRange; @@ -1078,7 +1082,7 @@ struct ThreadVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...) 
- ->ThreadVectorMDRange, TeamHandle>; + -> ThreadVectorMDRange, TeamHandle>; template struct TeamVectorMDRange; @@ -1115,7 +1119,7 @@ struct TeamVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...) - ->TeamVectorMDRange, TeamHandle>; + -> TeamVectorMDRange, TeamHandle>; template @@ -1162,7 +1166,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1198,7 +1202,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1217,15 +1221,21 @@ KOKKOS_INLINE_FUNCTION void parallel_for( namespace Impl { template ::value> + bool HasTag = !std::is_void_v> struct ParallelConstructName; template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = + std::string(TypeInfo>::name()) + + "/" + std::string(TypeInfo::name()); +#else default_name = std::string(typeid(FunctorType).name()) + "/" + typeid(TagType).name(); +#endif } } std::string const& get() { @@ -1239,7 +1249,11 @@ template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { - default_name = std::string(typeid(FunctorType).name()); +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = TypeInfo>::name(); +#else + default_name = typeid(FunctorType).name(); +#endif } } std::string const& get() { diff --git a/lib/kokkos/core/src/Kokkos_Extents.hpp b/lib/kokkos/core/src/Kokkos_Extents.hpp index 9bc2eda6046..7d1f8c755d7 100644 --- a/lib/kokkos/core/src/Kokkos_Extents.hpp +++ b/lib/kokkos/core/src/Kokkos_Extents.hpp @@ -134,7 +134,7 @@ struct ApplyExtent { template struct ApplyExtent { - using type = ValueType * [Ext]; + using type = ValueType* [Ext]; }; 
template diff --git a/lib/kokkos/core/src/Kokkos_Future.hpp b/lib/kokkos/core/src/Kokkos_Future.hpp index 0b3a153de8c..c26d08be1cf 100644 --- a/lib/kokkos/core/src/Kokkos_Future.hpp +++ b/lib/kokkos/core/src/Kokkos_Future.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_FUTURE_HPP #define KOKKOS_FUTURE_HPP @@ -41,13 +47,19 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // For now, hack this in as a partial specialization // TODO @tasking @cleanup Make this the "normal" class template and make the old // code the specialization template -class BasicFuture> { +class KOKKOS_DEPRECATED + BasicFuture> { public: using value_type = ValueType; using execution_space = ExecutionSpace; @@ -244,7 +256,7 @@ class BasicFuture> { //////////////////////////////////////////////////////////////////////////////// template -class BasicFuture { +class KOKKOS_DEPRECATED BasicFuture { private: template friend class BasicTaskScheduler; @@ -413,13 +425,13 @@ class BasicFuture { // Is a Future with the given execution space template -struct is_future : public std::false_type {}; +struct KOKKOS_DEPRECATED is_future : public std::false_type {}; template -struct is_future, ExecSpace> +struct KOKKOS_DEPRECATED is_future, ExecSpace> : std::bool_constant< - std::is_same::value || - std::is_void::value> {}; + std::is_same_v || + std::is_void_v> {}; //////////////////////////////////////////////////////////////////////////////// 
// END OLD CODE @@ -432,8 +444,8 @@ class ResolveFutureArgOrder { private: enum { Arg1_is_space = Kokkos::is_space::value }; enum { Arg2_is_space = Kokkos::is_space::value }; - enum { Arg1_is_value = !Arg1_is_space && !std::is_void::value }; - enum { Arg2_is_value = !Arg2_is_space && !std::is_void::value }; + enum { Arg1_is_value = !Arg1_is_space && !std::is_void_v }; + enum { Arg2_is_value = !Arg2_is_space && !std::is_void_v }; static_assert(!(Arg1_is_space && Arg2_is_space), "Future cannot be given two spaces"); @@ -463,10 +475,15 @@ class ResolveFutureArgOrder { * */ template -using Future = typename Impl::ResolveFutureArgOrder::type; +using Future KOKKOS_DEPRECATED = + typename Impl::ResolveFutureArgOrder::type; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Graph.hpp b/lib/kokkos/core/src/Kokkos_Graph.hpp index 9cc6650e26e..05d774ac61a 100644 --- a/lib/kokkos/core/src/Kokkos_Graph.hpp +++ b/lib/kokkos/core/src/Kokkos_Graph.hpp @@ -86,10 +86,21 @@ struct [[nodiscard]] Graph { return m_impl_ptr->get_execution_space(); } - void submit() const { + void instantiate() { KOKKOS_EXPECTS(bool(m_impl_ptr)) - (*m_impl_ptr).submit(); + (*m_impl_ptr).instantiate(); } + + void submit(const execution_space& exec) const { + KOKKOS_EXPECTS(bool(m_impl_ptr)) + (*m_impl_ptr).submit(exec); + } + + void submit() const { submit(get_execution_space()); } + + decltype(auto) native_graph(); + + decltype(auto) native_graph_exec(); }; // end Graph }}}1 @@ -135,22 +146,68 @@ Graph create_graph(ExecutionSpace ex, Closure&& arg_closure) { // function template injection works. 
auto rv = Kokkos::Impl::GraphAccess::construct_graph(std::move(ex)); // Invoke the user's graph construction closure - ((Closure &&) arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); + ((Closure&&)arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); // and given them back the graph // KOKKOS_ENSURES(rv.m_impl_ptr.use_count() == 1) return rv; } +template +std::enable_if_t, + Graph> +create_graph(ExecutionSpace exec = ExecutionSpace{}) { + return Kokkos::Impl::GraphAccess::construct_graph(std::move(exec)); +} + template < class ExecutionSpace = DefaultExecutionSpace, class Closure = Kokkos::Impl::DoNotExplicitlySpecifyThisTemplateParameter> -Graph create_graph(Closure&& arg_closure) { - return create_graph(ExecutionSpace{}, (Closure &&) arg_closure); +std::enable_if_t< + !Kokkos::is_execution_space_v>, + Graph> +create_graph(Closure&& arg_closure) { + return create_graph(ExecutionSpace{}, (Closure&&)arg_closure); } // end create_graph }}}1 //============================================================================== +template +decltype(auto) Graph::native_graph() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->sycl_graph(); + } +#endif +} + +template +decltype(auto) Graph::native_graph_exec() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr 
(std::is_same_v) { + return m_impl_ptr->sycl_graph_exec(); + } +#endif +} + } // end namespace Experimental } // namespace Kokkos @@ -163,7 +220,7 @@ Graph create_graph(Closure&& arg_closure) { #include #if defined(KOKKOS_ENABLE_HIP) // The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it. -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) +#if defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) #include #endif #endif diff --git a/lib/kokkos/core/src/Kokkos_GraphNode.hpp b/lib/kokkos/core/src/Kokkos_GraphNode.hpp index 2a4e2cf6414..a0a60c07d09 100644 --- a/lib/kokkos/core/src/Kokkos_GraphNode.hpp +++ b/lib/kokkos/core/src/Kokkos_GraphNode.hpp @@ -48,7 +48,7 @@ class GraphNodeRef { // intended to be SFINAE-safe, so do validation before you instantiate. static_assert( - std::is_same::value || + std::is_same_v || Kokkos::Impl::is_specialization_of::value, "Invalid predecessor template parameter given to GraphNodeRef"); @@ -56,7 +56,7 @@ class GraphNodeRef { Kokkos::is_execution_space::value, "Invalid execution space template parameter given to GraphNodeRef"); - static_assert(std::is_same::value || + static_assert(std::is_same_v || Kokkos::Impl::is_graph_kernel::value, "Invalid kernel template parameter given to GraphNodeRef"); @@ -151,7 +151,7 @@ class GraphNodeRef { typename return_t::node_impl_t>( m_node_impl->execution_space_instance(), Kokkos::Impl::_graph_node_kernel_ctor_tag{}, - (NextKernelDeduced &&) arg_kernel, + (NextKernelDeduced&&)arg_kernel, // *this is the predecessor Kokkos::Impl::_graph_node_predecessor_ctor_tag{}, *this)); @@ -184,10 +184,10 @@ class GraphNodeRef { // {{{3 // Copyable and movable (basically just shared_ptr semantics - GraphNodeRef() noexcept = default; - GraphNodeRef(GraphNodeRef const&) = default; - GraphNodeRef(GraphNodeRef&&) noexcept = default; - GraphNodeRef& operator=(GraphNodeRef const&) = default; + GraphNodeRef() noexcept = default; + GraphNodeRef(GraphNodeRef const&) = default; + GraphNodeRef(GraphNodeRef&&) 
noexcept = default; + GraphNodeRef& operator=(GraphNodeRef const&) = default; GraphNodeRef& operator=(GraphNodeRef&&) noexcept = default; ~GraphNodeRef() = default; @@ -197,19 +197,19 @@ class GraphNodeRef { //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // {{{3 - template < - class OtherKernel, class OtherPredecessor, - std::enable_if_t< - // Not a copy/move constructor - !std::is_same>::value && - // must be an allowed type erasure of the kernel - Kokkos::Impl::is_compatible_type_erasure::value && - // must be an allowed type erasure of the predecessor - Kokkos::Impl::is_compatible_type_erasure< - OtherPredecessor, graph_predecessor>::value, - int> = 0> + template > && + // must be an allowed type erasure of the kernel + Kokkos::Impl::is_compatible_type_erasure< + OtherKernel, graph_kernel>::value && + // must be an allowed type erasure of the predecessor + Kokkos::Impl::is_compatible_type_erasure< + OtherPredecessor, graph_predecessor>::value, + int> = 0> /* implicit */ GraphNodeRef( GraphNodeRef const& other) @@ -257,7 +257,7 @@ class GraphNodeRef { //|| policy_t::execution_space_is_defaulted, "Execution Space mismatch between execution policy and graph"); - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using next_policy_t = decltype(policy); @@ -266,8 +266,8 @@ class GraphNodeRef { std::decay_t, Kokkos::ParallelForTag>; return this->_then_kernel(next_kernel_t{std::move(arg_name), policy.space(), - (Functor &&) functor, - (Policy &&) policy}); + (Functor&&)functor, + (Policy&&)policy}); } template < @@ -280,8 +280,7 @@ class GraphNodeRef { int> = 0> auto then_parallel_for(Policy&& policy, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", (Policy &&) policy, - (Functor &&) functor); + return this->then_parallel_for("", (Policy&&)policy, 
(Functor&&)functor); } template @@ -290,13 +289,13 @@ class GraphNodeRef { // needs to static assert constraint: DataParallelFunctor return this->then_parallel_for(std::move(name), Kokkos::RangePolicy(0, n), - (Functor &&) functor); + (Functor&&)functor); } template auto then_parallel_for(std::size_t n, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", n, (Functor &&) functor); + return this->then_parallel_for("", n, (Functor&&)functor); } // end then_parallel_for }}}2 @@ -359,6 +358,23 @@ class GraphNodeRef { Kokkos::is_reducer::value, "Output argument to parallel reduce in a graph must be a " "View or a Reducer"); + + if constexpr (Kokkos::is_reducer_v) { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename return_type_remove_cvref:: + result_view_type::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } else { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, + typename return_type_remove_cvref::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } + using return_type = // Yes, you do really have to do this... 
std::conditional_t::value, @@ -373,7 +389,7 @@ class GraphNodeRef { // End of Kokkos reducer disaster //---------------------------------------- - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using passed_reducer_type = typename return_value_adapter::reducer_type; @@ -399,7 +415,7 @@ class GraphNodeRef { return this->_then_kernel(next_kernel_t{ std::move(arg_name), graph_impl_ptr->get_execution_space(), - functor_reducer, (Policy &&) policy, + functor_reducer, (Policy&&)policy, return_value_adapter::return_value(return_value, functor)}); } @@ -413,9 +429,9 @@ class GraphNodeRef { int> = 0> auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", (Policy &&) arg_policy, - (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", (Policy&&)arg_policy, + (Functor&&)functor, + (ReturnType&&)return_value); } template @@ -425,15 +441,15 @@ class GraphNodeRef { ReturnType&& return_value) const { return this->then_parallel_reduce( std::move(label), Kokkos::RangePolicy{0, idx_end}, - (Functor &&) functor, (ReturnType &&) return_value); + (Functor&&)functor, (ReturnType&&)return_value); } template auto then_parallel_reduce(typename execution_space::size_type idx_end, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", idx_end, (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", idx_end, (Functor&&)functor, + (ReturnType&&)return_value); } // end then_parallel_reduce }}}2 diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index 8b5f29f95b2..706586826f4 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -63,10 +63,10 @@ class HostSpace { //! 
This memory space preferred device_type using device_type = Kokkos::Device; - HostSpace() = default; - HostSpace(HostSpace&& rhs) = default; - HostSpace(const HostSpace& rhs) = default; - HostSpace& operator=(HostSpace&&) = default; + HostSpace() = default; + HostSpace(HostSpace&& rhs) = default; + HostSpace(const HostSpace& rhs) = default; + HostSpace& operator=(HostSpace&&) = default; HostSpace& operator=(const HostSpace&) = default; ~HostSpace() = default; @@ -183,18 +183,6 @@ namespace Kokkos { namespace Impl { -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy_async(exec, dst, src, n); - } -}; - template struct DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { @@ -202,10 +190,15 @@ struct DeepCopy { } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); + if constexpr (!Kokkos::SpaceAccessibility::accessible) { + exec.fence( + "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); + hostspace_parallel_deepcopy_async(dst, src, n); + } else { + hostspace_parallel_deepcopy_async(exec, dst, src, n); + } } }; diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp index 37b80e54a85..a760e7054a1 100644 --- a/lib/kokkos/core/src/Kokkos_Layout.hpp +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -52,13 +52,17 @@ struct LayoutLeft { using array_layout = LayoutLeft; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - 
LayoutLeft(LayoutLeft const&) = default; - LayoutLeft(LayoutLeft&&) = default; + LayoutLeft(LayoutLeft const&) = default; + LayoutLeft(LayoutLeft&&) = default; LayoutLeft& operator=(LayoutLeft const&) = default; - LayoutLeft& operator=(LayoutLeft&&) = default; + LayoutLeft& operator=(LayoutLeft&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutLeft(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -69,7 +73,8 @@ struct LayoutLeft { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride(KOKKOS_IMPL_CTOR_DEFAULT_ARG) {} friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -101,13 +106,17 @@ struct LayoutRight { using array_layout = LayoutRight; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutRight(LayoutRight const&) = default; - LayoutRight(LayoutRight&&) = default; + LayoutRight(LayoutRight const&) = default; + LayoutRight(LayoutRight&&) = default; LayoutRight& operator=(LayoutRight const&) = default; - LayoutRight& operator=(LayoutRight&&) = default; + LayoutRight& operator=(LayoutRight&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutRight(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -118,7 +127,8 @@ struct LayoutRight { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{KOKKOS_IMPL_CTOR_DEFAULT_ARG} {} friend bool operator==(const 
LayoutRight& left, const LayoutRight& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -144,10 +154,10 @@ struct LayoutStride { enum : bool { is_extent_constructible = false }; - LayoutStride(LayoutStride const&) = default; - LayoutStride(LayoutStride&&) = default; + LayoutStride(LayoutStride const&) = default; + LayoutStride(LayoutStride&&) = default; LayoutStride& operator=(LayoutStride const&) = default; - LayoutStride& operator=(LayoutStride&&) = default; + LayoutStride& operator=(LayoutStride&&) = default; /** \brief Compute strides from ordered dimensions. * @@ -191,8 +201,8 @@ struct LayoutStride { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S5 = 0, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S6 = 0, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S7 = 0) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3, - S4, S5, S6, S7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{S0, S1, S2, S3, S4, S5, S6, S7} {} friend bool operator==(const LayoutStride& left, const LayoutStride& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 0a0acd303f5..97b78a3c648 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -27,7 +27,7 @@ * KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget * execution space * KOKKOS_ENABLE_HIP Kokkos::HIP execution space - * KOKKOS_ENABLE_SYCL Kokkos::Experimental::SYCL execution space + * KOKKOS_ENABLE_SYCL Kokkos::SYCL execution space * KOKKOS_ENABLE_HWLOC HWLOC library is available. * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive! * KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space. @@ -132,7 +132,7 @@ #define KOKKOS_CLASS_LAMBDA [ =, *this ] #endif -//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. 
+// #if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. // Intel compiler for host code. @@ -252,10 +252,10 @@ // CLANG compiler macros #if defined(KOKKOS_COMPILER_CLANG) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -273,10 +273,10 @@ // GNU Compiler macros #if defined(KOKKOS_COMPILER_GNU) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -298,7 +298,7 @@ #if defined(KOKKOS_COMPILER_NVHPC) #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #endif @@ -357,6 +357,21 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif +// FIXME_OPENACC FIXME_OPENMPTARGET +// Move to setup files once there is more content +// clang-format off +#if defined(KOKKOS_ENABLE_OPENACC) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenACC backend" +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenMPTarget backend" +#endif +// clang-format on + +#if !defined(KOKKOS_IMPL_RELOCATABLE_FUNCTION) +#define 
KOKKOS_IMPL_RELOCATABLE_FUNCTION +#endif + //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily @@ -369,10 +384,14 @@ #define KOKKOS_FORCEINLINE_FUNCTION \ KOKKOS_IMPL_FORCEINLINE_FUNCTION \ __attribute__((annotate("KOKKOS_FORCEINLINE_FUNCTION"))) +#define KOKKOS_RELOCATABLE_FUNCTION \ + KOKKOS_IMPL_RELOCATABLE_FUNCTION \ + __attribute__((annotate("KOKKOS_RELOCATABLE_FUNCTION"))) #else #define KOKKOS_FUNCTION KOKKOS_IMPL_FUNCTION #define KOKKOS_INLINE_FUNCTION KOKKOS_IMPL_INLINE_FUNCTION #define KOKKOS_FORCEINLINE_FUNCTION KOKKOS_IMPL_FORCEINLINE_FUNCTION +#define KOKKOS_RELOCATABLE_FUNCTION KOKKOS_IMPL_RELOCATABLE_FUNCTION #endif //---------------------------------------------------------------------------- @@ -537,14 +556,17 @@ static constexpr bool kokkos_omp_on_host() { return false; } // If compiling with CUDA, we must use relocatable device code to enable the // task policy. 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #if defined(KOKKOS_ENABLE_CUDA) #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) #define KOKKOS_ENABLE_TASKDAG #endif // FIXME_SYCL Tasks not implemented -#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) +#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOS_ENABLE_TASKDAG #endif +#endif #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC @@ -582,9 +604,11 @@ static constexpr bool kokkos_omp_on_host() { return false; } // clang-format off #if defined(__NVCOMPILER) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ - _Pragma("diag_suppress 1216") + _Pragma("diag_suppress 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ - _Pragma("diag_default 1216") + _Pragma("diag_default 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #elif defined(__EDG__) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ _Pragma("warning push") \ @@ -607,6 +631,18 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif + +#if defined(__NVCOMPILER) +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() \ + _Pragma("diag_suppress code_is_unreachable") \ + _Pragma("diag_suppress initialization_not_reachable") +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() \ + _Pragma("diag_default code_is_unreachable") \ + _Pragma("diag_default initialization_not_reachable") +#else +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() +#endif // clang-format on #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp 
index ce8c9e152fa..f7e9e2a78c4 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -196,9 +196,10 @@ class MemoryPool { stats.consumed_superblocks++; stats.consumed_blocks += block_used; - stats.consumed_bytes += block_used * block_size; + stats.consumed_bytes += static_cast(block_used) * block_size; stats.reserved_blocks += block_count - block_used; - stats.reserved_bytes += (block_count - block_used) * block_size; + stats.reserved_bytes += + static_cast(block_count - block_used) * block_size; } } @@ -234,9 +235,9 @@ class MemoryPool { //-------------------------------------------------------------------------- - KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default; KOKKOS_INLINE_FUNCTION MemoryPool() diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp index 118bf52c05f..1304d3ba926 100644 --- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -114,7 +114,7 @@ template <> struct signaling_NaN_helper { static constexpr long dou #endif template struct digits_helper {}; template <> struct digits_helper { static constexpr int value = 1; }; -template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed::value; }; +template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed_v; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT - 1; }; template <> struct digits_helper { static constexpr int value = 
CHAR_BIT; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT*sizeof(short)-1; }; diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp index e569fefc14d..c44d1f23109 100644 --- a/lib/kokkos/core/src/Kokkos_Pair.hpp +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -449,7 +449,8 @@ struct KOKKOS_DEPRECATED pair { // Specialization of relational operators for Kokkos::pair. // -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #endif template @@ -487,7 +488,8 @@ KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif #endif diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp index 122239df790..24349e95aea 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -72,19 +72,19 @@ struct FunctorPolicyExecutionSpace { static_assert( !is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with an execution space " "are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with a device " "type are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A functor with both an execution space and device type is " "given but their 
execution space types do not match!"); @@ -134,8 +134,10 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy, const FunctorType& functor) { uint64_t kpID = 0; - ExecPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID); + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< @@ -348,9 +350,11 @@ template ::value>> inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, const FunctorType& functor) { - uint64_t kpID = 0; - ExecutionPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); + uint64_t kpID = 0; + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index 53913266f13..3b89d184f2a 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -73,7 +73,7 @@ struct Sum { template KOKKOS_DEDUCTION_GUIDE Sum(View const&) - ->Sum::memory_space>; + -> Sum::memory_space>; template struct Prod { @@ -118,7 +118,7 @@ struct Prod { template KOKKOS_DEDUCTION_GUIDE Prod(View const&) - ->Prod::memory_space>; + -> Prod::memory_space>; template struct Min { @@ -165,7 +165,7 @@ struct Min { template KOKKOS_DEDUCTION_GUIDE Min(View const&) - ->Min::memory_space>; + -> Min::memory_space>; template struct Max { @@ -213,7 +213,7 @@ struct Max { template KOKKOS_DEDUCTION_GUIDE Max(View const&) - ->Max::memory_space>; + -> 
Max::memory_space>; template struct LAnd { @@ -259,7 +259,7 @@ struct LAnd { template KOKKOS_DEDUCTION_GUIDE LAnd(View const&) - ->LAnd::memory_space>; + -> LAnd::memory_space>; template struct LOr { @@ -306,7 +306,7 @@ struct LOr { template KOKKOS_DEDUCTION_GUIDE LOr(View const&) - ->LOr::memory_space>; + -> LOr::memory_space>; template struct BAnd { @@ -353,7 +353,7 @@ struct BAnd { template KOKKOS_DEDUCTION_GUIDE BAnd(View const&) - ->BAnd::memory_space>; + -> BAnd::memory_space>; template struct BOr { @@ -400,7 +400,7 @@ struct BOr { template KOKKOS_DEDUCTION_GUIDE BOr(View const&) - ->BOr::memory_space>; + -> BOr::memory_space>; template struct ValLocScalar { @@ -438,7 +438,12 @@ struct MinLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -458,11 +463,10 @@ struct MinLoc { }; template -KOKKOS_DEDUCTION_GUIDE MinLoc( - View, Properties...> const&) - ->MinLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MinLoc(View, Properties...> const&) -> MinLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MaxLoc { @@ -494,7 +498,12 @@ struct MaxLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -514,11 +523,10 @@ struct MaxLoc { }; template -KOKKOS_DEDUCTION_GUIDE MaxLoc( - View, Properties...> const&) - ->MaxLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MaxLoc(View, Properties...> const&) -> MaxLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MinMaxScalar { @@ 
-580,8 +588,8 @@ struct MinMax { template KOKKOS_DEDUCTION_GUIDE MinMax(View, Properties...> const&) - ->MinMax, Properties...>::memory_space>; + -> MinMax, Properties...>::memory_space>; template struct MinMaxLocScalar { @@ -622,10 +630,16 @@ struct MinMaxLoc { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { + dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -650,9 +664,9 @@ struct MinMaxLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxLoc( View, Properties...> const&) - ->MinMaxLoc, - Properties...>::memory_space>; + -> MinMaxLoc, + Properties...>::memory_space>; // -------------------------------------------------- // reducers added to support std algorithms @@ -718,9 +732,9 @@ struct MaxFirstLoc { template KOKKOS_DEDUCTION_GUIDE MaxFirstLoc( View, Properties...> const&) - ->MaxFirstLoc, - Properties...>::memory_space>; + -> MaxFirstLoc, + Properties...>::memory_space>; // // MaxFirstLocCustomComparator @@ -788,9 +802,9 @@ template KOKKOS_DEDUCTION_GUIDE MaxFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MaxFirstLocCustomComparator, - Properties...>::memory_space>; + -> MaxFirstLocCustomComparator, + Properties...>::memory_space>; // // MinFirstLoc @@ -852,9 +866,9 @@ struct MinFirstLoc { template KOKKOS_DEDUCTION_GUIDE MinFirstLoc( View, Properties...> const&) - ->MinFirstLoc, - Properties...>::memory_space>; + -> MinFirstLoc, + Properties...>::memory_space>; // // MinFirstLocCustomComparator @@ -922,9 +936,9 @@ template KOKKOS_DEDUCTION_GUIDE MinFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinFirstLocCustomComparator, - Properties...>::memory_space>; + -> MinFirstLocCustomComparator, 
+ Properties...>::memory_space>; // // MinMaxFirstLastLoc @@ -997,9 +1011,9 @@ struct MinMaxFirstLastLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLoc( View, Properties...> const&) - ->MinMaxFirstLastLoc, - Properties...>::memory_space>; + -> MinMaxFirstLastLoc, + Properties...>::memory_space>; // // MinMaxFirstLastLocCustomComparator @@ -1077,7 +1091,7 @@ template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinMaxFirstLastLocCustomComparator< + -> MinMaxFirstLastLocCustomComparator< Scalar, Index, ComparatorType, typename View, Properties...>::memory_space>; @@ -1139,10 +1153,9 @@ struct FirstLoc { }; template -KOKKOS_DEDUCTION_GUIDE FirstLoc( - View, Properties...> const&) - ->FirstLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +FirstLoc(View, Properties...> const&) -> FirstLoc< + Index, typename View, Properties...>::memory_space>; // // LastLoc @@ -1202,8 +1215,8 @@ struct LastLoc { template KOKKOS_DEDUCTION_GUIDE LastLoc(View, Properties...> const&) - ->LastLoc, Properties...>::memory_space>; + -> LastLoc, + Properties...>::memory_space>; template struct StdIsPartScalar { @@ -1270,8 +1283,8 @@ struct StdIsPartitioned { template KOKKOS_DEDUCTION_GUIDE StdIsPartitioned( View, Properties...> const&) - ->StdIsPartitioned, - Properties...>::memory_space>; + -> StdIsPartitioned, + Properties...>::memory_space>; template struct StdPartPointScalar { @@ -1333,8 +1346,8 @@ struct StdPartitionPoint { template KOKKOS_DEDUCTION_GUIDE StdPartitionPoint( View, Properties...> const&) - ->StdPartitionPoint, - Properties...>::memory_space>; + -> StdPartitionPoint, + Properties...>::memory_space>; } // namespace Kokkos namespace Kokkos { @@ -1404,9 +1417,9 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< std::enable_if_t::value && - (!std::is_array::value && - !std::is_pointer::value) && - !Kokkos::is_reducer::value>, + (!std::is_array_v && + !std::is_pointer_v< + 
ReturnType>)&&!Kokkos::is_reducer::value>, ReturnType, FunctorType> { using return_type = Kokkos::View; @@ -1422,8 +1435,8 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< - std::enable_if_t<(std::is_array::value || - std::is_pointer::value)>, + std::enable_if_t<(std::is_array_v || + std::is_pointer_v)>, ReturnType, FunctorType> { using return_type = Kokkos::View, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; @@ -1434,7 +1447,7 @@ struct ParallelReduceReturnValue< static return_type return_value(ReturnType& return_val, const FunctorType& functor) { - if (std::is_array::value) + if (std::is_array_v) return return_type(return_val); else return return_type(return_val, functor.value_count); @@ -1467,8 +1480,7 @@ struct ParallelReducePolicyType< template struct ParallelReducePolicyType< - std::enable_if_t::value>, PolicyType, - FunctorType> { + std::enable_if_t>, PolicyType, FunctorType> { using execution_space = typename Impl::FunctorPolicyExecutionSpace::execution_space; @@ -1501,27 +1513,28 @@ struct ParallelReduceAdaptor { using PassedReducerType = typename return_value_adapter::reducer_type; uint64_t kpID = 0; - PolicyType inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_reduce( - inner_policy, functor, label, kpID); - using ReducerSelector = - Kokkos::Impl::if_c::value, + Kokkos::Impl::if_c, FunctorType, PassedReducerType>; using Analysis = FunctorAnalysis; - using CombinedFunctorReducerType = CombinedFunctorReducer; + + CombinedFunctorReducerType functor_reducer( + functor, typename Analysis::Reducer( + ReducerSelector::select(functor, return_value))); + const auto& response = Kokkos::Tools::Impl::begin_parallel_reduce< + typename return_value_adapter::reducer_type>(policy, functor_reducer, + label, kpID); + const auto& inner_policy = response.policy; + auto closure = construct_with_shared_allocation_tracking_disabled< Impl::ParallelReduce::execution_space>>( - CombinedFunctorReducerType( - functor, typename 
Analysis::Reducer( - ReducerSelector::select(functor, return_value))), - inner_policy, + functor_reducer, inner_policy, return_value_adapter::return_value(return_value, functor)); closure.execute(); @@ -1536,7 +1549,7 @@ struct ParallelReduceAdaptor { template static inline std::enable_if_t::value)> + std::is_pointer_v)> execute(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { execute_impl(label, policy, functor, return_value); @@ -1568,7 +1581,7 @@ struct ReducerHasTestReferenceFunction { static std::false_type test_func(...); enum { - value = std::is_same(nullptr))>::value + value = std::is_same_v(nullptr))> }; }; @@ -1611,7 +1624,7 @@ struct ParallelReduceFence { template static void fence(const ExecutionSpace& ex, const std::string& name, ArgsDeduced&&... args) { - if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) { + if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced&&)args...)) { ex.fence(name); } } @@ -1663,11 +1676,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1684,11 +1697,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1704,11 +1717,11 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline 
std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1728,11 +1741,11 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1754,7 +1767,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1771,7 +1784,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1787,7 +1800,7 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = @@ -1806,7 +1819,7 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || 
- std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = diff --git a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp index e7a9ba0c7ed..1759c2b4a1c 100644 --- a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -32,7 +32,7 @@ class [[nodiscard]] ProfilingSection { uint32_t sectionID; public: - ProfilingSection(ProfilingSection const&) = delete; + ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp b/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp index f45dfa324e9..a4168b9401f 100644 --- a/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp +++ b/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp @@ -30,7 +30,7 @@ namespace Kokkos::Profiling { class [[nodiscard]] ScopedRegion { public: - ScopedRegion(ScopedRegion const &) = delete; + ScopedRegion(ScopedRegion const &) = delete; ScopedRegion &operator=(ScopedRegion const &) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp index a925e32a339..f00e25fdb62 100644 --- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -110,7 +110,7 @@ class ScratchMemorySpace { // Note: for team scratch m_offset is 0, since every // thread will get back the same shared pointer void* tmp = m_iter + m_offset * size; - uintptr_t increment = size * m_multiplier; + uintptr_t increment = static_cast(size) * m_multiplier; // Cast to uintptr_t to avoid problems with pointer 
arithmetic using SYCL const auto end_iter = diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp index 869a5f8ec26..3edecb4502a 100644 --- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_TASKSCHEDULER_HPP #define KOKKOS_TASKSCHEDULER_HPP @@ -44,6 +50,11 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -54,7 +65,7 @@ class TaskExec; } // end namespace Impl template -class BasicTaskScheduler : public Impl::TaskSchedulerBase { +class KOKKOS_DEPRECATED BasicTaskScheduler : public Impl::TaskSchedulerBase { public: using scheduler_type = BasicTaskScheduler; using execution_space = ExecSpace; @@ -494,8 +505,8 @@ namespace Kokkos { // Construct a TaskTeam execution policy template -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskTeam, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -503,7 +514,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskTeam( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -512,18 +524,18 @@ Impl::TaskPolicyWithScheduler } template 
-Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskTeam, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -531,8 +543,8 @@ Impl::TaskPolicyWithScheduler -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskSingle, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -540,7 +552,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskSingle( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -549,18 +562,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskSingle, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -575,7 +588,8 @@ 
Impl::TaskPolicyWithScheduler -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> host_spawn(Impl::TaskPolicyWithScheduler arg_policy, FunctorType&& arg_functor) { @@ -606,7 +620,8 @@ host_spawn(Impl::TaskPolicyWithScheduler */ template -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> KOKKOS_INLINE_FUNCTION task_spawn(Impl::TaskPolicyWithScheduler arg_policy, @@ -633,7 +648,7 @@ typename Scheduler::template future_type_for_functor> * 2) High, Normal, or Low priority */ template -void KOKKOS_INLINE_FUNCTION +KOKKOS_DEPRECATED void KOKKOS_INLINE_FUNCTION respawn(FunctorType* arg_self, T const& arg, TaskPriority const& arg_priority = TaskPriority::Regular) { static_assert(Kokkos::is_future::value || Kokkos::is_scheduler::value, @@ -656,7 +671,8 @@ respawn(FunctorType* arg_self, T const& arg, // Wait for all runnable tasks to complete template -inline void wait(BasicTaskScheduler const& scheduler) { +KOKKOS_DEPRECATED inline void wait( + BasicTaskScheduler const& scheduler) { using scheduler_type = BasicTaskScheduler; scheduler_type::specialization::execute(scheduler); // scheduler.m_queue->execute(); @@ -664,6 +680,10 @@ inline void wait(BasicTaskScheduler const& scheduler) { } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp index 203fb16eaf0..83e1c06db9b 100644 --- a/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp @@ -31,31 +31,40 @@ static_assert(false, #include 
//---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // Forward declarations used in Impl::TaskQueue template -class BasicFuture; +class KOKKOS_DEPRECATED BasicFuture; template -class SimpleTaskScheduler; +class KOKKOS_DEPRECATED SimpleTaskScheduler; template -class BasicTaskScheduler; +class KOKKOS_DEPRECATED BasicTaskScheduler; template -struct is_scheduler : public std::false_type {}; +struct KOKKOS_DEPRECATED is_scheduler : public std::false_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; -enum class TaskPriority : int { High = 0, Regular = 1, Low = 2 }; +enum class KOKKOS_DEPRECATED TaskPriority : int { + High = 0, + Regular = 1, + Low = 2 +}; } // namespace Kokkos @@ -141,28 +150,28 @@ using default_tasking_memory_space_for_execution_space_t = namespace Kokkos { template -using DeprecatedTaskScheduler = BasicTaskScheduler< +using DeprecatedTaskScheduler KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using DeprecatedTaskSchedulerMultiple = BasicTaskScheduler< +using DeprecatedTaskSchedulerMultiple KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueueMultiple< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using TaskScheduler = SimpleTaskScheduler< +using TaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::SingleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, Impl::TaskQueueTraitsLockBased>>; template -using TaskSchedulerMultiple = SimpleTaskScheduler< +using 
TaskSchedulerMultiple KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -172,7 +181,7 @@ using TaskSchedulerMultiple = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -using ChaseLevTaskScheduler = SimpleTaskScheduler< +using ChaseLevTaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -182,7 +191,7 @@ using ChaseLevTaskScheduler = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -void wait(BasicTaskScheduler const&); +KOKKOS_DEPRECATED void wait(BasicTaskScheduler const&); namespace Impl { @@ -204,6 +213,10 @@ struct TaskPolicyData; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ diff --git a/lib/kokkos/core/src/Kokkos_Timer.hpp b/lib/kokkos/core/src/Kokkos_Timer.hpp index a210b6ff183..ab31484d76a 100644 --- a/lib/kokkos/core/src/Kokkos_Timer.hpp +++ b/lib/kokkos/core/src/Kokkos_Timer.hpp @@ -48,7 +48,7 @@ class Timer { inline Timer() { reset(); } - Timer(const Timer&) = delete; + Timer(const Timer&) = delete; Timer& operator=(const Timer&) = delete; inline double seconds() const { diff --git a/lib/kokkos/core/src/Kokkos_Tuners.hpp b/lib/kokkos/core/src/Kokkos_Tuners.hpp index f5ffc66af5b..fcb061b378f 100644 --- a/lib/kokkos/core/src/Kokkos_Tuners.hpp +++ b/lib/kokkos/core/src/Kokkos_Tuners.hpp @@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t); VariableValue make_variable_value(size_t, double); SetOrRange make_candidate_range(double lower, double upper, double step, bool openLower, bool openUpper); +SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t 
step, + bool openLower, bool openUpper); size_t get_new_context_id(); void begin_context(size_t context_id); void end_context(size_t context_id); @@ -412,18 +414,19 @@ class TeamSizeTuner : public ExtendableTunerMixin { TunerType tuner; public: - TeamSizeTuner() = default; + TeamSizeTuner() = default; TeamSizeTuner& operator=(const TeamSizeTuner& other) = default; TeamSizeTuner(const TeamSizeTuner& other) = default; - TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; - TeamSizeTuner(TeamSizeTuner&& other) = default; + TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; + TeamSizeTuner(TeamSizeTuner&& other) = default; template TeamSizeTuner(const std::string& name, - Kokkos::TeamPolicy& policy, + const Kokkos::TeamPolicy& policy_in, const Functor& functor, const TagType& tag, ViableConfigurationCalculator calc) { - using PolicyType = Kokkos::TeamPolicy; + using PolicyType = Kokkos::TeamPolicy; + PolicyType policy(policy_in); auto initial_vector_length = policy.impl_vector_length(); if (initial_vector_length < 1) { policy.impl_set_vector_length(1); @@ -505,7 +508,8 @@ class TeamSizeTuner : public ExtendableTunerMixin { } template - void tune(Kokkos::TeamPolicy& policy) { + auto tune(const Kokkos::TeamPolicy& policy_in) { + Kokkos::TeamPolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); auto team_size = std::get<1>(configuration); @@ -515,6 +519,111 @@ class TeamSizeTuner : public ExtendableTunerMixin { policy.impl_set_vector_length(vector_length); } } + return policy; + } + void end() { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + tuner.end(); + } + } + + TunerType get_tuner() const { return tuner; } +}; +namespace Impl { +template +struct tuning_type_for; + +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_double; + static double get( + const 
Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.double_value; + } +}; +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + static int64_t get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.int_value; + } +}; +} // namespace Impl +template +class SingleDimensionalRangeTuner { + size_t id; + size_t context; + using tuning_util = Impl::tuning_type_for; + + Bound default_value; + + public: + SingleDimensionalRangeTuner() = default; + SingleDimensionalRangeTuner( + const std::string& name, + Kokkos::Tools::Experimental::StatisticalCategory category, + Bound default_val, Bound lower, Bound upper, Bound step = (Bound)0) { + default_value = default_val; + Kokkos::Tools::Experimental::VariableInfo info; + info.category = category; + info.candidates = make_candidate_range( + static_cast(lower), static_cast(upper), + static_cast(step), false, false); + info.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range; + info.type = tuning_util::value; + id = Kokkos::Tools::Experimental::declare_output_type(name, info); + } + + Bound begin() { + context = Kokkos::Tools::Experimental::get_new_context_id(); + Kokkos::Tools::Experimental::begin_context(context); + auto tuned_value = + Kokkos::Tools::Experimental::make_variable_value(id, default_value); + Kokkos::Tools::Experimental::request_output_values(context, 1, + &tuned_value); + return tuning_util::get(tuned_value); + } + + void end() { Kokkos::Tools::Experimental::end_context(context); } + + template + void with_tuned_value(Functor& func) { + func(begin()); + end(); + } +}; + +class RangePolicyOccupancyTuner { + private: + using TunerType = SingleDimensionalRangeTuner; + TunerType tuner; + + public: + RangePolicyOccupancyTuner() = default; + template + RangePolicyOccupancyTuner(const std::string& 
name, + const Kokkos::RangePolicy&, + const Functor&, const TagType&, + ViableConfigurationCalculator) + : tuner(TunerType(name, + Kokkos::Tools::Experimental::StatisticalCategory:: + kokkos_value_ratio, + 100, 5, 100, 5)) {} + + template + auto tune(const Kokkos::RangePolicy& policy_in) { + Kokkos::RangePolicy policy(policy_in); + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + auto occupancy = tuner.begin(); + policy.impl_set_desired_occupancy( + Kokkos::Experimental::DesiredOccupancy{static_cast(occupancy)}); + } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { @@ -578,11 +687,13 @@ struct MDRangeTuner : public ExtendableTunerMixin> { policy.impl_change_tile_size({std::get(tuple)...}); } template - void tune(Kokkos::MDRangePolicy& policy) { + auto tune(const Kokkos::MDRangePolicy& policy_in) { + Kokkos::MDRangePolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); set_policy_tile(policy, configuration, std::make_index_sequence{}); } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { diff --git a/lib/kokkos/core/src/Kokkos_TypeInfo.hpp b/lib/kokkos/core/src/Kokkos_TypeInfo.hpp new file mode 100644 index 00000000000..e5710da2e3d --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_TypeInfo.hpp @@ -0,0 +1,103 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TYPE_INFO_HPP +#define KOKKOS_TYPE_INFO_HPP + +#include +#include +#include + +#include + +// Intel C++ Compiler Classic version 2021.2.0 works but 2021.1.2 doesn't +// Both have __INTEL_COMPILER defined to 2021 so using +// __INTEL_COMPILER_BUILD_DATE to discriminate. +// Experimenting on the compiler explorer gave +// icc version | __INTEL_COMPILER | __INTEL_COMPILER_BUILD_DATE +// 2021.1.2 | 2021 | 20201208 +// 2021.2.0 | 2021 | 20210228 +// NVCC versions less than 11.3.0 segfault when that header is included +// NVCC+MSVC doesn't work at all - it simply reports "T" inside type_name +#if (!defined(KOKKOS_COMPILER_INTEL) || \ + (__INTEL_COMPILER_BUILD_DATE >= 20210228)) && \ + (!defined(KOKKOS_COMPILER_NVCC) || (KOKKOS_COMPILER_NVCC >= 1130)) && \ + (!(defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_COMPILER_MSVC))) + +#define KOKKOS_ENABLE_IMPL_TYPEINFO + +namespace Kokkos::Impl { + +template +constexpr std::array to_array(std::string_view src) { + std::array dst{}; + for (size_t i = 0; i < N; ++i) { + dst[i] = src[i]; + } + return dst; +} + +template +constexpr auto type_name() { +#if defined(__clang__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(__GNUC__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[with T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(_MSC_VER) + constexpr std::string_view func = __FUNCSIG__; + constexpr std::string_view prefix{"type_name<"}; + constexpr std::string_view suffix{">(void)"}; +#else +#error bug +#endif + constexpr auto beg = func.find(prefix) + prefix.size(); + constexpr auto end = func.rfind(suffix); + static_assert(beg != std::string_view::npos); + static_assert(end != std::string_view::npos); + return to_array(func.substr(beg, end)); +} + +template +class 
TypeInfo { + static constexpr auto value_ = type_name(); + + public: + static constexpr std::string_view name() noexcept { + return {value_.data(), value_.size()}; + } +}; + +} // namespace Kokkos::Impl + +#else // out of luck, using Intel C++ Compiler Classic + +namespace Kokkos::Impl { + +template +class TypeInfo { + public: + static constexpr std::string_view name() noexcept { return "not supported"; } +}; + +} // namespace Kokkos::Impl + +#endif + +#endif diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 04d1fcf1518..d5b352876c3 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -22,2016 +22,10 @@ static_assert(false, #ifndef KOKKOS_VIEW_HPP #define KOKKOS_VIEW_HPP -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -#include -#include -#include -#endif -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewArrayAnalysis; - -template ::non_const_value_type> -struct ViewDataAnalysis; - -template -class ViewMapping { - public: - enum : bool { is_assignable_data_type = false }; - enum : bool { is_assignable = false }; -}; - -template -constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( - const IntType i0, const IntType i1, const IntType i2, const IntType i3, - const IntType i4, const IntType i5, const IntType i6, const IntType i7) { - static_assert(std::is_integral::value, - "count_valid_integers() must have integer arguments."); - - return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + - (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + - (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + - (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); -} 
- -// FIXME Ideally, we would not instantiate this function for every possible View -// type. We should be able to only pass "extent" when we use mdspan. -template -KOKKOS_INLINE_FUNCTION void runtime_check_rank( - const View&, const bool is_void_spec, const size_t i0, const size_t i1, - const size_t i2, const size_t i3, const size_t i4, const size_t i5, - const size_t i6, const size_t i7, const char* label) { - (void)(label); - - if (is_void_spec) { - const size_t num_passed_args = - count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); - // We either allow to pass as many extents as the dynamic rank is, or - // as many extents as the total rank is. In the latter case, the given - // extents for the static dimensions must match the - // compile-time extents. - constexpr int rank = View::rank(); - constexpr int dyn_rank = View::rank_dynamic(); - const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; - const bool n_args_is_rank = num_passed_args == rank; - - if constexpr (rank != dyn_rank) { - if (n_args_is_rank) { - size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - for (int i = dyn_rank; i < rank; ++i) - if (new_extents[i] != View::static_extent(i)) { - KOKKOS_IF_ON_HOST( - const std::string message = - "The specified run-time extent for Kokkos::View '" + - std::string(label) + - "' does not match the compile-time extent in dimension " + - std::to_string(i) + ". The given extent is " + - std::to_string(new_extents[i]) + " but should be " + - std::to_string(View::static_extent(i)) + ".\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "The specified run-time extents for a Kokkos::View " - "do not match the compile-time extents.");) - } - } - } - - if (!n_args_is_dyn_rank && !n_args_is_rank) { - KOKKOS_IF_ON_HOST( - const std::string message = - "Constructor for Kokkos::View '" + std::string(label) + - "' has mismatched number of arguments. 
The number " - "of arguments = " + - std::to_string(num_passed_args) + - " neither matches the dynamic rank = " + - std::to_string(dyn_rank) + - " nor the total rank = " + std::to_string(rank) + "\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " - "mismatched number of arguments.");) - } - } -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -// Class to provide a uniform type -namespace Kokkos { -namespace Impl { -template -struct ViewUniformType; -} -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \class ViewTraits - * \brief Traits class for accessing attributes of a View. - * - * This is an implementation detail of View. It is only of interest - * to developers implementing a new specialization of View. - * - * Template argument options: - * - View< DataType > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , ArrayLayout > - * - View< DataType , ArrayLayout , Space > - * - View< DataType , ArrayLayout , MemoryTraits > - * - View< DataType , ArrayLayout , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - */ - -template -struct ViewTraits; - -template <> -struct ViewTraits { - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = void; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - // Ignore an extraneous 'void' - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = 
typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - HooksPolicy, Prop...> { - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = HooksPolicy; -}; - -template -struct ViewTraits::value>, - ArrayLayout, Prop...> { - // Specify layout, keep subsequent space and memory traits arguments - - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = ArrayLayout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits::value>, Space, - Prop...> { - // Specify Space, memory traits should be the only subsequent argument. 
- - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::HostMirrorSpace, - void>::value && - std::is_same::array_layout, - void>::value, - "Only one View Execution or Memory Space template argument"); - - using execution_space = typename Space::execution_space; - using memory_space = typename Space::memory_space; - using HostMirrorSpace = - typename Kokkos::Impl::HostMirror::Space::memory_space; - using array_layout = typename execution_space::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - MemoryTraits, Prop...> { - // Specify memory trait, should not be any subsequent arguments - - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::array_layout, - void>::value && - std::is_same::memory_traits, - void>::value && - std::is_same::hooks_policy, - void>::value, - "MemoryTrait is the final optional template argument for a View"); - - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = MemoryTraits; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - private: - // Unpack the properties arguments - using prop = ViewTraits; - - using ExecutionSpace = - std::conditional_t::value, - typename prop::execution_space, - Kokkos::DefaultExecutionSpace>; - - using MemorySpace = - std::conditional_t::value, - typename prop::memory_space, - typename ExecutionSpace::memory_space>; - - using ArrayLayout = - std::conditional_t::value, - typename prop::array_layout, - typename ExecutionSpace::array_layout>; - - using HostMirrorSpace = std::conditional_t< - !std::is_void::value, - typename 
prop::HostMirrorSpace, - typename Kokkos::Impl::HostMirror::Space>; - - using MemoryTraits = - std::conditional_t::value, - typename prop::memory_traits, - typename Kokkos::MemoryManaged>; - - using HooksPolicy = - std::conditional_t::value, - typename prop::hooks_policy, - Kokkos::Experimental::DefaultViewHooks>; - - // Analyze data type's properties, - // May be specialized based upon the layout and value type - using data_analysis = Kokkos::Impl::ViewDataAnalysis; - - public: - //------------------------------------ - // Data type traits: - - using data_type = typename data_analysis::type; - using const_data_type = typename data_analysis::const_type; - using non_const_data_type = typename data_analysis::non_const_type; - - //------------------------------------ - // Compatible array of trivial type traits: - - using scalar_array_type = typename data_analysis::scalar_array_type; - using const_scalar_array_type = - typename data_analysis::const_scalar_array_type; - using non_const_scalar_array_type = - typename data_analysis::non_const_scalar_array_type; - - //------------------------------------ - // Value type traits: - - using value_type = typename data_analysis::value_type; - using const_value_type = typename data_analysis::const_value_type; - using non_const_value_type = typename data_analysis::non_const_value_type; - - //------------------------------------ - // Mapping traits: - - using array_layout = ArrayLayout; - using dimension = typename data_analysis::dimension; - - using specialize = std::conditional_t< - std::is_void::value, - typename prop::specialize, - typename data_analysis::specialize>; /* mapping specialization tag */ - - static constexpr unsigned rank = dimension::rank; - static constexpr unsigned rank_dynamic = dimension::rank_dynamic; - - //------------------------------------ - // Execution space, memory space, memory access traits, and host mirror space. 
- - using execution_space = ExecutionSpace; - using memory_space = MemorySpace; - using device_type = Kokkos::Device; - using memory_traits = MemoryTraits; - using host_mirror_space = HostMirrorSpace; - using hooks_policy = HooksPolicy; - - using size_type = typename MemorySpace::size_type; - - enum { is_hostspace = std::is_same::value }; - enum { is_managed = MemoryTraits::is_unmanaged == 0 }; - enum { is_random_access = MemoryTraits::is_random_access == 1 }; - - //------------------------------------ -}; - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -namespace Impl { -struct UnsupportedKokkosArrayLayout; - -template -struct MDSpanViewTraits { - using mdspan_type = UnsupportedKokkosArrayLayout; -}; - -// "Natural" mdspan for a view if the View's ArrayLayout is supported. -template -struct MDSpanViewTraits::type>> { - using index_type = std::size_t; - using extents_type = - typename Impl::ExtentsFromDataType::type; - using mdspan_layout_type = - typename Impl::LayoutFromArrayLayout::type; - using accessor_type = Impl::SpaceAwareAccessor< - typename Traits::memory_space, - Kokkos::default_accessor>; - using mdspan_type = mdspan; -}; -} // namespace Impl -#endif // KOKKOS_ENABLE_IMPL_MDSPAN - -/** \class View - * \brief View to an array of data. - * - * A View represents an array of one or more dimensions. - * For details, please refer to Kokkos' tutorial materials. - * - * \section Kokkos_View_TemplateParameters Template parameters - * - * This class has both required and optional template parameters. The - * \c DataType parameter must always be provided, and must always be - * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are - * placeholders for different template parameters. The default value - * of the fifth template parameter \c Specialize suffices for most use - * cases. 
When explaining the template parameters, we won't refer to - * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer - * to the valid categories of template parameters, in whatever order - * they may occur. - * - * Valid ways in which template arguments may be specified: - * - View< DataType > - * - View< DataType , Layout > - * - View< DataType , Layout , Space > - * - View< DataType , Layout , Space , MemoryTraits > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - * - * \tparam DataType (required) This indicates both the type of each - * entry of the array, and the combination of compile-time and - * run-time array dimension(s). For example, double* - * indicates a one-dimensional array of \c double with run-time - * dimension, and int*[3] a two-dimensional array of \c int - * with run-time first dimension and compile-time second dimension - * (of 3). In general, the run-time dimensions (if any) must go - * first, followed by zero or more compile-time dimensions. For - * more examples, please refer to the tutorial materials. - * - * \tparam Space (required) The memory space. - * - * \tparam Layout (optional) The array's layout in memory. For - * example, LayoutLeft indicates a column-major (Fortran style) - * layout, and LayoutRight a row-major (C style) layout. If not - * specified, this defaults to the preferred layout for the - * Space. - * - * \tparam MemoryTraits (optional) Assertion of the user's intended - * access behavior. For example, RandomAccess indicates read-only - * access with limited spatial locality, and Unmanaged lets users - * wrap externally allocated memory in a View without automatic - * deallocation. - * - * \section Kokkos_View_MT MemoryTraits discussion - * - * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on - * Space - * - * Some \c MemoryTraits options may have different interpretations for - * different \c Space types. 
For example, with the Cuda device, - * \c RandomAccess tells Kokkos to fetch the data through the texture - * cache, whereas the non-GPU devices have no such hardware construct. - * - * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits - * - * Users should defer applying the optional \c MemoryTraits parameter - * until the point at which they actually plan to rely on it in a - * computational kernel. This minimizes the number of template - * parameters exposed in their code, which reduces the cost of - * compilation. Users may always assign a View without specified - * \c MemoryTraits to a compatible View with that specification. - * For example: - * \code - * // Pass in the simplest types of View possible. - * void - * doSomething (View out, - * View in) - * { - * // Assign the "generic" View in to a RandomAccess View in_rr. - * // Note that RandomAccess View objects must have const data. - * View in_rr = in; - * // ... do something with in_rr and out ... - * } - * \endcode - */ - -} // namespace Kokkos - -namespace Kokkos { - -template -struct is_always_assignable_impl; - -template -struct is_always_assignable_impl, - Kokkos::View> { - using mapping_type = Kokkos::Impl::ViewMapping< - typename Kokkos::View::traits, - typename Kokkos::View::traits, - typename Kokkos::View::traits::specialize>; - - constexpr static bool value = - mapping_type::is_assignable && - static_cast(Kokkos::View::rank_dynamic) >= - static_cast(Kokkos::View::rank_dynamic); -}; - -template -using is_always_assignable = is_always_assignable_impl< - std::remove_reference_t, - std::remove_const_t>>; - -template -inline constexpr bool is_always_assignable_v = - is_always_assignable::value; - -template -constexpr bool is_assignable(const Kokkos::View& dst, - const Kokkos::View& src) { - using DstTraits = typename Kokkos::View::traits; - using SrcTraits = typename Kokkos::View::traits; - using mapping_type = - Kokkos::Impl::ViewMapping; - - return is_always_assignable_v, - 
Kokkos::View> || - (mapping_type::is_assignable && - ((DstTraits::dimension::rank_dynamic >= 1) || - (dst.static_extent(0) == src.extent(0))) && - ((DstTraits::dimension::rank_dynamic >= 2) || - (dst.static_extent(1) == src.extent(1))) && - ((DstTraits::dimension::rank_dynamic >= 3) || - (dst.static_extent(2) == src.extent(2))) && - ((DstTraits::dimension::rank_dynamic >= 4) || - (dst.static_extent(3) == src.extent(3))) && - ((DstTraits::dimension::rank_dynamic >= 5) || - (dst.static_extent(4) == src.extent(4))) && - ((DstTraits::dimension::rank_dynamic >= 6) || - (dst.static_extent(5) == src.extent(5))) && - ((DstTraits::dimension::rank_dynamic >= 7) || - (dst.static_extent(6) == src.extent(6))) && - ((DstTraits::dimension::rank_dynamic >= 8) || - (dst.static_extent(7) == src.extent(7)))); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with -// the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp declare target +#if defined(KOKKOS_ENABLE_IMPL_MDSPAN) && !defined(KOKKOS_COMPILER_INTEL) +#include #endif -inline constexpr Kokkos::ALL_t ALL{}; - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp end declare target -#endif - -inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; - -inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; - -inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; - -/** \brief Create View allocation parameter bundle from argument list. 
- * - * Valid argument list members are: - * 1) label as a "string" or std::string - * 2) memory space instance of the View::memory_space type - * 3) execution space instance compatible with the View::memory_space - * 4) Kokkos::WithoutInitializing to bypass initialization - * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory - * alignment - */ -template -inline Impl::ViewCtorProp::type...> -view_alloc(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_pointer, - "Cannot give pointer-to-memory for view allocation"); - - return return_type(args...); -} - -template -KOKKOS_INLINE_FUNCTION - Impl::ViewCtorProp::type...> - view_wrap(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_memory_space && - !return_type::has_execution_space && - !return_type::has_label && return_type::has_pointer, - "Must only give pointer-to-memory for view wrapping"); - - return return_type(args...); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -class View; - -template -struct is_view : public std::false_type {}; - -template -struct is_view> : public std::true_type {}; - -template -struct is_view> : public std::true_type {}; - -template -inline constexpr bool is_view_v = is_view::value; - -template -class View : public ViewTraits { - private: - template - friend class View; - template - friend class Kokkos::Impl::ViewMapping; - - using view_tracker_type = Kokkos::Impl::ViewTracker; - - public: - using traits = ViewTraits; - - private: - using map_type = - Kokkos::Impl::ViewMapping; - template - friend struct Kokkos::Impl::ViewTracker; - using hooks_policy = typename traits::hooks_policy; - - view_tracker_type m_track; - map_type m_map; - - public: - 
//---------------------------------------- - /** \brief Compatible view of array of scalar types */ - using array_type = - View; - - /** \brief Compatible view of const data type */ - using const_type = - View; - - /** \brief Compatible view of non-const data type */ - using non_const_type = - View; - - /** \brief Compatible HostMirror view */ - using HostMirror = - View, - typename traits::hooks_policy>; - - /** \brief Compatible HostMirror view */ - using host_mirror_type = - View; - - /** \brief Unified types */ - using uniform_type = typename Impl::ViewUniformType::type; - using uniform_const_type = - typename Impl::ViewUniformType::const_type; - using uniform_runtime_type = - typename Impl::ViewUniformType::runtime_type; - using uniform_runtime_const_type = - typename Impl::ViewUniformType::runtime_const_type; - using uniform_nomemspace_type = - typename Impl::ViewUniformType::nomemspace_type; - using uniform_const_nomemspace_type = - typename Impl::ViewUniformType::const_nomemspace_type; - using uniform_runtime_nomemspace_type = - typename Impl::ViewUniformType::runtime_nomemspace_type; - using uniform_runtime_const_nomemspace_type = - typename Impl::ViewUniformType::runtime_const_nomemspace_type; - - //---------------------------------------- - // Domain rank and extents - - static constexpr Impl::integral_constant - rank = {}; - static constexpr Impl::integral_constant - rank_dynamic = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = - map_type::Rank}; -#endif - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const noexcept { - return m_map.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return map_type::static_extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const 
noexcept { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() - const { - return m_map.layout(); - } - - //---------------------------------------- - /* Deprecate all 'dimension' functions in favor of - * ISO/C++ vocabulary 'extent'. - */ - - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::SharedAllocationTracker& impl_track() const { - return m_track.m_tracker; - } - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); \ - Kokkos::Impl::view_verify_operator_bounds( \ - __VA_ARGS__); - -#else - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); - -#endif - - template - static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) 
{ - static_assert(rank <= sizeof...(Is)); - static_assert(sizeof...(Is) <= 8); - static_assert(Kokkos::Impl::are_integral::value); - } - - template - static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is)); - static_assert(Kokkos::Impl::are_integral::value); - } - - public: - //------------------------------ - // Rank 1 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && !is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return 
m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which - // have "inlined" versions above - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - ((0 == rank) || !is_default_map)), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) 
- return m_map.reference(indices...); - } - - //------------------------------ - // Rank 0 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> - access(Is... extra) const { - check_access_member_function_valid_args(extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) - return m_map.reference(); - } - - //------------------------------ - // Rank 1 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && !is_default_map), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) 
- return m_map.reference(i0, i1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) 
- return m_map.reference(i0, i1, i2); - } - - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.reference(i0, i1, i2, i3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) 
- return m_map.reference(i0, i1, i2, i3, i4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) 
- return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map - .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) 
- return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); - } - -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - - //---------------------------------------- - // Standard destructor, constructors, and assignment operators - - KOKKOS_DEFAULTED_FUNCTION - ~View() = default; - - KOKKOS_DEFAULTED_FUNCTION - View() = default; - - KOKKOS_FUNCTION - View(const View& other) : m_track(other.m_track), m_map(other.m_map) { - KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View(View&& other) - : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { - KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View& operator=(const View& other) { - m_map = other.m_map; - m_track = other.m_track; - - KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) - - return *this; - } - - KOKKOS_FUNCTION - View& operator=(View&& other) { - m_map = std::move(other.m_map); - m_track = std::move(other.m_track); - - KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) - - return *this; - } - - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. 
- - template - KOKKOS_INLINE_FUNCTION View( - const View& rhs, - std::enable_if_t::traits, - typename traits::specialize>::is_assignable_data_type>* = nullptr) - : m_track(rhs), m_map() { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - } - - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - Kokkos::Impl::ViewMapping< - traits, typename View::traits, - typename traits::specialize>::is_assignable_data_type, - View>& - operator=(const View& rhs) { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - m_track.assign(rhs); - return *this; - } - - //---------------------------------------- - // Compatible subview constructor - // may assign unmanaged from managed. - - template - KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, - Args... 
args) - : m_track(src_view), m_map() { - using SrcType = View; - - using Mapping = Kokkos::Impl::ViewMapping; - - using DstType = typename Mapping::type; - - static_assert( - Kokkos::Impl::ViewMapping::is_assignable, - "Subview construction requires compatible view and subview arguments"); - - Mapping::assign(m_map, src_view.m_map, arg0, args...); - } - - //---------------------------------------- - // Allocation tracking properties - - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.m_tracker.use_count(); } - - inline const std::string label() const { - return m_track.m_tracker - .template get_label(); - } - - public: - //---------------------------------------- - // Allocation according to allocation properties and array layout - - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), m_map() { - // Copy the input allocation properties with possibly defaulted properties - // We need to split it in two to avoid MSVC compiler errors - auto prop_copy_tmp = - Impl::with_properties_if_unset(arg_prop, std::string{}); - auto prop_copy = Impl::with_properties_if_unset( - prop_copy_tmp, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing View and initializing data with uninitialized " - "execution space"); - } - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, alloc_name.c_str()); - } -#endif - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.m_tracker.assign_allocated_record_to_uninitialized(record); - } - - KOKKOS_INLINE_FUNCTION - void assign_data(pointer_type arg_data) { - m_track.m_tracker.clear(); - m_map.assign_data(arg_data); - } - - // Wrap memory according to properties and array layout - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing View to wrap user memory must supply matching pointer " - "type"); - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - 
size_t i7 = arg_layout.dimension[7]; - - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, "UNMANAGED"); - } -#endif - } - - // Simple dimension-only layout - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - } - - // Allocate with label and layout - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, - typename traits::array_layout> const& arg_layout) - : View(Impl::ViewCtorProp(arg_label), arg_layout) {} - - // Allocate label and layout, must disambiguate from subview constructor. - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, const size_t> - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_label), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - // Construct view from ViewTracker and map - // This should be the preferred method because future extensions may need to - // use the ViewTracker class. 
- template - KOKKOS_INLINE_FUNCTION View( - const view_tracker_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track.m_tracker); - } - - // Construct View from internal shared allocation tracker object and map - // This is here for backwards compatibility for classes that derive from - // Kokkos::View - template - KOKKOS_INLINE_FUNCTION View( - const typename view_tracker_type::track_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track); - } - - //---------------------------------------- - // Memory span required to wrap these dimensions. - static constexpr size_t required_allocation_size( - typename traits::array_layout const& layout) { - return map_type::memory_span(layout); - } - - static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_ptr), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} - - //---------------------------------------- - // Shared scratch memory constructor - - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - const size_t num_passed_args = Impl::count_valid_integers( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); - - if (std::is_void::value && - num_passed_args != rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - - return View::shmem_size(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - private: - // Want to be able to align to minimum scratch alignment or sizeof or alignof - // elements - static constexpr size_t scratch_value_alignment = - max({sizeof(typename traits::value_type), - alignof(typename traits::value_type), - static_cast( - traits::execution_space::scratch_memory_space::ALIGN)}); - - public: - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(typename traits::array_layout const& arg_layout) { - return map_type::memory_span(arg_layout) + scratch_value_alignment; - } - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(reinterpret_cast( - arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), - scratch_value_alignment))), - arg_layout) {} - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp( - reinterpret_cast(arg_space.get_shmem_aligned( - map_type::memory_span(typename traits::array_layout( - arg_N0, 
arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, - arg_N7)), - scratch_value_alignment))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - //---------------------------------------- - // MDSpan converting constructors -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN - template ::mdspan_type> - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(traits::is_managed) -#endif - View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, - std::enable_if_t< - !std::is_same_v>* = - nullptr) - : View(mds.data_handle(), - Impl::array_layout_from_mapping< - typename traits::array_layout, - typename Impl::MDSpanViewTraits::mdspan_type>( - mds.mapping())) { - } - - template - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(!std::is_convertible_v< - Kokkos::mdspan, - typename Impl::MDSpanViewTraits::mdspan_type>) -#endif - View(const Kokkos::mdspan& mds) - : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { - } - - //---------------------------------------- - // Conversion to MDSpan - template ::mdspan_type, - typename = std::enable_if_t, - std::false_type, - std::is_assignable, - ImplNaturalMDSpanType>>::value>> - KOKKOS_INLINE_FUNCTION constexpr operator mdspan< - OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - return mdspan_type{data(), - Impl::mapping_from_view_mapping(m_map)}; - } - - template >, - typename = std::enable_if_t>> - KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( - const OtherAccessorType& other_accessor = - typename Impl::MDSpanViewTraits::accessor_type()) { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - using ret_mdspan_type = - mdspan; - return ret_mdspan_type{data(), - 
Impl::mapping_from_view_mapping(m_map), - other_accessor}; - } -#endif // KOKKOS_ENABLE_IMPL_MDSPAN -}; - -template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { - return View::rank(); -} - -namespace Impl { - -template -struct RankDataType { - using type = typename RankDataType::type*; -}; - -template -struct RankDataType { - using type = ValueType; -}; - -template -KOKKOS_FUNCTION std::enable_if_t< - N == View::rank() && - std::is_same::specialize, void>::value, - View> -as_view_of_rank_n(View v) { - return v; -} - -// Placeholder implementation to compile generic code for DynRankView; should -// never be called -template -KOKKOS_FUNCTION std::enable_if_t< - N != View::rank() && - std::is_same::specialize, void>::value, - View::value_type, N>::type, - Args...>> -as_view_of_rank_n(View) { - Kokkos::abort("Trying to get at a View of the wrong rank"); - return {}; -} - -template -void apply_to_view_of_static_rank(Function&& f, View a) { - f(a); -} - -} // namespace Impl -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Impl { -template -struct TypeListToViewTraits; - -template -struct TypeListToViewTraits> { - using type = ViewTraits; -}; - -// It is not safe to assume that subviews of views with the Aligned memory trait -// are also aligned. Hence, just remove that attribute for subviews. 
-template -struct RemoveAlignedMemoryTrait { - private: - using type_list_in = Kokkos::Impl::type_list; - using memory_traits = typename ViewTraits::memory_traits; - using type_list_in_wo_memory_traits = - typename Kokkos::Impl::type_list_remove_first::type; - using new_memory_traits = - Kokkos::MemoryTraits; - using new_type_list = typename Kokkos::Impl::concat_type_list< - type_list_in_wo_memory_traits, - Kokkos::Impl::type_list>::type; - - public: - using type = typename TypeListToViewTraits::type; -}; -} // namespace Impl - -template -KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, - Args... 
args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - static_assert(Kokkos::is_memory_traits::value); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} -#endif - -template -using Subview = decltype(subview(std::declval(), std::declval()...)); - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, - const View& rhs) { - // Same data, layout, dimensions - using lhs_traits = ViewTraits; - using rhs_traits = ViewTraits; - - return std::is_same::value && - std::is_same::value && - std::is_same::value && - View::rank() == View::rank() && - lhs.data() == rhs.data() && lhs.span() == rhs.span() && - lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && - lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && - lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && - lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); -} - -template -KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, - const View& rhs) { - return !(operator==(lhs, rhs)); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct CommonViewValueType; - -template -struct CommonViewValueType { - using value_type = std::common_type_t; -}; - -template -struct CommonViewAllocProp; - -template -struct CommonViewAllocProp { - using value_type = ValueType; - using scalar_array_type = ValueType; - - template - KOKKOS_INLINE_FUNCTION 
CommonViewAllocProp(const Views&...) {} -}; - -template -struct DeduceCommonViewAllocProp; - -// Base case must provide types for: -// 1. specialize 2. value_type 3. is_view 4. prop_type -template -struct DeduceCommonViewAllocProp { - using specialize = typename FirstView::traits::specialize; - - using value_type = typename FirstView::traits::value_type; - - enum : bool { is_view = is_view::value }; - - using prop_type = CommonViewAllocProp; -}; - -template -struct DeduceCommonViewAllocProp { - using NextTraits = DeduceCommonViewAllocProp; - - using first_specialize = typename FirstView::traits::specialize; - using first_value_type = typename FirstView::traits::value_type; - - enum : bool { first_is_view = is_view::value }; - - using next_specialize = typename NextTraits::specialize; - using next_value_type = typename NextTraits::value_type; - - enum : bool { next_is_view = NextTraits::is_view }; - - // common types - - // determine specialize type - // if first and next specialize differ, but are not the same specialize, error - // out - static_assert(!(!std::is_same::value && - !std::is_void::value && - !std::is_void::value), - "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " - "specialize trait allowed"); - - // otherwise choose non-void specialize if either/both are non-void - using specialize = std::conditional_t< - std::is_same::value, first_specialize, - std::conditional_t<(std::is_void::value && - !std::is_void::value), - next_specialize, first_specialize>>; - - using value_type = typename CommonViewValueType::value_type; - - enum : bool { is_view = (first_is_view && next_is_view) }; - - using prop_type = CommonViewAllocProp; -}; - -} // end namespace Impl - -template -using DeducedCommonPropsType = - typename Impl::DeduceCommonViewAllocProp::prop_type; - -// This function is required in certain scenarios where users customize -// Kokkos View internals. One example are dynamic length embedded ensemble -// types. 
The function is used to propagate necessary information -// (like the ensemble size) when creating new views. -// However, most of the time it is called with a single view. -// Furthermore, the propagated information is not just for view allocations. -// From what I can tell, the type of functionality provided by -// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, -// a mechanism we will eventually use to replace this clunky approach here, when -// we are finally mdspan based. -// TODO: get rid of this when we have mdspan -template -KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( - Views const&... views) { - return DeducedCommonPropsType(views...); -} - -} // namespace Kokkos - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +#include -#endif /* #ifndef KOKKOS_VIEW_HPP */ +#endif /* KOKKOS_VIEW_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index efa56a086e3..4d226342815 100644 --- a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -120,7 +120,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { (std::int32_t)BEGIN_TOKEN))) { // Attempt to claim ready work index succeeded, // update the hint and return work index - atomic_increment(begin_hint); + atomic_inc(begin_hint); return w; } // arrive here when ready_queue[i] == BEGIN_TOKEN @@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { void operator()(const TagCount, int i) const noexcept { std::int32_t* const count_queue = &m_queue[m_graph.numRows()]; - atomic_increment(count_queue + m_graph.entries[i]); + atomic_inc(count_queue + m_graph.entries[i]); } KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp 
b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp index 99daf379b6f..37fcfb7a1d9 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -23,7 +23,19 @@ #include #include +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) +#include +#elif defined(KOKKOS_ARCH_AMD_GPU) +// FIXME_OPENACC - hip_runtime_api.h contains two implementations: one for AMD +// GPUs and the other for NVIDIA GPUs; below macro is needed to choose AMD GPUs. +#define __HIP_PLATFORM_AMD__ +#include +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) +#include +#endif + #include +#include Kokkos::Experimental::OpenACC::OpenACC() : m_space_instance( @@ -46,6 +58,8 @@ Kokkos::Experimental::OpenACC::OpenACC(int async_arg) void Kokkos::Experimental::OpenACC::impl_initialize( InitializationSettings const& settings) { + Impl::OpenACCInternal::m_concurrency = + 256000; // FIXME_OPENACC - random guess when cannot compute if (Impl::OpenACC_Traits::may_fallback_to_host && acc_get_num_devices(Impl::OpenACC_Traits::dev_type) == 0 && !settings.has_device_id()) { @@ -59,11 +73,46 @@ void Kokkos::Experimental::OpenACC::impl_initialize( acc_get_device_num(acc_device_host); } else { using Kokkos::Impl::get_visible_devices; + acc_set_device_type(Impl::OpenACC_Traits::dev_type); std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + cudaDeviceProp deviceProp; + cudaError error = cudaGetDeviceProperties(&deviceProp, dev_num); + if (error != cudaSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "CUDA device properties: (" << cudaGetErrorName(error) + << "): " << cudaGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + 
Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ARCH_AMD_GPU) + hipDeviceProp_t deviceProp; + hipError_t error = hipGetDeviceProperties(&deviceProp, dev_num); + if (error != hipSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "HIP device properties: (" << hipGetErrorName(error) + << "): " << hipGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + Impl::OpenACCInternal::m_concurrency = std::thread::hardware_concurrency(); + if (Impl::OpenACCInternal::m_concurrency == 0) { + Kokkos::Impl::host_abort( + "Error: During OpenACC backend initialization, failed to retrieve " + "CPU hardware concurrency"); + } +#else + // FIXME_OPENACC: Compute Impl::OpenACCInternal::m_concurrency correctly. +#endif } Impl::OpenACCInternal::singleton().initialize(); } @@ -86,6 +135,12 @@ void Kokkos::Experimental::OpenACC::print_configuration(std::ostream& os, os << "yes\n"; #else os << "no\n"; +#endif + os << " KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE: "; +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + os << "yes\n"; +#else + os << "no\n"; #endif m_space_instance->print_configuration(os, verbose); } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index 5155bee33dc..aee696bd34e 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -42,6 +42,7 @@ static_assert(false, // LLVM/Clacc compiler does not need this. 
#ifndef KOKKOS_COMPILER_CLANG #define KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS +#define KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS #endif namespace Kokkos::Experimental::Impl { @@ -87,9 +88,9 @@ class OpenACC { static char const* name() { return "OpenACC"; } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency() { return 256000; } // FIXME_OPENACC + static int concurrency(); #else - int concurrency() const { return 256000; } // FIXME_OPENACC + int concurrency() const; #endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 KOKKOS_DEPRECATED static bool in_parallel() { diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp index 4e7170cbbdf..75cef98a8d9 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp @@ -85,16 +85,26 @@ class OpenACCSpace { template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp index 82d38586eb8..1373f8fa7a4 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp @@ -38,7 +38,7 @@ class FunctorAdapter; \ KOKKOS_IMPL_ACC_PRAGMA(routine CLAUSE) \ template \ - KOKKOS_FUNCTION void operator()(Args &&... 
args) const { \ + KOKKOS_FUNCTION void operator()(Args &&...args) const { \ if constexpr (std::is_void_v) { \ m_functor(static_cast(args)...); \ } else { \ diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp index 10a76fbd313..1dad499c1be 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp @@ -27,6 +27,7 @@ // Arbitrary value to denote that we don't know yet what device to use. int Kokkos::Experimental::Impl::OpenACCInternal::m_acc_device_num = -1; +int Kokkos::Experimental::Impl::OpenACCInternal::m_concurrency = -1; Kokkos::Experimental::Impl::OpenACCInternal& Kokkos::Experimental::Impl::OpenACCInternal::singleton() { @@ -78,8 +79,18 @@ void Kokkos::Experimental::Impl::OpenACCInternal::fence( [&]() { acc_wait(m_async_arg); }); } -uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() const - noexcept { +uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() + const noexcept { return Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +int Kokkos::Experimental::OpenACC::concurrency() { + return Impl::OpenACCInternal::m_concurrency; +} +#else +int Kokkos::Experimental::OpenACC::concurrency() const { + return Impl::OpenACCInternal::m_concurrency; +} +#endif diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index c3d72368727..343d9921a95 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -30,11 +30,12 @@ namespace Kokkos::Experimental::Impl { class OpenACCInternal { bool m_is_initialized = false; - OpenACCInternal(const OpenACCInternal&) = default; + OpenACCInternal(const OpenACCInternal&) = default; OpenACCInternal& operator=(const OpenACCInternal&) = default; public: 
static int m_acc_device_num; + static int m_concurrency; int m_async_arg = acc_async_noval; OpenACCInternal() = default; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index 550436fe7be..629d26928ed 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -30,10 +30,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i1 = m / dim0 + begin1; + auto i0 = m % dim0 + begin0; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -42,6 +55,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, functor(i0, i1); } } +#endif } template @@ -50,10 +64,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// 
clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i0 = m / dim1 + begin0; + auto i1 = m % dim1 + begin1; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -62,6 +89,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, functor(i0, i1); } } +#endif } template @@ -71,12 +99,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1) copyin(functor) async(async_arg) // clang-format on @@ -94,12 +122,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; // clang-format off #pragma acc parallel loop gang vector tile(tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -116,12 +144,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + 
auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim1 * dim0; + auto i2 = m / tmp1 + begin2; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -132,6 +177,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -140,12 +186,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim2 + begin1; + auto i2 = tmp2 % dim2 + begin2; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) 
async(async_arg) // clang-format on @@ -156,6 +219,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -165,15 +229,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2) copyin(functor) async(async_arg) // clang-format on @@ -193,15 +257,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; // clang-format off #pragma acc parallel loop gang vector tile(tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -220,14 +284,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto 
begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1 * dim0; + auto i3 = m / tmp1 + begin3; + auto tmp2 = m % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -240,6 +325,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -248,14 +334,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; 
m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + auto i2 = tmp2 / dim3 + begin2; + auto i3 = tmp2 % dim3 + begin3; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -268,6 +375,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -277,18 +385,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3) copyin(functor) async(async_arg) // clang-format on @@ -310,18 +418,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto 
end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; // clang-format off #pragma acc parallel loop gang vector tile(tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -342,16 +450,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = m / tmp1 + begin4; + auto tmp2 = m % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -366,6 +499,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -374,16 +508,41 @@ 
void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i3 = tmp2 / dim4 + begin3; + auto i4 = tmp2 % dim4 + begin4; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -398,6 +557,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -407,21 +567,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int begin4 = begin[4]; - int end4 = end[4]; - 
int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4) copyin(functor) async(async_arg) // clang-format on @@ -445,21 +605,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; // clang-format off #pragma acc parallel loop gang vector tile(tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -482,18 +642,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int 
begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; + auto i5 = m / tmp1 + begin5; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = tmp2 / tmp1 + begin4; + tmp2 = tmp2 % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -510,6 +699,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -518,18 +708,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - 
int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim5 * dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + auto i4 = tmp2 / dim5 + begin4; + auto i5 = tmp2 % dim5 + begin5; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -546,6 +765,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -555,24 +775,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int tile5 = tile[5]; - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - 
int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto tile5 = tile[5]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4,tile5) copyin(functor) async(async_arg) // clang-format on @@ -598,24 +818,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile5 = tile[5]; - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto tile5 = tile[5]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; // clang-format off #pragma acc parallel loop gang vector tile(tile5,tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp 
b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 5afb5e75d39..2b5631d6f8a 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -113,6 +113,404 @@ class Kokkos::Impl::ParallelReduce \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i1 = m / dim0 + begin1; \ + auto i0 = m % dim0 + begin0; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i0 = m / dim1 + begin0; \ + auto i1 = m % dim1 + begin1; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void 
OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim1 * dim0; \ + auto i2 = m / tmp1 + begin2; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim2 + begin1; \ + auto i2 = tmp2 % dim2 + begin2; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + 
aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1 * dim0; \ + auto i3 = m / tmp1 + begin3; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) 
async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + auto i2 = tmp2 / dim3 + begin2; \ + auto i3 = tmp2 % dim3 + begin3; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = m / tmp1 + begin4; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + 
OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i3 = tmp2 / dim4 + begin3; \ + auto i4 = tmp2 % dim4 + begin4; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - 
begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; \ + auto i5 = m / tmp1 + begin5; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = tmp2 / tmp1 + begin4; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = 
dim5 * dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + auto i4 = tmp2 / dim5 + begin4; \ + auto i5 = tmp2 % dim5 + begin5; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + } // namespace Kokkos::Experimental::Impl + +#else + #define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, \ OPERATOR) \ namespace Kokkos::Experimental::Impl { \ @@ -124,10 +522,10 @@ class Kokkos::Impl::ParallelReduce \ diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 430bdcb6808..d4cb73164d2 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -163,13 +163,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + wrapped_reducer.final(&tmp); result = tmp; } } @@ -180,15 +191,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const 
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -200,7 +221,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -208,6 +239,7 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -218,9 +250,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& 
loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -228,6 +268,8 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -239,7 +281,17 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -247,6 +299,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -273,10 +326,23 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { + using 
functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + ValueType tmp = ValueType(); #pragma acc loop worker reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); result = tmp; } @@ -314,11 +380,22 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } @@ -357,11 +434,23 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : 
tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); result = tmp; } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp index c6d3267bdb0..b1c48baa1e7 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -225,7 +225,7 @@ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ } #pragma acc exit data delete (functor, chunk_values, offset_values, \ - final_reducer)async(async_arg) + final_reducer)async(async_arg) acc_wait(async_arg); } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index faa50aa7c38..95526aa7849 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -28,8 +28,11 @@ struct OpenACC_Traits { #elif defined(KOKKOS_ARCH_AMD_GPU) static constexpr acc_device_t dev_type = acc_device_radeon; static constexpr bool may_fallback_to_host = false; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + static constexpr acc_device_t dev_type = acc_device_host; + static constexpr bool may_fallback_to_host = true; #else - static constexpr acc_device_t dev_type = acc_device_not_host; + static constexpr acc_device_t dev_type = acc_device_default; static constexpr bool may_fallback_to_host = true; #endif }; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index a403909f677..aa4be87ceb6 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -30,7 +30,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -93,11 +92,16 @@ class OpenMP { void fence(std::string const& name = 
"Kokkos::OpenMP::fence: Unnamed Instance Fence") const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Does the given instance return immediately after launching /// a parallel algorithm /// /// This always returns false on OpenMP - inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED inline static bool is_asynchronous( + OpenMP const& = OpenMP()) noexcept { + return false; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); @@ -154,10 +158,6 @@ inline int OpenMP::impl_thread_pool_rank() noexcept { KOKKOS_IF_ON_DEVICE((return -1;)) } -inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { - return false; -} - inline int OpenMP::impl_thread_pool_size(int depth) const { return depth < 2 ? impl_thread_pool_size() : 1; } @@ -202,7 +202,9 @@ struct MemorySpaceAccess #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 2877d940faf..6edcbff0c26 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -26,12 +26,19 @@ #include #include +#include + #include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -360,6 +367,10 @@ extern template class TaskQueue #include #include -#include #include #include #include @@ -148,7 +147,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { #include #include #include -#include 
/*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index ed625cfcc82..ec33d25b969 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include +#include #ifdef KOKKOS_ENABLE_OPENMPTARGET @@ -91,9 +92,9 @@ class OpenMPTargetSpace { /**\brief Default memory space instance */ OpenMPTargetSpace(); - OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; - OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; - OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; + OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; + OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; + OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; OpenMPTargetSpace& operator=(const OpenMPTargetSpace&) = default; ~OpenMPTargetSpace() = default; @@ -141,79 +142,5 @@ class OpenMPTargetSpace { KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( Kokkos::Experimental::OpenMPTargetSpace); -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// TODO: implement all possible deep_copies -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - // In the Release and RelWithDebInfo builds, the size of the memcpy should - // be greater than zero to avoid error. omp_target_memcpy returns zero on - // success. 
- if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence " - "before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } -}; - -} // namespace Impl -} // namespace Kokkos - #endif #endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp new file mode 100644 index 00000000000..aace09e266b --- /dev/null +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp @@ -0,0 +1,101 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_OPENMPTARGET_DEEP_COPY_HPP +#define KOKKOS_OPENMPTARGET_DEEP_COPY_HPP + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// TODO: implement all possible deep_copies +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + // In the Release and RelWithDebInfo builds, the size of the memcpy should + // be greater than zero to avoid error. omp_target_memcpy returns zero on + // success. 
+ if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence " + "before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_OPENMPTARGET_DEEP_COPY_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp deleted file mode 100644 index 6c5eb048e34..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_OPENMPTARGET - -// FIXME_OPENMPTARGET currently unused -/* -namespace Kokkos { -namespace Impl { -namespace { - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() { return omp_in_parallel(); } - -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos -*/ - -namespace Kokkos { -namespace Impl { - -void OpenMPTargetExec::verify_is_process(const char* const label) { - // Fails if the current task is in a parallel region or is not on the host. 
- if (omp_in_parallel() && (!omp_is_initial_device())) { - std::string msg(label); - msg.append(" ERROR: in parallel or on device"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void OpenMPTargetExec::verify_initialized(const char* const label) { - if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { - std::string msg(label); - msg.append(" ERROR: not initialized"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void* OpenMPTargetExec::m_scratch_ptr = nullptr; -int64_t OpenMPTargetExec::m_scratch_size = 0; -uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; -int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; -std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; - -void OpenMPTargetExec::clear_scratch() { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_ptr = nullptr; - m_scratch_size = 0; -} - -void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; } - -void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, - int64_t shmem_size_L1, - int64_t league_size) { - Kokkos::Experimental::OpenMPTargetSpace space; - // Level-0 scratch when using clang/17 and higher comes from their OpenMP - // extension, `ompx_dyn_cgroup_mem`. -#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) - shmem_size_L0 = 0; -#endif - const int64_t shmem_size = - shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. - const int64_t padding = shmem_size * 10 / 100; // Padding per team. - - // Maximum active teams possible. - // The number should not exceed the maximum in-flight teams possible or the - // league_size. - int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); - - // max_active_teams is the number of active teams on the given hardware. - // We set the number of teams to be twice the number of max_active_teams for - // the compiler to pick the right number in its case. 
- // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. -#if !defined(KOKKOS_COMPILER_CRAY_LLVM) - omp_set_num_teams(max_active_teams * 2); -#endif - - // Total amount of scratch memory allocated is depenedent - // on the maximum number of in-flight teams possible. - int64_t total_size = - (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - max_active_teams * 2; - - if (total_size > m_scratch_size) { - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_size = total_size; - m_scratch_ptr = space.allocate(total_size); - } -} - -} // namespace Impl -} // namespace Kokkos - -#endif // KOKKOS_ENABLE_OPENMPTARGET diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp new file mode 100644 index 00000000000..13b509c0ada --- /dev/null +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp @@ -0,0 +1,48 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP +#define KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP + +#include +#include + +namespace Kokkos::Experimental::Impl { + +template +class FunctorAdapter { + Functor m_functor; + using WorkTag = typename Policy::work_tag; + + public: + FunctorAdapter() = default; + FunctorAdapter(Functor const &functor) : m_functor(functor) {} + + Functor get_functor() const { return m_functor; } + + template + KOKKOS_FUNCTION void operator()(Args &&...args) const { + if constexpr (std::is_void_v) { + m_functor(static_cast(args)...); + } else { + m_functor(WorkTag(), static_cast(args)...); + } + } +}; + +} // namespace Kokkos::Experimental::Impl + +#endif // KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 44e9119ea88..53e723882f5 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -27,11 +27,11 @@ // constructor. undef'ed at the end #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND +#include #include #include #include #include -#include #include @@ -105,18 +105,15 @@ void OpenMPTargetInternal::print_configuration(std::ostream& os, void OpenMPTargetInternal::impl_finalize() { m_is_initialized = false; - Kokkos::Impl::OpenMPTargetExec space; - if (space.m_uniquetoken_ptr != nullptr) + if (m_uniquetoken_ptr != nullptr) Kokkos::kokkos_free( - space.m_uniquetoken_ptr); + m_uniquetoken_ptr); } void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; - Kokkos::Impl::OpenMPTargetExec::MAX_ACTIVE_THREADS = concurrency(); - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. // FIXME_OPENMPTARGTE: Cray compiler did not yet implement omp_set_num_teams. 
@@ -136,7 +133,75 @@ OpenMPTargetInternal* OpenMPTargetInternal::impl_singleton() { return &self; } -} // Namespace Impl +void OpenMPTargetInternal::verify_is_process(const char* const label) { + // Fails if the current task is in a parallel region or is not on the host. + if (omp_in_parallel() && (!omp_is_initial_device())) { + std::string msg(label); + msg.append(" ERROR: in parallel or on device"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::verify_initialized(const char* const label) { + if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { + std::string msg(label); + msg.append(" ERROR: not initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::clear_scratch() { + Kokkos::Experimental::OpenMPTargetSpace space; + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_ptr = nullptr; + m_scratch_size = 0; +} + +void* OpenMPTargetInternal::get_scratch_ptr() { return m_scratch_ptr; } + +void OpenMPTargetInternal::resize_scratch(int64_t team_size, + int64_t shmem_size_L0, + int64_t shmem_size_L1, + int64_t league_size) { + Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif + const int64_t shmem_size = + shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. + const int64_t padding = shmem_size * 10 / 100; // Padding per team. + + // Maximum active teams possible. + // The number should not exceed the maximum in-flight teams possible or the + // league_size. + int max_active_teams = + std::min(OpenMPTargetInternal::concurrency() / team_size, league_size); + + // max_active_teams is the number of active teams on the given hardware. 
+ // We set the number of teams to be twice the number of max_active_teams for + // the compiler to pick the right number in its case. + // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. +#if !defined(KOKKOS_COMPILER_CRAY_LLVM) + omp_set_num_teams(max_active_teams * 2); +#endif + + // Total amount of scratch memory allocated is depenedent + // on the maximum number of in-flight teams possible. + int64_t total_size = + (shmem_size + + ::Kokkos::Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * + max_active_teams * 2; + + if (total_size > m_scratch_size) { + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_size = total_size; + m_scratch_ptr = space.allocate(total_size); + } +} + +} // namespace Impl OpenMPTarget::OpenMPTarget() : m_space_instance(Impl::OpenMPTargetInternal::impl_singleton()) {} @@ -206,9 +271,9 @@ namespace Experimental { UniqueToken:: - UniqueToken(Kokkos::Experimental::OpenMPTarget const&) { + UniqueToken(Kokkos::Experimental::OpenMPTarget const& space) { #ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND - uint32_t* ptr = Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr; + uint32_t* ptr = space.impl_internal_space_instance()->m_uniquetoken_ptr; int count = Kokkos::Experimental::OpenMPTarget().concurrency(); if (ptr == nullptr) { int size = count * sizeof(uint32_t); @@ -221,7 +286,7 @@ UniqueTokenm_uniquetoken_ptr = ptr; } #else // FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr` @@ -229,8 +294,7 @@ UniqueToken - namespace Kokkos { namespace Experimental { namespace Impl { @@ -27,9 +25,9 @@ enum class openmp_fence_is_static { yes, no }; class OpenMPTargetInternal { private: - OpenMPTargetInternal() = default; - OpenMPTargetInternal(const OpenMPTargetInternal&) = default; - OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default; + OpenMPTargetInternal() = default; + OpenMPTargetInternal(const OpenMPTargetInternal&) = delete; + OpenMPTargetInternal& 
operator=(const OpenMPTargetInternal&) = delete; public: void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no); @@ -55,6 +53,19 @@ class OpenMPTargetInternal { static OpenMPTargetInternal* impl_singleton(); + static void verify_is_process(const char* const); + static void verify_initialized(const char* const); + + void* get_scratch_ptr(); + void clear_scratch(); + void resize_scratch(int64_t team_reduce_bytes, int64_t team_shared_bytes, + int64_t thread_local_bytes, int64_t league_size); + + void* m_scratch_ptr = nullptr; + std::mutex m_mutex_scratch_ptr; + int64_t m_scratch_size = 0; + uint32_t* m_uniquetoken_ptr = nullptr; + private: bool m_is_initialized = false; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index e222d652501..f71f8887135 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -116,8 +116,8 @@ class OpenMPTargetExecTeamMember { // FIXME_OPENMPTARGET this function currently ignores the reducer passed. 
template KOKKOS_INLINE_FUNCTION std::enable_if_t::value> - team_reduce(ReducerType const&, typename ReducerType::value_type& value) const - noexcept { + team_reduce(ReducerType const&, + typename ReducerType::value_type& value) const noexcept { #pragma omp barrier using value_type = typename ReducerType::value_type; @@ -741,43 +741,6 @@ struct TeamVectorRangeBoundariesStruct { } // namespace Impl -} // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief Data for OpenMPTarget thread execution */ - -class OpenMPTargetExec { - public: - // FIXME_OPENMPTARGET - Currently the maximum number of - // teams possible is calculated based on NVIDIA's Volta GPU. In - // future this value should be based on the chosen architecture for the - // OpenMPTarget backend. 
- static int MAX_ACTIVE_THREADS; - - private: - static void* scratch_ptr; - - public: - static void verify_is_process(const char* const); - static void verify_initialized(const char* const); - - static void* get_scratch_ptr(); - static void clear_scratch(); - static void resize_scratch(int64_t team_reduce_bytes, - int64_t team_shared_bytes, - int64_t thread_local_bytes, int64_t league_size); - - static void* m_scratch_ptr; - static std::mutex m_mutex_scratch_ptr; - static int64_t m_scratch_size; - static uint32_t* m_uniquetoken_ptr; -}; - -} // namespace Impl } // namespace Kokkos #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp index bd7d3eef5d7..38ed7c5681a 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp @@ -20,6 +20,8 @@ #include #include #include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -31,38 +33,38 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; + using Policy = Kokkos::MDRangePolicy; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + const FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; public: inline void execute() const { - OpenMPTargetExec::verify_is_process( + 
Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); + Policy policy = m_policy; - typename Policy::point_type unused; static_assert(1 < Policy::rank && Policy::rank < 7); static_assert(Policy::inner_direction == Iterate::Left || Policy::inner_direction == Iterate::Right); execute_tile( - unused, functor, policy, + m_functor, policy, std::integral_constant()); } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -72,18 +74,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i0 = begin_0; i0 < end_0; ++i0) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -96,10 +94,7 @@ class ParallelFor, for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, 
i2); + functor(i0, i1, i2); } } } @@ -107,9 +102,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -125,10 +119,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -137,9 +128,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -158,11 +148,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -172,9 +158,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const 
Index begin_2 = policy.m_lower[2]; @@ -197,12 +182,7 @@ class ParallelFor, for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } @@ -214,9 +194,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -226,18 +205,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i1 = begin_1; i1 < end_1; ++i1) for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -250,10 +225,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + functor(i0, i1, i2); } } } @@ -261,9 +233,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, 
OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -279,10 +250,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -291,9 +259,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -312,11 +279,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -326,9 +289,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -351,12 +313,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { { - if constexpr (std::is_same::value) - functor(i0, i1, 
i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp index a674637a3b1..502461cc5e0 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp @@ -20,6 +20,8 @@ #include #include #include +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" namespace Kokkos { namespace Impl { @@ -28,36 +30,30 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Policy = Kokkos::RangePolicy; + using Member = typename Policy::member_type; - const FunctorType m_functor; + Kokkos::Experimental::Impl::FunctorAdapter m_functor; const Policy m_policy; public: - void execute() const { execute_impl(); } + void execute() const { execute_impl(); } - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto begin = m_policy.begin(); const auto end = m_policy.end(); if (end <= begin) return; - FunctorType a_functor(m_functor); + auto const a_functor(m_functor); #pragma omp target teams distribute parallel for map(to : a_functor) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - a_functor(i); - } else { - a_functor(TagType(), i); - } + a_functor(i); } } diff --git 
a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 26085f11400..77dc71a87b7 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace Kokkos { @@ -76,28 +77,27 @@ class ParallelFor, using Policy = Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Member = typename Policy::member_type; + + Kokkos::Experimental::Impl::FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; const size_t m_shmem_size; public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl(); + execute_impl(); } private: - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto league_size = m_policy.league_size(); const auto team_size = m_policy.team_size(); @@ -105,11 +105,12 @@ class ParallelFor, const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, - league_size); + m_policy.space().impl_internal_space_instance()->resize_scratch( + team_size, shmem_size_L0, shmem_size_L1, 
league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - FunctorType a_functor(m_functor); + void* scratch_ptr = + m_policy.space().impl_internal_space_instance()->get_scratch_ptr(); + auto const a_functor(m_functor); // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the // scratch implementation does not work in the Release or RelWithDebugInfo @@ -122,7 +123,7 @@ class ParallelFor, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(m_policy.space().concurrency() / team_size, league_size); #endif // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the @@ -161,16 +162,13 @@ class ParallelFor, typename Policy::member_type team(league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #else #pragma omp target teams distribute firstprivate(a_functor) \ is_device_ptr(scratch_ptr) num_teams(max_active_teams) \ - thread_limit(team_size) + thread_limit(team_size) for (int i = 0; i < league_size; i++) { #pragma omp parallel { @@ -180,10 +178,7 @@ class ParallelFor, typename Policy::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index e86a1219749..bee604834c7 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -37,9 +37,8 @@ class ParallelReduce; + public: inline void execute() const { // 
Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = FunctorAdapter(m_functor_reducer.get_functor()); execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr, + functor, m_policy, m_result_ptr, std::integral_constant()); } @@ -77,7 +81,7 @@ class ParallelReduce inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -90,32 +94,23 @@ class ParallelReduce::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -126,7 +121,7 @@ reduction(+:result) template inline std::enable_if_t 
execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -141,38 +136,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -184,7 +170,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const 
FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -201,40 +187,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -247,7 +222,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const 
FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -266,26 +241,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -293,18 +260,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - 
result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -318,7 +280,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -339,27 +301,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -368,19 +322,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < 
end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -395,7 +344,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -408,32 +357,23 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - 
functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -444,7 +384,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -459,38 +399,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename 
Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -502,7 +433,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -519,40 +450,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename 
Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -565,7 +485,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -584,26 +504,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -611,18 +523,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for 
(auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -636,7 +543,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -657,27 +564,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -686,19 +585,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) 
+ reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4a112ed11d0..b7c8abcb449 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -33,8 +34,6 @@ class ParallelReduce, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; @@ -55,14 +54,17 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - using TagType = typename Policy::work_tag; public: void execute() const { // Only let one ParallelReduce instance at a time use the scratch memory. 
std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); + if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -75,26 +77,26 @@ class ParallelReduce, // Enter this loop if the reduction is on an array and the routine is // templated over the size of the array. if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<2>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<4>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<8>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<16>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<32>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else { Kokkos::abort("array reduction length must be <= 32"); } } else { // This loop handles the basic scalar reduction. 
- ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<1>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 16c0eedb818..b81e3aa7ed0 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -59,7 +59,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for reduction(+ : TeamThread_scratch[:1]) +#pragma omp for reduction(+ : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -68,7 +68,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for reduction(custom : TeamThread_scratch[:1]) +#pragma omp for reduction(custom : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -90,11 +90,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< const Lambda& lambda, ReducerType result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custominner:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custominner \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of // elements in the array <= 32. 
For reduction we allocate, 16 bytes per @@ -109,7 +108,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]); #pragma omp barrier -#pragma omp for reduction(custominner : TeamThread_scratch[:1]) +#pragma omp for reduction(custominner : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamThread_scratch[0]); } @@ -132,11 +131,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< ValueType* TeamThread_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -145,8 +143,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for reduction(omp_red_teamthread_reducer \ - : TeamThread_scratch[:1]) schedule(static, 1) +#pragma omp for reduction( \ + omp_red_teamthread_reducer : TeamThread_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -259,11 +258,10 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< const Lambda& lambda, ReducerType const& result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + 
initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType vector_reduce; Impl::OpenMPTargetReducerWrapper::init(vector_reduce); @@ -329,7 +327,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for simd reduction(+ : TeamVector_scratch[:1]) +#pragma omp for simd reduction(+ : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -338,7 +336,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -363,11 +361,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< static_assert(sizeof(ValueType) <= Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); @@ -376,7 +373,7 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]); #pragma omp barrier -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamVector_scratch[0]); } @@ -400,11 +397,10 @@ parallel_reduce(const 
Impl::TeamVectorRangeBoundariesStruct< ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -413,8 +409,9 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for simd reduction(omp_red_teamthread_reducer \ - : TeamVector_scratch[:1]) schedule(static, 1) +#pragma omp for simd reduction( \ + omp_red_teamthread_reducer : TeamVector_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -443,8 +440,7 @@ class ParallelReduce scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, m_result_ptr_on_device); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 29df0163c80..ec8a96cb2f3 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -20,6 +20,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -30,7 +31,6 @@ class ParallelScan, 
protected: using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using idx_type = typename Policy::index_type; @@ -48,18 +48,8 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(idx, val, is_final); - } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(WorkTag(), idx, val, is_final); - } + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; public: void impl_execute( @@ -77,8 +67,10 @@ class ParallelScan, idx_type team_size = 128; auto a_functor_reducer = m_functor_reducer; -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) + auto a_functor = FunctorAdapter(m_functor_reducer.get_functor()); + +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -91,9 +83,8 @@ class ParallelScan, const idx_type idx = local_offset + i; value_type val; reducer.init(&val); - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, val, - false); + if (idx < N) a_functor(idx, val, false); + element_values(team_id, i) = val; } #pragma omp barrier @@ -120,9 +111,8 @@ class ParallelScan, } } -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) \ - thread_limit(team_size) +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) thread_limit(team_size) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -145,12 +135,7 @@ class 
ParallelScan, #if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \ !defined(KOKKOS_ARCH_AMD_GFX1100) && !defined(KOKKOS_ARCH_AMD_GFX1103) if constexpr (Analysis::Reducer::has_join_member_function()) { - if constexpr (std::is_void_v) - a_functor_reducer.get_functor().join(local_offset_value, - offset_value); - else - a_functor_reducer.get_functor().join( - WorkTag{}, local_offset_value, offset_value); + a_functor.get_functor().join(local_offset_value, offset_value); } else local_offset_value += offset_value; #else @@ -158,9 +143,8 @@ class ParallelScan, #endif } else local_offset_value = offset_value; - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, - local_offset_value, true); + if (idx < N) a_functor(idx, local_offset_value, true); + if (idx == N - 1 && m_result_ptr_device_accessible) *m_result_ptr = local_offset_value; } @@ -169,9 +153,9 @@ class ParallelScan, } void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const idx_type N = m_policy.end() - m_policy.begin(); const idx_type chunk_size = 128; @@ -179,7 +163,7 @@ class ParallelScan, // Only let one ParallelReduce instance at a time use the scratch memory. 
std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View, public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); const int chunk_size = 128; @@ -231,7 +215,9 @@ class ParallelScanWithTotal, if (N > 0) { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + base_t::m_policy.space() + .impl_internal_space_instance() + ->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View #include #include +#include namespace Kokkos { namespace Impl { @@ -72,7 +73,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = Kokkos::RangePolicy; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -82,12 +82,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " 
"parallel_reduce:reducer"); const auto begin = p.begin(); @@ -104,33 +107,27 @@ struct ParallelReduceSpecialize, return; } -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), ptr_on_device); } - template - static void execute_array(const FunctorType& f, const PolicyType& p, + template + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); const auto begin = p.begin(); @@ -150,27 +147,14 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. 
if constexpr (std::is_arithmetic::value) { -#pragma omp target teams distribute parallel for \ - map(to:f) reduction(+: result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result) + for (auto i = begin; i < end; ++i) f(i, result); } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) + for (auto i = begin; i < end; ++i) f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), @@ -186,13 +170,10 @@ struct ParallelReduceSpecialize, ptr_on_device); return; } -#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result[ : NumReductions]) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result( @@ -200,12 +181,12 @@ struct ParallelReduceSpecialize, } } - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " 
"parallel_reduce:init_join"); const auto begin = p.begin(); @@ -219,23 +200,25 @@ struct ParallelReduceSpecialize, const auto size = end - begin; - // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently + // FIXME_OPENMPTARGET: The team size and concurrency are currently // based on NVIDIA-V100 and should be modifid to be based on the // architecture in the future. const int max_team_threads = 32; const int max_teams = - OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; + p.space().impl_internal_space_instance()->concurrency() / + max_team_threads; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. Achieved by setting the first // parameter of `resize_scratch=1`. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - std::numeric_limits::max()); - ValueType* scratch_ptr = - static_cast(OpenMPTargetExec::get_scratch_ptr()); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), + std::numeric_limits::max()); + ValueType* scratch_ptr = static_cast( + p.space().impl_internal_space_instance()->get_scratch_ptr()); - typename FunctorAnalysis::Reducer final_reducer(f); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { #pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) @@ -260,8 +243,7 @@ struct ParallelReduceSpecialize, } #pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to \ - : final_reducer) is_device_ptr(scratch_ptr) + map(to : final_reducer) is_device_ptr(scratch_ptr) { #pragma omp parallel { @@ -279,11 +261,7 @@ struct ParallelReduceSpecialize, // Accumulate partial results in thread specific storage. 
#pragma omp for simd for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } // Reduce all paritial results within a team. @@ -304,8 +282,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ +#pragma omp target teams distribute parallel for simd map(to : f) \ is_device_ptr(scratch_ptr) for (int i = 0; i < max_teams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { @@ -344,7 +321,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = TeamPolicyInternal; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -355,12 +331,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); @@ -370,9 +349,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, 
shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); ValueType result = ValueType(); @@ -383,16 +364,15 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. if (max_active_teams <= 0) return; -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) KOKKOS_IMPL_OMPTARGET_PRAGMA( @@ -414,16 +394,13 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #else #pragma omp target teams distribute firstprivate(f) is_device_ptr(scratch_ptr) \ - num_teams(max_active_teams) thread_limit(team_size) reduction(custom \ - : result) + num_teams(max_active_teams) thread_limit(team_size) \ + reduction(custom : result) for (int i = 0; i < league_size; i++) { #pragma omp parallel reduction(custom : result) { @@ -433,10 +410,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #endif @@ -447,12 +421,12 @@ struct ParallelReduceSpecialize, } template - static void 
execute_array(const FunctorType& f, const PolicyType& p, + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); @@ -462,9 +436,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); // Maximum active teams possible. // FIXME_OPENMPTARGET: Cray compiler did not yet implement @@ -473,7 +449,7 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. @@ -504,19 +480,14 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } else { // Case where the reduction is on a non-native data type. 
#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) #pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - map(to \ - : f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + map(to : f) is_device_ptr(scratch_ptr) reduction(custom : result) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -531,10 +502,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } @@ -545,10 +513,10 @@ struct ParallelReduceSpecialize, } else { ValueType result[NumReductions] = {}; // Case where the reduction is on an array. -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) -#pragma omp parallel reduction(+ : result[:NumReductions]) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + map(to : f) is_device_ptr(scratch_ptr) \ + reduction(+ : result[ : NumReductions]) +#pragma omp parallel reduction(+ : result[ : NumReductions]) { if (omp_get_num_teams() > max_active_teams) Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); @@ -562,10 +530,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } @@ -577,12 +542,12 @@ struct ParallelReduceSpecialize, // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over // RangePolicy. Need a new implementation. 
- static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join "); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join"); using FunctorAnalysis = @@ -611,13 +576,14 @@ struct ParallelReduceSpecialize, const auto nteams = league_size; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - typename FunctorAnalysis::Reducer final_reducer(f); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { // If there is no work to be done, copy back the initialized values and @@ -661,11 +627,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) { - f(team, result); - } else { - f(TagType(), team, result); - } + f(team, result); } } // end parallel } // end target @@ -673,7 +635,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { #pragma omp target teams distribute parallel for simd firstprivate( \ - final_reducer) 
is_device_ptr(scratch_ptr) + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 9b578aca112..4308fb042a3 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -34,9 +34,6 @@ struct OpenMPTargetReducerWrapper { KOKKOS_INLINE_FUNCTION static void join(value_type&, const value_type&) = delete; - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type&, const volatile value_type&) = delete; - KOKKOS_INLINE_FUNCTION static void init(value_type&) = delete; }; @@ -51,11 +48,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest += src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest += src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::sum(); @@ -72,11 +64,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest *= src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest *= src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::prod(); @@ -95,11 +82,6 @@ struct OpenMPTargetReducerWrapper> { if (src < dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src < dest) dest = src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::min(); @@ -118,11 +100,6 @@ struct OpenMPTargetReducerWrapper> { if (src > dest) dest = src; } - 
KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src > dest) dest = src; - } - // Required KOKKOS_INLINE_FUNCTION static void init(value_type& val) { @@ -141,11 +118,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest && src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest && src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::land(); @@ -166,11 +138,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest || src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest || src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::lor(); @@ -189,11 +156,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest & src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest & src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::band(); @@ -212,11 +174,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest | src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest | src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::bor(); @@ -236,12 +193,12 @@ struct OpenMPTargetReducerWrapper> { // Required KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -263,12 +220,12 @@ struct OpenMPTargetReducerWrapper> { 
KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val > dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -298,16 +255,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -331,22 +278,16 @@ struct OpenMPTargetReducerWrapper> { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -385,15 +326,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? 
src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::max(); @@ -428,15 +360,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::min(); @@ -480,23 +403,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -531,13 +437,6 @@ struct OpenMPTargetReducerWrapper> { : dest.min_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? src.min_loc_true - : dest.min_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_true = reduction_identity::min(); @@ -569,13 +468,6 @@ struct OpenMPTargetReducerWrapper> { : dest.max_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? 
src.max_loc_true - : dest.max_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = reduction_identity::max(); @@ -611,17 +503,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = ::Kokkos::reduction_identity::max(); @@ -654,13 +535,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_false = ::Kokkos::reduction_identity::min(); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp deleted file mode 100644 index 458c4c9a43e..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp +++ /dev/null @@ -1,251 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ENABLE_TASKPOLICY) - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template class TaskQueue; - -//---------------------------------------------------------------------------- - -TaskExec::TaskExec() - : m_self_exec(0), - m_team_exec(0), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(0), - m_team_rank(0), - m_team_size(1) {} - -TaskExec::TaskExec( - Kokkos::Impl::OpenMPTargetExec &arg_exec, int const arg_team_size) - : m_self_exec(&arg_exec), - m_team_exec(arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size)), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(arg_exec.pool_rank_rev() / arg_team_size), - m_team_rank(arg_exec.pool_rank_rev() % arg_team_size), - m_team_size(arg_team_size) { - // This team spans - // m_self_exec->pool_rev( team_size * group_rank ) - // m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 ) - - int64_t volatile *const sync = (int64_t *)m_self_exec->scratch_reduce(); - - sync[0] = int64_t(0); - sync[1] = int64_t(0); - - for (int i = 0; i < m_team_size; ++i) { - m_sync_value |= int64_t(1) << (8 * i); - m_sync_mask |= int64_t(3) << (8 * i); - } - - Kokkos::memory_fence(); -} - -void TaskExec::team_barrier_impl() const { - if (m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t))) { - Kokkos::abort("TaskQueue scratch_reduce memory too small"); - } - - // Use team shared memory to synchronize. - // Alternate memory locations between barriers to avoid a sequence - // of barriers overtaking one another. 
- - int64_t volatile *const sync = - ((int64_t *)m_team_exec->scratch_reduce()) + (m_sync_step & 0x01); - - // This team member sets one byte within the sync variable - int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank; - - *sync_self = int8_t(m_sync_value & 0x03); // signal arrival - - while (m_sync_value != *sync) - ; // wait for team to arrive - - ++m_sync_step; - - if (0 == (0x01 & m_sync_step)) { // Every other step - m_sync_value ^= m_sync_mask; - if (1000 < m_sync_step) m_sync_step = 0; - } -} - -//---------------------------------------------------------------------------- - -void TaskQueueSpecialization::execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - using Member = TaskExec; - - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - // Required: team_size <= 8 - - const int team_size = PoolExec::pool_size(2); // Threads per core - // const int team_size = PoolExec::pool_size(1); // Threads per NUMA - - if (8 < team_size) { - Kokkos::abort("TaskQueue unsupported team size"); - } - -#pragma omp parallel - { - PoolExec &self = *PoolExec::get_thread_omp(); - - Member single_exec; - Member team_exec(self, team_size); - - // Team shared memory - task_root_type *volatile *const task_shared = - (task_root_type **)team_exec.m_team_exec->scratch_thread(); - -// Barrier across entire OpenMPTarget thread pool to insure initialization -#pragma omp barrier - - // Loop until all queues are empty and no tasks in flight - - do { - task_root_type *task = 0; - - // Each team lead attempts to acquire either a thread team task - // or a single thread task for the team. - - if (0 == team_exec.team_rank()) { - task = 0 < *((volatile int *)&queue->m_ready_count) ? 
end : 0; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - } - - // Team lead broadcast acquired task to team members: - - if (1 < team_exec.team_size()) { - if (0 == team_exec.team_rank()) *task_shared = task; - - // Fence to be sure task_shared is stored before the barrier - Kokkos::memory_fence(); - - // Whole team waits for every team member to reach this statement - team_exec.team_barrier(); - - // Fence to be sure task_shared is stored - Kokkos::memory_fence(); - - task = *task_shared; - } - - if (0 == task) break; // 0 == m_ready_count - - if (end == task) { - // All team members wait for whole team to reach this statement. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. - team_exec.team_barrier(); - } else if (task_root_type::TaskTeam == task->m_task_type) { - // Thread Team Task - (*task->m_apply)(task, &team_exec); - - // The m_apply function performs a barrier - - if (0 == team_exec.team_rank()) { - // team member #0 completes the task, which may delete the task - queue->complete(task); - } - } else { - // Single Thread Task - - if (0 == team_exec.team_rank()) { - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - } - - // All team members wait for whole team to reach this statement. - // Not necessary to complete the task. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. 
- team_exec.team_barrier(); - } - } while (1); - } - // END #pragma omp parallel -} - -void TaskQueueSpecialization:: - iff_single_thread_recursive_execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using Member = TaskExec; - - if (1 == omp_get_num_threads()) { - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - Member single_exec; - - task_root_type *task = end; - - do { - task = end; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - - if (end == task) break; - - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - - } while (1); - } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( \ - KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp deleted file mode 100644 index c9aa7b128f1..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp +++ /dev/null @@ -1,319 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP -#define KOKKOS_IMPL_OPENMP_TASK_HPP - -#if defined(KOKKOS_ENABLE_TASKPOLICY) - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <> -class TaskQueueSpecialization { - public: - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = Kokkos::Impl::TaskQueue; - using task_base_type = Kokkos::Impl::TaskBase; - - // Must specify memory space - using memory_space = Kokkos::HostSpace; - - static void iff_single_thread_recursive_execute(queue_type* const); - - // Must provide task queue execution function - static void execute(queue_type* const); - - // Must provide mechanism to set function pointer in - // execution space from the host process. - template - static void proc_set_apply(task_base_type::function_type* ptr) { - using TaskType = TaskBase; - *ptr = TaskType::apply; - } -}; - -extern template class TaskQueue; - -//---------------------------------------------------------------------------- - -template <> -class TaskExec { - private: - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; - TaskExec& operator=(TaskExec const&) = delete; - - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - - friend class Kokkos::Impl::TaskQueue; - friend class Kokkos::Impl::TaskQueueSpecialization< - Kokkos::Experimental::OpenMPTarget>; - - PoolExec* const m_self_exec; ///< This thread's thread pool data structure - PoolExec* const m_team_exec; ///< Team thread's thread pool data structure - int64_t m_sync_mask; - int64_t mutable m_sync_value; - int mutable m_sync_step; - int m_group_rank; ///< Which "team" subset of thread pool - int m_team_rank; ///< Which thread within a team - int m_team_size; - - TaskExec(); - 
TaskExec(PoolExec& arg_exec, int arg_team_size); - - void team_barrier_impl() const; - - public: - KOKKOS_FUNCTION void* team_shared() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread() : nullptr;)) - - KOKKOS_IF_ON_DEVICE((return nullptr;)) - } - - KOKKOS_FUNCTION int team_shared_size() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread_size() : 0;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /**\brief Whole team enters this function call - * before any teeam member returns from - * this function call. - */ - KOKKOS_FUNCTION void team_barrier() const { - KOKKOS_IF_ON_HOST((if (1 < m_team_size) { team_barrier_impl(); })) - } - - KOKKOS_INLINE_FUNCTION - int team_rank() const { return m_team_rank; } - - KOKKOS_INLINE_FUNCTION - int team_size() const { return m_team_size; } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, - count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& start, const iType& end) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, start, - end); -} - -/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team. 
- */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i); - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - shared[0] += shared[i]; - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - 
loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - join(shared[0], shared[i]); - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) {} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { -} - -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - ValueType accum = 0; - ValueType val, local_total; - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - int team_size = loop_boundaries.thread.team_size(); - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - - // Intra-member scan - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } - - shared[team_rank] = accum; - loop_boundaries.thread.team_barrier(); - - // Member 0 do scan on accumulated totals - if (team_rank == 0) { - for (iType i = 1; i < team_size; i += 1) { - shared[i] += shared[i - 1]; - } - accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan - } - - loop_boundaries.thread.team_barrier(); - - // Inter-member scan adding in accumulated totals - if (team_rank != 0) { - accum = 
shared[team_rank - 1]; - } - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) {} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ -#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 4de6931918e..2583a1cdc04 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -46,7 +46,6 @@ struct Container { } // namespace namespace Kokkos { -namespace Experimental { SYCL::SYCL() : m_space_instance(&Impl::SYCLInternal::singleton(), [](Impl::SYCLInternal*) {}) { @@ -100,6 +99,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif +#ifdef KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : defined\n"; +#else + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : undefined\n"; +#endif #ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; #else @@ -172,8 +176,7 @@ void SYCL::fence(const std::string& name) const { } void SYCL::impl_static_fence(const std::string& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, 
Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, @@ -261,8 +264,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << device.get_info() << "\nImage Max Buffer Size: " << device.get_info() - << "\nImage Max Array Size: " - << device.get_info() << "\nMax Samplers: " << device.get_info() << "\nMax Parameter Size: " << device.get_info() @@ -317,5 +318,4 @@ int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index 0f3d1f0994d..937dcceab48 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -39,7 +39,6 @@ static_assert(false, #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal; } @@ -91,9 +90,8 @@ class SYCL { /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); - void fence( - const std::string& name = - "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence") const; + void fence(const std::string& name = + "Kokkos::SYCL::fence: Unnamed Instance Fence") const; /// \brief Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose = false) const; @@ -131,15 +129,13 @@ class SYCL { Kokkos::Impl::HostSharedPtr m_space_instance; }; -} // namespace Experimental - namespace Tools { namespace Experimental { template <> -struct DeviceTypeTraits { +struct DeviceTypeTraits { /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling static constexpr DeviceType id = DeviceType::SYCL; - static int device_id(const Kokkos::Experimental::SYCL& exec) { + static int device_id(const Kokkos::SYCL& exec) { return exec.impl_internal_space_instance()->m_syclDev; } }; @@ -185,10 +181,11 @@ std::vector partition_space(const SYCL& sycl_space, return instances; } +} // namespace Experimental + namespace Impl { std::vector get_sycl_devices(); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp index afc7eebd388..a9e2eca4fb3 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -28,37 +28,34 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n); -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n); +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n); void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n); template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, 
void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value && is_sycl_type_space::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; @@ -66,10 +63,9 @@ struct DeepCopy struct DeepCopy< MemSpace1, MemSpace2, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + is_sycl_type_space::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -93,9 +89,8 @@ struct DeepCopy< template struct DeepCopy< MemSpace, HostSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -118,9 +113,8 @@ struct DeepCopy< template struct DeepCopy< HostSpace, MemSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp index 9c39df94159..54ca6459953 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp @@ -32,30 +32,29 @@ namespace Impl { template -class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag< - PatternTag, Functor, PolicyType, 
Args..., - Kokkos::Experimental::SYCL>::type { +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag::type { public: using Policy = PolicyType; using graph_kernel = GraphNodeKernelImpl; - using base_t = typename PatternImplSpecializationFromTag< - PatternTag, Functor, Policy, Args..., Kokkos::Experimental::SYCL>::type; + using base_t = + typename PatternImplSpecializationFromTag::type; // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::Experimental::SYCL const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + GraphNodeKernelImpl(std::string, Kokkos::SYCL const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...) {} template - GraphNodeKernelImpl(Kokkos::Experimental::SYCL const& exec_space, - Functor arg_functor, PolicyDeduced&& arg_policy) + GraphNodeKernelImpl(Kokkos::SYCL const& exec_space, Functor arg_functor, + PolicyDeduced&& arg_policy) : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} + (PolicyDeduced&&)arg_policy) {} void set_sycl_graph_ptr( sycl::ext::oneapi::experimental::command_graph< @@ -102,14 +101,14 @@ template ::type> struct get_graph_node_kernel_type - : type_identity> {}; + : type_identity< + GraphNodeKernelImpl> {}; template struct get_graph_node_kernel_type : type_identity, Kokkos::ParallelReduceTag>> {}; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp index 6bbe6711a2e..828f1cacb4a 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp @@ -28,7 +28,7 @@ namespace Kokkos { namespace Impl { template <> -struct GraphNodeBackendSpecificDetails { +struct 
GraphNodeBackendSpecificDetails { std::optional node; explicit GraphNodeBackendSpecificDetails() = default; @@ -38,16 +38,16 @@ struct GraphNodeBackendSpecificDetails { }; template -struct GraphNodeBackendDetailsBeforeTypeErasure { +struct GraphNodeBackendDetailsBeforeTypeErasure { protected: GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, Kernel &, PredecessorRef const &, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, Kernel &, PredecessorRef const &, + GraphNodeBackendSpecificDetails &) noexcept {} GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, _graph_node_is_root_ctor_tag, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails &) noexcept {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp index 1dc4a9c9973..dc63052dd7a 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp @@ -31,29 +31,28 @@ namespace Kokkos { namespace Impl { template <> -class GraphImpl { +class GraphImpl { public: - using node_details_t = - GraphNodeBackendSpecificDetails; - using root_node_impl_t = GraphNodeImpl; + using node_details_t = GraphNodeBackendSpecificDetails; + using root_node_impl_t = + GraphNodeImpl; using aggregate_kernel_impl_t = SYCLGraphNodeAggregateKernel; using aggregate_node_impl_t = - GraphNodeImpl; // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. 
- GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); - explicit GraphImpl(Kokkos::Experimental::SYCL instance); + explicit GraphImpl(Kokkos::SYCL instance); void add_node(std::shared_ptr const& arg_node_ptr); @@ -63,19 +62,25 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::SYCL& exec); - Kokkos::Experimental::SYCL const& get_execution_space() const noexcept; + Kokkos::SYCL const& get_execution_space() const noexcept; auto create_root_node_ptr(); template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { m_graph_exec = m_graph.finalize(); } + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec.has_value()); + m_graph_exec = m_graph.finalize(); + } - Kokkos::Experimental::SYCL m_execution_space; + auto& sycl_graph() { return m_graph; } + auto& sycl_graph_exec() { return m_graph_exec; } + + private: + Kokkos::SYCL m_execution_space; sycl::ext::oneapi::experimental::command_graph< sycl::ext::oneapi::experimental::graph_state::modifiable> m_graph; @@ -84,17 +89,16 @@ class GraphImpl { m_graph_exec; }; -inline GraphImpl::~GraphImpl() { +inline GraphImpl::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); } -inline GraphImpl::GraphImpl( - Kokkos::Experimental::SYCL instance) +inline GraphImpl::GraphImpl(Kokkos::SYCL instance) : m_execution_space(std::move(instance)), m_graph(m_execution_space.sycl_queue().get_context(), m_execution_space.sycl_queue().get_device()) {} -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // add an empty node that 
needs to be set up before finalizing the graph arg_node_ptr->node_details_t::node = m_graph.add(); @@ -103,7 +107,7 @@ inline void GraphImpl::add_node( // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in its policy template -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -122,7 +126,7 @@ inline void GraphImpl::add_node( // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. template -inline void GraphImpl::add_predecessor( +inline void GraphImpl::add_predecessor( NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); @@ -137,19 +141,19 @@ inline void GraphImpl::add_predecessor( m_graph.make_edge(*pred_node, *node); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::SYCL& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - m_execution_space.sycl_queue().ext_oneapi_graph(*m_graph_exec); + exec.sycl_queue().ext_oneapi_graph(*m_graph_exec); } -inline Kokkos::Experimental::SYCL const& -GraphImpl::get_execution_space() const noexcept { +inline Kokkos::SYCL const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } -inline auto GraphImpl::create_root_node_ptr() { +inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared(get_execution_space(), _graph_node_is_root_ctor_tag{}); @@ -158,7 +162,7 @@ inline auto GraphImpl::create_root_node_ptr() { } template -inline auto GraphImpl::create_aggregate_ptr( +inline auto GraphImpl::create_aggregate_ptr( PredecessorRefs&&...) 
{ // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 5843dca8123..5af1330d939 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -24,14 +24,12 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { namespace { // FIXME_SYCL Should be a multiple of the maximum subgroup size. -static constexpr auto sizeScratchGrain = - sizeof(Kokkos::Experimental::SYCL::size_type[32]); +static constexpr auto sizeScratchGrain = sizeof(Kokkos::SYCL::size_type[32]); std::size_t scratch_count(const std::size_t size) { return (size + sizeScratchGrain - 1) / sizeScratchGrain; @@ -55,8 +53,8 @@ Kokkos::View sycl_global_unique_token_locks( SYCLInternal::~SYCLInternal() { if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { - std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " - "Kokkos::Experimental::SYCL::finalize()" + std::cerr << "Kokkos::SYCL ERROR: Failed to call " + "Kokkos::SYCL::finalize()" << std::endl; std::cerr.flush(); } @@ -64,7 +62,7 @@ SYCLInternal::~SYCLInternal() { int SYCLInternal::verify_is_initialized(const char* const label) const { if (!is_initialized()) { - Kokkos::abort((std::string("Kokkos::Experimental::SYCL::") + label + + Kokkos::abort((std::string("Kokkos::SYCL::") + label + " : ERROR device not initialized\n") .c_str()); } @@ -171,12 +169,12 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
- auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && @@ -184,9 +182,9 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -255,7 +253,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) mem_space.deallocate(m_scratchSpace, @@ -265,8 +263,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - m_scratchSpace = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchSpace", 
alloc_size)); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -276,7 +274,7 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchHost) mem_space.deallocate(m_scratchHost, @@ -286,8 +284,8 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchHostCount, sizeScratchGrain); - m_scratchHost = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); + m_scratchHost = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchHost", alloc_size)); } return m_scratchHost; @@ -297,7 +295,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) mem_space.deallocate(m_scratchFlags, @@ -307,8 +305,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - m_scratchFlags = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchFlags", alloc_size)); // We only zero-initialize the allocation when we actually allocate. 
// It's the responsibility of the features using scratch_flags, @@ -326,8 +324,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( template void SYCLInternal::fence_helper(WAT& wat, const std::string& name, uint32_t instance_id) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id}, [&]() { try { @@ -364,8 +361,7 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { AllocationSpace alloc_space(*m_q); if (m_data) alloc_space.deallocate(m_data, m_capacity); - m_data = - alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); + m_data = alloc_space.allocate("Kokkos::SYCL::USMObjectMem", n); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); @@ -396,5 +392,4 @@ template class SYCLInternal::USMObjectMem; template class SYCLInternal::USMObjectMem; } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 2d784ef8a5f..c982154a9a8 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -28,7 +28,6 @@ #include #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal { @@ -38,10 +37,10 @@ class SYCLInternal { SYCLInternal() = default; ~SYCLInternal(); - SYCLInternal(const SYCLInternal&) = delete; + SYCLInternal(const SYCLInternal&) = delete; SYCLInternal& operator=(const SYCLInternal&) = delete; - SYCLInternal& operator=(SYCLInternal&&) = delete; - SYCLInternal(SYCLInternal&&) = delete; + SYCLInternal& operator=(SYCLInternal&&) = delete; + SYCLInternal(SYCLInternal&&) = delete; Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); @@ -76,8 
+75,9 @@ class SYCLInternal { mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; - uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< - Kokkos::Experimental::SYCL>(reinterpret_cast(this)); + uint32_t m_instance_id = + Kokkos::Tools::Experimental::Impl::idForInstance( + reinterpret_cast(this)); std::optional m_queue; // Using std::vector> reveals a compiler bug when @@ -102,9 +102,9 @@ class SYCLInternal { explicit USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept : m_q(std::move(q)), m_instance_id(instance_id) {} - USMObjectMem(USMObjectMem const&) = delete; - USMObjectMem(USMObjectMem&&) = delete; - USMObjectMem& operator=(USMObjectMem&&) = delete; + USMObjectMem(USMObjectMem const&) = delete; + USMObjectMem(USMObjectMem&&) = delete; + USMObjectMem& operator=(USMObjectMem&&) = delete; USMObjectMem& operator=(USMObjectMem const&) = delete; ~USMObjectMem() { reset(); }; @@ -119,12 +119,12 @@ class SYCLInternal { size_t reserve(size_t n); private: - using AllocationSpace = std::conditional_t< - Kind == sycl::usm::alloc::device, - Kokkos::Experimental::SYCLDeviceUSMSpace, - std::conditional_t>; + using AllocationSpace = + std::conditional_t>; public: // Performs either sycl::memcpy (for USM device memory) or std::memcpy @@ -144,11 +144,10 @@ class SYCLInternal { } void fence() { - SYCLInternal::fence( - m_last_event, - "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for " - "last event to finish", - m_instance_id); + SYCLInternal::fence(m_last_event, + "Kokkos::SYCLInternal::USMObject fence to wait for " + "last event to finish", + m_instance_id); } void register_event(sycl::event event) { @@ -324,13 +323,12 @@ auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) { return SYCLFunctionWrapper(functor, storage); } } // namespace Impl -} // namespace Experimental } // namespace Kokkos #if defined(SYCL_DEVICE_COPYABLE) && defined(KOKKOS_ARCH_INTEL_GPU) 
template struct sycl::is_device_copyable< - Kokkos::Experimental::Impl::SYCLFunctionWrapper> + Kokkos::Impl::SYCLFunctionWrapper> : std::true_type {}; #if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ @@ -352,8 +350,7 @@ static_assert( template struct sycl::is_device_copyable< - const Kokkos::Experimental::Impl::SYCLFunctionWrapper, + const Kokkos::Impl::SYCLFunctionWrapper, std::enable_if_t>>> : std::true_type {}; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp index d212e2dacc3..9498513a3e8 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp @@ -22,13 +22,13 @@ namespace Kokkos { template <> -struct default_outer_direction { +struct default_outer_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; template <> -struct default_inner_direction { +struct default_inner_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; @@ -37,8 +37,8 @@ namespace Impl { // Settings for MDRangePolicy template <> -inline TileSizeProperties get_tile_size_properties( - const Kokkos::Experimental::SYCL& space) { +inline TileSizeProperties get_tile_size_properties( + const Kokkos::SYCL& space) { TileSizeProperties properties; properties.max_threads = space.impl_internal_space_instance()->m_maxWorkgroupSize; @@ -50,8 +50,7 @@ inline TileSizeProperties get_tile_size_properties( // Settings for TeamMDRangePolicy template -struct ThreadAndVectorNestLevel +struct ThreadAndVectorNestLevel : AcceleratorBasedNestLevel {}; } // namespace Impl diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index cb7b1048da3..3dbd63d81ad 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -25,7 
+25,7 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelFor, const typename Policy::index_type m_num_tiles; static constexpr Iterate inner_direction = Policy::inner_direction; } m_policy; - const Kokkos::Experimental::SYCL& m_space; + const Kokkos::SYCL& m_space; sycl::nd_range<3> compute_ranges() const { const auto& m_tile = m_policy.m_tile; @@ -180,12 +180,11 @@ class Kokkos::Impl::ParallelFor, } void execute() const { - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = - m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 8ef43d392c6..da75f3e901d 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -67,7 +67,7 @@ struct FunctorWrapperRangePolicyParallelForCustom { template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; @@ -82,8 +82,8 @@ class Kokkos::Impl::ParallelFor, sycl::event sycl_direct_launch(const Policy& policy, const Functor& functor, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); - sycl::queue& q = space.sycl_queue(); + const 
Kokkos::SYCL& space = policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -160,13 +160,13 @@ class Kokkos::Impl::ParallelFor, void execute() const { if (m_policy.begin() == m_policy.end()) return; - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = m_policy.space() - .impl_internal_space_instance() - ->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_policy.space() + .impl_internal_space_instance() + ->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index cf7f582bc79..d8859cda9f3 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -27,11 +27,11 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using functor_type = FunctorType; - using size_type = ::Kokkos::Experimental::SYCL::size_type; + using size_type = ::Kokkos::SYCL::size_type; private: using member_type = typename Policy::member_type; @@ -52,8 +52,8 @@ class Kokkos::Impl::ParallelFor, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = m_policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -146,11 +146,11 @@ class Kokkos::Impl::ParallelFor, 
scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); @@ -164,10 +164,14 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended(arg_functor, ParallelForTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 0774b24bca1..1e313549757 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -30,7 +30,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -76,7 +76,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -85,7 +85,7 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using 
IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch( functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event()); @@ -370,7 +368,7 @@ class Kokkos::Impl::ParallelReduce template -class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { +class Kokkos::Impl::ParallelReduce< + CombinedFunctorReducerType, Kokkos::RangePolicy, Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -49,7 +48,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -59,8 +58,8 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_reducer_wrapper, diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index b443bcbf902..8f5310cbb21 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -29,7 +29,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using FunctorType = typename 
CombinedFunctorReducerType::functor_type; @@ -46,7 +46,7 @@ class Kokkos::Impl::ParallelReduce(m_scratch_size[1]) * m_league_size)); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_reducer_wrapper, @@ -436,16 +434,21 @@ class Kokkos::Impl::ParallelReduce::accessible), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended( m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), ParallelReduceTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } + // Must be a power of two greater than two, get the one not bigger than the // requested one. 
if ((m_team_size & m_team_size - 1) || m_team_size < 2) { @@ -461,7 +464,7 @@ class Kokkos::Impl::ParallelReduce(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index bdb5b883770..ed7cee2805d 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -145,7 +145,7 @@ class ParallelScanSYCLBase { using value_type = typename Analysis::value_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; - using size_type = Kokkos::Experimental::SYCL::size_type; + using size_type = Kokkos::SYCL::size_type; using index_type = typename Policy::index_type; protected: @@ -161,8 +161,8 @@ class ParallelScanSYCLBase { sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, sycl::event memcpy_event) { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - Kokkos::Experimental::Impl::SYCLInternal& instance = + const Kokkos::SYCL& space = m_policy.space(); + Kokkos::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -374,11 +374,11 @@ class ParallelScanSYCLBase { std::scoped_lock scratch_buffers_lock( instance.m_mutexScratchSpace); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor_reducer, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); @@ -399,7 +399,7 @@ class ParallelScanSYCLBase { 
template class Kokkos::Impl::ParallelScan, - Kokkos::Experimental::SYCL> + Kokkos::SYCL> : private ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; @@ -417,13 +417,12 @@ class Kokkos::Impl::ParallelScan, template class Kokkos::Impl::ParallelScanWithTotal< - FunctorType, Kokkos::RangePolicy, ReturnType, - Kokkos::Experimental::SYCL> + FunctorType, Kokkos::RangePolicy, ReturnType, Kokkos::SYCL> : public ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; - const Kokkos::Experimental::SYCL& m_exec; + const Kokkos::SYCL& m_exec; inline void execute() { Base::impl_execute([&]() { @@ -445,7 +444,7 @@ class Kokkos::Impl::ParallelScanWithTotal< const typename Base::Policy& arg_policy, const ViewType& arg_result_view) : Base(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess::accessible), m_exec(arg_policy.space()) {} }; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 19fad29150e..022f88e0a81 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -33,11 +33,11 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); } -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n) { sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; auto event = q.memcpy(dst, src, n); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -46,9 +46,8 @@ void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, } void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); - 
Experimental::SYCL().fence( - "Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + SYCL().fence("Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); } } // namespace Impl @@ -60,12 +59,9 @@ namespace { std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { switch (allocation_kind) { - case sycl::usm::alloc::host: - return Kokkos::Experimental::SYCLHostUSMSpace::name(); - case sycl::usm::alloc::device: - return Kokkos::Experimental::SYCLDeviceUSMSpace::name(); - case sycl::usm::alloc::shared: - return Kokkos::Experimental::SYCLSharedUSMSpace::name(); + case sycl::usm::alloc::host: return Kokkos::SYCLHostUSMSpace::name(); + case sycl::usm::alloc::device: return Kokkos::SYCLDeviceUSMSpace::name(); + case sycl::usm::alloc::shared: return Kokkos::SYCLSharedUSMSpace::name(); default: Kokkos::abort("bug: unknown sycl allocation type"); return "unreachable"; @@ -75,7 +71,6 @@ std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { } // namespace namespace Kokkos { -namespace Experimental { SYCLDeviceUSMSpace::SYCLDeviceUSMSpace() : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {} @@ -114,12 +109,12 @@ void* allocate_sycl(const char* arg_label, const size_t arg_alloc_size, return hostPtr; } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const size_t arg_alloc_size) const { return allocate(exec_space, "[unlabeled]", arg_alloc_size); } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { @@ -244,7 +239,6 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, Kokkos::Tools::make_space_handle(name()), m_queue); } -} // namespace Experimental } // namespace Kokkos 
//============================================================================== @@ -253,11 +247,11 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, #include KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); + Kokkos::SYCLDeviceUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLSharedUSMSpace); + Kokkos::SYCLSharedUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp index b86cfca413c..5a37da130ca 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -39,8 +39,6 @@ template struct is_sycl_type_space : public std::false_type {}; } // namespace Impl -namespace Experimental { - class SYCLDeviceUSMSpace { public: using execution_space = SYCL; @@ -154,45 +152,40 @@ class SYCLHostUSMSpace { sycl::queue m_queue; }; -} // namespace Experimental - namespace Impl { template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type {}; -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - 
Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space != SYCLSharedUSMSpace::execution_space enum : bool { assignable = false }; enum : bool { accessible = true }; @@ -200,26 +193,24 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space == - // Experimental::SYCLHostUSMSpace::execution_space + // SYCLHostUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLDeviceUSMSpace::execution_space == SYCLSharedUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; @@ -227,14 +218,11 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLDeviceUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLDeviceUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLDeviceUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLDeviceUSMSpace::execution_space enum : bool { deepcopy = true }; }; @@ -243,16 
+231,15 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; // SYCL cannot access HostSpace enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLSharedUSMSpace::execution_space == SYCLDeviceUSMSpace::execution_space // Can access SYCLSharedUSMSpace from Host but cannot access // SYCLDeviceUSMSpace from Host @@ -264,47 +251,38 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLSharedUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLSharedUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLSharedUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLSharedUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from SYCL - enum : bool { - accessible = true - }; // Experimental::SYCLHostUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLHostUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from Host enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // different execution_space enum : bool { accessible = true }; // same accessibility enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::ScratchMemorySpace> { +struct MemorySpaceAccess> { enum : bool { assignable = false }; enum : bool { accessible = true }; enum : bool 
{ deepcopy = false }; @@ -315,11 +293,9 @@ struct MemorySpaceAccess< } // namespace Kokkos KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLSharedUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLHostUSMSpace); #endif #endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index 1e42faa5a83..6359e4a2d9e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -34,7 +34,7 @@ namespace Impl { */ class SYCLTeamMember { public: - using execution_space = Kokkos::Experimental::SYCL; + using execution_space = Kokkos::SYCL; using scratch_memory_space = execution_space::scratch_memory_space; using team_handle = SYCLTeamMember; @@ -126,6 +126,20 @@ class SYCLTeamMember { team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + using value_type = typename WrappedReducerType::value_type; auto sg = m_item.get_sub_group(); const auto sub_group_range = sg.get_local_range()[0]; @@ -139,7 +153,7 @@ class SYCLTeamMember { if (vector_range * shift < sub_group_range) { const value_type tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, 
vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } }; shuffle_combine(1); @@ -153,14 +167,13 @@ class SYCLTeamMember { shift <<= 1) { auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } #endif value = Kokkos::Impl::SYCLReduction::select_from_group(sg, value, 0); const int n_subgroups = sg.get_group_range()[0]; if (n_subgroups == 1) { - reducer.reference() = value; return; } @@ -187,16 +200,15 @@ class SYCLTeamMember { for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && group_id < std::min(start + step_width, n_subgroups)) - reducer.join(reduction_array[group_id - start], value); + wrapped_reducer.join(&reduction_array[group_id - start], &value); sycl::group_barrier(m_item.get_group()); } // Do the final reduction for all threads redundantly value = reduction_array[0]; for (int i = 1; i < std::min(step_width, n_subgroups); ++i) - reducer.join(value, reduction_array[i]); + wrapped_reducer.join(&value, &reduction_array[i]); - reducer.reference() = value; // Make sure that every thread is done using the reduction array. sycl::group_barrier(m_item.get_group()); } @@ -271,8 +283,8 @@ class SYCLTeamMember { const auto update = Kokkos::Impl::SYCLReduction::shift_group_right(sg, value, vector_range); - Type intermediate = (group_id > 0 ? base_data[group_id - 1] : 0) + - (id_in_sg >= vector_range ? update : 0); + Type intermediate = (group_id > 0 ? base_data[group_id - 1] : Type{0}) + + (id_in_sg >= vector_range ? 
update : Type{0}); if (global_accum) { if (id_in_sg == sub_group_range - 1 && @@ -311,6 +323,19 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const { + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const { const auto tidx1 = m_item.get_local_id(1); const auto grange1 = m_item.get_local_range(1); @@ -319,13 +344,13 @@ class SYCLTeamMember { if (grange1 == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = grange1; (i >>= 1);) { tmp2 = Kokkos::Impl::SYCLReduction::shift_group_left(sg, tmp, i); if (static_cast(tidx1) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -336,8 +361,7 @@ class SYCLTeamMember { tmp2 = Kokkos::Impl::SYCLReduction::select_from_group( sg, tmp, (sg.get_local_id() / grange1) * grange1); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; } //---------------------------------------- @@ -531,8 +555,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + 
Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); @@ -541,7 +573,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel_reduce assuming summation. @@ -557,20 +591,28 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); i < loop_boundaries.end; i += loop_boundaries.member.item().get_local_range(0)) { - closure(i, val); + closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + result = value; } /** \brief Inter-thread parallel exclusive prefix sum. 
@@ -657,8 +699,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -670,8 +720,11 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< i < loop_boundaries.end; i += grange0 * grange1) closure(i, value); - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -679,10 +732,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = 
loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -692,11 +751,13 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1; i < loop_boundaries.end; i += grange0 * grange1) - closure(i, val); + closure(i, value); + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- @@ -746,16 +807,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const iType grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, reducer.reference()); + closure(i, value); - loop_boundaries.member.vector_reduce(reducer); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Intra-thread vector parallel_reduce. 
@@ -774,16 +846,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const int grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, result); + closure(i, value); - loop_boundaries.member.vector_reduce(Kokkos::Sum(result)); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp index 17ce59058bd..556ca0d2818 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp @@ -22,8 +22,7 @@ #include template -class Kokkos::Impl::TeamPolicyInternal +class Kokkos::Impl::TeamPolicyInternal : public PolicyTraits { public: using execution_policy = TeamPolicyInternal; @@ -45,7 +44,7 @@ class Kokkos::Impl::TeamPolicyInternal TeamPolicyInternal(TeamPolicyInternal const& p) { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index d55fc6a84ba..79d9e8a8d48 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ 
b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -22,13 +22,14 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { Kokkos::View sycl_global_unique_token_locks( bool deallocate = false); } +namespace Experimental { + // both global and instance Unique Tokens are implemented in the same way // the global version has one shared static lock array underneath // but it can't be a static member variable since we need to acces it on device @@ -42,7 +43,7 @@ class UniqueToken { using size_type = int32_t; explicit UniqueToken(execution_space const& = execution_space()) - : m_locks(Impl::sycl_global_unique_token_locks()) {} + : m_locks(Kokkos::Impl::sycl_global_unique_token_locks()) {} KOKKOS_DEFAULTED_FUNCTION UniqueToken(const UniqueToken&) = default; @@ -75,11 +76,15 @@ class UniqueToken { /// \brief acquire value such that 0 <= value < size() KOKKOS_INLINE_FUNCTION size_type impl_acquire() const { +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto item = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); +#else auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); +#endif std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; @@ -122,11 +127,11 @@ class UniqueToken public: UniqueToken() : UniqueToken( - Kokkos::Experimental::SYCL().concurrency()) {} + Kokkos::SYCL().concurrency()) {} explicit UniqueToken(execution_space const& arg) : UniqueToken( - Kokkos::Experimental::SYCL().concurrency(), arg) {} + Kokkos::SYCL().concurrency(), arg) {} explicit UniqueToken(size_type max_size) : UniqueToken(max_size) {} diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 61db6b34aac..2905733a4de 
100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -23,12 +23,11 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst) { - auto event = exec_space.impl_internal_space_instance()->m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::SYCL& exec_space, void* dst, size_t cnt) { + auto event = + exec_space.impl_internal_space_instance()->m_queue->memset(dst, 0, cnt); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES exec_space.impl_internal_space_instance() ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp index 81d43b31b35..a1fa9e43e08 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -34,7 +34,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -267,7 +266,7 @@ template std::vector partition_space(const Serial&, std::vector const& weights) { static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); // We only care about the number of instances to create and ignore weights @@ -284,7 +283,9 @@ std::vector partition_space(const Serial&, #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #endif // defined( KOKKOS_ENABLE_SERIAL ) diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 34e115eca9b..addcaba009f 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -44,11 +44,16 @@ class ParallelFor, public: inline 
void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = m_iter.m_rp.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->exec(); } template @@ -112,10 +117,15 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 80faec9041d..2ab7b7f8034 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -31,7 +31,7 @@ class ParallelFor, Kokkos::Serial> { const Policy m_policy; template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { m_functor(i); @@ -39,7 +39,7 @@ class ParallelFor, Kokkos::Serial> { } template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -49,10 +49,15 @@ class ParallelFor, Kokkos::Serial> { public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = 
m_policy.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->template exec(); } @@ -79,7 +84,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -88,7 +93,7 @@ class ParallelReduce, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; @@ -108,10 +113,15 @@ class ParallelReduce, auto* internal_instance = m_policy.space().impl_internal_space_instance(); + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -166,7 +176,7 @@ class ParallelScan, const Policy m_policy; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -175,7 +185,7 @@ class ParallelScan, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -194,10 +204,16 @@ class ParallelScan, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, 
especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -235,7 +251,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -244,7 +260,7 @@ class ParallelScanWithTotal, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -262,10 +278,16 @@ class ParallelScanWithTotal, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index a523cc86c97..7a6faf3d9fb 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -223,7 +223,7 @@ class ParallelFor, const size_t 
m_shared; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor(Member(data, ileague, m_league)); @@ -231,7 +231,7 @@ class ParallelFor, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { const TagType t{}; for (int ileague = 0; ileague < m_league; ++ileague) { @@ -247,10 +247,16 @@ class ParallelFor, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -293,7 +299,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor_reducer.get_functor()(Member(data, ileague, m_league), update); @@ -301,7 +307,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { const TagType t{}; @@ -321,10 +327,16 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp index 5905d6d32e1..678d1825047 100644 --- 
a/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp @@ -25,10 +25,16 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -102,9 +108,8 @@ class TaskQueueSpecialization> { template class TaskQueueSpecializationConstrained< - Scheduler, - std::enable_if_t::value>> { + Scheduler, std::enable_if_t>> { public: // Note: Scheduler may be an incomplete type at class scope (but not inside // of the methods, obviously) @@ -215,6 +220,10 @@ extern template class TaskQueue, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 6ad6aabc5a7..527e0940798 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -31,15 +31,11 @@ namespace Impl { // parallel execution space since the specialization for // DefaultHostExecutionSpace is defined elsewhere. 
struct DummyExecutionSpace; -template +template <> struct ZeroMemset< - std::conditional_t::value, - Serial, DummyExecutionSpace>, - View> { - ZeroMemset(const Serial&, const View& dst) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } + std::conditional_t, + Serial, DummyExecutionSpace>> { + ZeroMemset(const Serial&, void* dst, size_t cnt) { std::memset(dst, 0, cnt); } }; } // namespace Impl diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index 3842966cd77..edc9489f67e 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -67,8 +67,9 @@ std::pair int s_thread_pool_size[3] = {0, 0, 0}; -void (*volatile s_current_function)(ThreadsInternal &, const void *); -const void *volatile s_current_function_arg = nullptr; +using s_current_function_type = void (*)(ThreadsInternal &, const void *); +std::atomic s_current_function; +std::atomic s_current_function_arg = nullptr; inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); @@ -79,7 +80,7 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } -void wait_yield(volatile ThreadState &flag, const ThreadState value) { +void wait_yield(std::atomic &flag, const ThreadState value) { while (value == flag) { std::this_thread::yield(); } @@ -135,11 +136,12 @@ ThreadsInternal::ThreadsInternal() ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding - const int entry = reinterpret_cast(s_current_function_arg) < - size_t(s_thread_pool_size[0]) - ? 
reinterpret_cast(s_current_function_arg) - : size_t(Kokkos::hwloc::bind_this_thread( - s_thread_pool_size[0], s_threads_coord)); + const int entry = + reinterpret_cast(s_current_function_arg.load()) < + size_t(s_thread_pool_size[0]) + ? reinterpret_cast(s_current_function_arg.load()) + : size_t(Kokkos::hwloc::bind_this_thread(s_thread_pool_size[0], + s_threads_coord)); // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && @@ -543,7 +545,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. ThreadsInternal *const th = - ((ThreadsInternal * volatile *)s_threads_exec)[ith]; + ((ThreadsInternal *volatile *)s_threads_exec)[ith]; if (th) { wait_yield(th->m_pool_state, ThreadState::Active); } else { diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index a5eb231cb01..130b3433d02 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -60,7 +60,7 @@ class ThreadsInternal { int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - ThreadState volatile m_pool_state; ///< State for global synchronizations + std::atomic m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -96,7 +96,7 @@ class ThreadsInternal { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION auto &state() { return m_pool_state; } KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } @@ -225,7 +225,7 @@ class ThreadsInternal { // to inactive triggers another thread to exit a spinwait // and read the 'reduce_memory'. 
// Must 'memory_fence()' to guarantee that storing the update to - // 'reduce_memory()' will complete before storing the the update to + // 'reduce_memory()' will complete before storing the update to // 'm_pool_state'. memory_fence(); @@ -403,7 +403,7 @@ class ThreadsInternal { static void start(void (*)(ThreadsInternal &, const void *), const void *); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static int in_parallel(); + static int in_parallel(); #endif static void fence(); static void fence(const std::string &); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 59577609ab7..711b1b69261 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -51,7 +51,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -65,7 +65,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 4a89c4fad82..25aab9ebfbc 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -35,7 +35,7 @@ class ParallelFor, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -47,7 +47,7 @@ class 
ParallelFor, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { const TagType t{}; #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -64,7 +64,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -77,7 +77,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index f927d7c6a67..40be3884c3d 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -36,8 +36,8 @@ class ParallelFor, const size_t m_shared; template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_static(); member.next_static()) { functor(member); @@ -45,8 +45,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -55,8 +55,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_dynamic(); member.next_dynamic()) { functor(member); @@ -64,8 +64,8 @@ class ParallelFor, } template - inline 
static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_dynamic(); member.next_dynamic()) { @@ -88,8 +88,12 @@ class ParallelFor, policy.impl_set_vector_length(1); } if (policy.team_size() < 0) { - policy.impl_set_team_size( - policy.team_size_recommended(m_functor, ParallelForTag{})); + int team_size = policy.team_size_recommended(m_functor, ParallelForTag{}); + if (team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index fa63215a9e5..9f28f9bbfcc 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -59,7 +59,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -76,7 +76,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -91,8 +91,8 @@ class ParallelReduce(instance.reduce_memory())); + reference_type update = + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; @@ -100,7 +100,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -55,7 +55,7 @@ class ParallelReduce, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { const TagType t{}; @@ -73,7 +73,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), @@ -89,7 +89,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index 4db310701f9..69527ee3e65 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -42,7 +42,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, reference_type update) { for (; member.valid_static(); member.next_static()) { functor(member, update); @@ -50,7 +50,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, 
reference_type update) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -106,9 +106,14 @@ class ParallelReduce could not find " + "a valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp index 62f34d741ff..d54f4ca952e 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp @@ -39,7 +39,7 @@ class ParallelScan, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -52,7 +52,7 @@ class ParallelScan, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; @@ -119,7 +119,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -132,7 +132,7 @@ class ParallelScanWithTotal, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp index 
3df9dc07bf4..0f9a77f2afa 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -108,7 +108,7 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value) { Kokkos::store_fence(); uint32_t i = 0; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp index b98b6dbb73b..7ab43cdb7af 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -20,6 +20,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,7 +35,7 @@ enum class WaitMode : int { void host_thread_yield(const uint32_t i, const WaitMode mode); -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value); } // namespace Impl diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index a3501a437d2..f627e0d47a5 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -143,8 +143,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); @@ -164,8 +164,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; f(value); if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); memory_fence(); @@ -186,7 +186,7 @@ class ThreadsExecTeamMember { 
KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: using type = - typename if_c::type; + std::conditional_t; if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -215,52 +215,65 @@ class ThreadsExecTeamMember { } template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - team_reduce(const ReducerType& reducer, - const typename ReducerType::value_type contribution) const { + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(const ReducerType& reducer, + typename ReducerType::value_type& contribution) const { KOKKOS_IF_ON_DEVICE(((void)reducer; (void)contribution;)) - KOKKOS_IF_ON_HOST(( - using value_type = typename ReducerType::value_type; - // Make sure there is enough scratch space: - using type = typename if_c::type; - - type* const local_value = ((type*)m_instance->scratch_memory()); + KOKKOS_IF_ON_HOST( + (using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), contribution); + reducer.reference() = contribution;)) + } - // Set this thread's contribution - if (team_rank() != team_size() - 1) { *local_value = contribution; } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + const WrappedReducerType& wrapped_reducer, + typename WrappedReducerType::value_type& contribution) const { + using value_type = typename WrappedReducerType::value_type; + // Make sure there is enough scratch space: + using type = std::conditional_t; + + type* const local_value = ((type*)m_instance->scratch_memory()); + + // Set this thread's contribution + if (team_rank() != team_size() - 1) { + *local_value = contribution; + } - // Fence to make sure the base team member has access: - memory_fence(); + // Fence to make sure the base team member has access: + memory_fence(); - if 
(team_fan_in()) { - // The last thread to synchronize returns true, all other threads - // wait for team_fan_out() - type* const team_value = ((type*)m_team_base[0]->scratch_memory()); + if (team_fan_in()) { + // The last thread to synchronize returns true, all other threads + // wait for team_fan_out() + type* const team_value = ((type*)m_team_base[0]->scratch_memory()); - *team_value = contribution; - // Join to the team value: - for (int i = 1; i < m_team_size; ++i) { - reducer.join(*team_value, - *((type*)m_team_base[i]->scratch_memory())); - } + *team_value = contribution; + // Join to the team value: + for (int i = 1; i < m_team_size; ++i) { + wrapped_reducer.join(team_value, + ((type*)m_team_base[i]->scratch_memory())); + } - // Team base thread may "lap" member threads so copy out to their - // local value. - for (int i = 1; i < m_team_size; ++i) { - *((type*)m_team_base[i]->scratch_memory()) = *team_value; - } + // Team base thread may "lap" member threads so copy out to their + // local value. 
+ for (int i = 1; i < m_team_size; ++i) { + *((type*)m_team_base[i]->scratch_memory()) = *team_value; + } - // Fence to make sure all team members have access - memory_fence(); - } + // Fence to make sure all team members have access + memory_fence(); + } - team_fan_out(); + team_fan_out(); - // Value was changed by the team base - reducer.reference() = *local_value;)) + contribution = *local_value; } /** \brief Intra-team exclusive prefix sum with team_rank() ordering @@ -278,8 +291,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_DEVICE(((void)global_accum; return value;)) KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; volatile type* const work_value = ((type*)m_instance->scratch_memory()); @@ -887,19 +900,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType intermediate; - Sum sum(intermediate); - sum.init(intermediate); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - ValueType tmp = ValueType(); - lambda(i, tmp); - intermediate += tmp; + lambda(i, value); } - loop_boundaries.thread.team_reduce(sum, intermediate); - result = sum.reference(); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } template @@ -907,15 +926,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const 
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { lambda(i, value); } - loop_boundaries.thread.team_reduce(reducer, value); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } } // namespace Kokkos @@ -950,11 +979,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } template @@ -962,11 +1004,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - 
reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel exclusive prefix sum. Executes @@ -1049,7 +1104,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( typename Impl::FunctorAnalysis, FunctorType, void>::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of closure and return type"); ValueType scan_val = ValueType(); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index c88d66db5f9..5fed92db26d 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -36,13 +36,13 @@ class ParallelFor, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/lib/kokkos/core/src/View/Kokkos_BasicView.hpp b/lib/kokkos/core/src/View/Kokkos_BasicView.hpp new file mode 100644 index 00000000000..29eafca62ee --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_BasicView.hpp @@ -0,0 +1,652 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_BASIC_VIEW_HPP +#define KOKKOS_BASIC_VIEW_HPP +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// FIXME: we need to make this work for not using our mdspan impl +#define KOKKOS_IMPL_NO_UNIQUE_ADDRESS _MDSPAN_NO_UNIQUE_ADDRESS +namespace Kokkos::Impl { + +constexpr inline struct SubViewCtorTag { + explicit SubViewCtorTag() = default; +} subview_ctor_tag{}; + +template +struct KokkosSliceToMDSpanSliceImpl { + using type = T; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(const T &s) { return s; } +}; + +template <> +struct KokkosSliceToMDSpanSliceImpl { + using type = full_extent_t; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(Kokkos::ALL_t) { + return full_extent; + } +}; + +template +using kokkos_slice_to_mdspan_slice = + typename KokkosSliceToMDSpanSliceImpl::type; + +template +KOKKOS_INLINE_FUNCTION constexpr decltype(auto) +transform_kokkos_slice_to_mdspan_slice(const T &s) { + return KokkosSliceToMDSpanSliceImpl::transform(s); +} + +// We do have implementation detail versions of these in our mdspan impl +// However they are not part of the public standard interface +template +struct is_layout_right_padded : public std::false_type {}; + +template +struct is_layout_right_padded> + : public std::true_type {}; + +template +struct is_layout_left_padded : public std::false_type {}; + +template 
+struct is_layout_left_padded> + : public std::true_type {}; + +template +class BasicView { + public: + using mdspan_type = + mdspan; + using extents_type = typename mdspan_type::extents_type; + using layout_type = typename mdspan_type::layout_type; + using accessor_type = typename mdspan_type::accessor_type; + using mapping_type = typename mdspan_type::mapping_type; + using element_type = typename mdspan_type::element_type; + using value_type = typename mdspan_type::value_type; + using index_type = typename mdspan_type::index_type; + using size_type = typename mdspan_type::size_type; + using rank_type = typename mdspan_type::rank_type; + using data_handle_type = typename mdspan_type::data_handle_type; + using reference = typename mdspan_type::reference; + using memory_space = typename accessor_type::memory_space; + using execution_space = typename memory_space::execution_space; + + // For now View and BasicView will have a restriction that the data handle + // needs to be convertible to element_type* and vice versa + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + + KOKKOS_FUNCTION static constexpr rank_type rank() noexcept { + return extents_type::rank(); + } + KOKKOS_FUNCTION static constexpr rank_type rank_dynamic() noexcept { + return extents_type::rank_dynamic(); + } + KOKKOS_FUNCTION static constexpr size_t static_extent(rank_type r) noexcept { + return extents_type::static_extent(r); + } + KOKKOS_FUNCTION constexpr index_type extent(rank_type r) const noexcept { + return m_map.extents().extent(r); + }; + + protected: + // These are pre-condition checks which are unconditionally (i.e. 
in release + // mode) enabled in Kokkos::View 4.4 + template + KOKKOS_FUNCTION static constexpr void check_basic_view_constructibility( + [[maybe_unused]] const OtherMapping &rhs) { + using src_t = typename OtherMapping::layout_type; + using dst_t = layout_type; + constexpr size_t rnk = mdspan_type::rank(); + if constexpr (!std::is_same_v) { + if constexpr (Impl::is_layout_left_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == 0 ? rhs.stride(1) : rhs.extents().extent(r)); + } + } + } + if constexpr (Impl::is_layout_right_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == rnk ? 
rhs.stride(r - 2) + : rhs.extents().extent(r - 1)); + } + } + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r); + } + } else if constexpr (Impl::is_layout_left_padded::value && + rnk > 1) { + if (rhs.stride(1) != rhs.extents().extent(0)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r - 1); + } + } + } else if constexpr (Impl::is_layout_right_padded::value && + rnk > 1) { + if (rhs.stride(rnk - 2) != rhs.extents().extent(rnk - 1)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + } + } + + public: + KOKKOS_DEFAULTED_FUNCTION constexpr BasicView() = default; + + KOKKOS_FUNCTION constexpr BasicView(const mdspan_type &other) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()){}; + KOKKOS_FUNCTION constexpr BasicView(mdspan_type &&other) + : m_ptr(std::move(other.data_handle())), + m_map(std::move(other.mapping())), + m_acc(std::move(other.accessor())){}; + + template + // requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit constexpr BasicView( + std::enable_if_t, + data_handle_type> + p, + OtherIndexTypes... 
exts) + : m_ptr(std::move(p)), + m_map(extents_type(static_cast(std::move(exts))...)), + m_acc{} {} + + template + // When doing C++20 we should switch to this, the conditional explicit we + // can't do in 17 + // requires(std::is_constructible_v>) + // explicit(Size != rank_dynamic()) + KOKKOS_FUNCTION constexpr BasicView( + std::enable_if_t< + std::is_constructible_v>, + data_handle_type> + p, + const Array &exts) + : m_ptr(std::move(p)), m_map(extents_type(exts)), m_acc{} {} + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, + const extents_type &exts) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v && + std::is_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(exts), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(m), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m, + const accessor_type &a) + : m_ptr(std::move(p)), m_map(m), m_acc(a) {} + + template +// requires(std::is_constructible_v::mdspan_type>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const BasicView &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, typename BasicView::mdspan_type>, + void *> = nullptr) + : m_ptr(other.m_ptr), m_map(other.m_map), m_acc(other.m_acc) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: 
incompatible extents for View construction"); + } + + template +// requires(std::is_constructible_v>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const mdspan &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, mdspan>, + void *> = nullptr) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: incompatible extents for View construction"); + } + + // Allocating constructors specific to BasicView + /// + /// Construct from a given mapping + /// + explicit constexpr BasicView(const std::string &label, + const mapping_type &mapping) + : BasicView(view_alloc(label), mapping) {} + + /// + /// Construct from a given extents + /// + explicit constexpr BasicView(const std::string &label, + const extents_type &ext) + : BasicView(view_alloc(label), mapping_type{ext}) {} + + private: + template + data_handle_type create_data_handle( + const Impl::ViewCtorProp &arg_prop, + const typename mdspan_type::mapping_type &arg_mapping) { + constexpr bool has_exec = Impl::ViewCtorProp::has_execution_space; + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, memory_space{}, execution_space{}); + using alloc_prop = decltype(prop_copy); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. 
+ Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + return data_handle_type(Impl::make_shared_allocation_record( + arg_mapping.required_span_size(), + Impl::get_property(prop_copy), + Impl::get_property(prop_copy), + has_exec ? std::optional{Impl::get_property< + Impl::ExecutionSpaceTag>(prop_copy)} + : std::optional{std::nullopt}, + std::integral_constant(), + std::integral_constant())); + } + + public: + template + // requires(!Impl::ViewCtorProp::has_pointer) + explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView(create_data_handle(arg_prop, arg_mapping), arg_mapping) {} + + template + // requires(Impl::ViewCtorProp::has_pointer) + KOKKOS_FUNCTION explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView( + data_handle_type(Impl::get_property(arg_prop)), + arg_mapping) {} + + protected: + template + KOKKOS_INLINE_FUNCTION BasicView( + Impl::SubViewCtorTag, + const BasicView &src_view, + SliceSpecifiers... slices) + : BasicView(submdspan( + src_view.to_mdspan(), + Impl::transform_kokkos_slice_to_mdspan_slice(slices)...)) {} + + public: + //---------------------------------------- + // Conversion to MDSpan + template , + mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr + operator mdspan() const { + return mdspan_type(m_ptr, m_map, m_acc); + } + + // Here we use an overload instead of a default parameter as a workaround + // to a potential compiler bug with clang 17. 
It may be present in other + // compilers + template >> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan() const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), static_cast(accessor())); + } + + template < + class OtherAccessorType = AccessorPolicy, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType &other_accessor) const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), other_accessor); + } + + KOKKOS_FUNCTION void assign_data(element_type *ptr) { m_ptr = ptr; } + + // ========================= mdspan ================================= + + // [mdspan.mdspan.members], members + +// Introducing the C++20 and C++23 variants of the operators already +#ifndef KOKKOS_ENABLE_CXX17 +#ifndef KOKKOS_ENABLE_CXX20 + // C++23 only operator[] + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator[]( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#endif + + // C++20 operator() + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) 
&& + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#else + // C++17 variant of operator() + + // Some weird unexplained issue in compiling the SFINAE version with CUDA/MSVC + // So we just use post factor check here with static_assert +#if defined(KOKKOS_ENABLE_CUDA) && defined(_WIN32) + template + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + static_assert((std::is_convertible_v && ...)); + static_assert( + (std::is_nothrow_constructible_v && ...)); + static_assert((sizeof...(OtherIndexTypes)) == rank()); + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#else + template + KOKKOS_FUNCTION constexpr std::enable_if_t< + ((std::is_convertible_v && ...)) && + ((std::is_nothrow_constructible_v && + ...)) && + ((sizeof...(OtherIndexTypes)) == rank()), + reference> + operator()(OtherIndexTypes... 
indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#endif +#endif + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside size() + template + KOKKOS_FUNCTION constexpr size_type size_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return 0u; + return ((static_cast(m_map.extents().extent(Idxs))) * ... * + size_type(1)); + } + + public: + KOKKOS_FUNCTION constexpr size_type size() const noexcept { + return size_impl(std::make_index_sequence()); + } + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside empty() + template + KOKKOS_FUNCTION constexpr bool empty_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return true; + return (rank() > 0) && + ((m_map.extents().extent(Idxs) == index_type(0)) || ... 
|| false); + } + + public: + [[nodiscard]] KOKKOS_FUNCTION constexpr bool empty() const noexcept { + return empty_impl(std::make_index_sequence()); + } + + KOKKOS_FUNCTION friend constexpr void swap(BasicView &x, + BasicView &y) noexcept { + kokkos_swap(x.m_ptr, y.m_ptr); + kokkos_swap(x.m_map, y.m_map); + kokkos_swap(x.m_acc, y.m_acc); + } + + KOKKOS_FUNCTION constexpr const extents_type &extents() const noexcept { + return m_map.extents(); + }; + KOKKOS_FUNCTION constexpr const data_handle_type &data_handle() + const noexcept { + return m_ptr; + }; + KOKKOS_FUNCTION constexpr const mapping_type &mapping() const noexcept { + return m_map; + }; + KOKKOS_FUNCTION constexpr const accessor_type &accessor() const noexcept { + return m_acc; + }; + + KOKKOS_FUNCTION static constexpr bool is_always_unique() noexcept { + return mapping_type::is_always_unique(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return mapping_type::is_always_exhaustive(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_strided() noexcept { + return mapping_type::is_always_strided(); + }; + + KOKKOS_FUNCTION constexpr bool is_unique() const { + return m_map.is_unique(); + }; + KOKKOS_FUNCTION constexpr bool is_exhaustive() const { + return m_map.is_exhaustive(); + }; + KOKKOS_FUNCTION constexpr bool is_strided() const { + return m_map.is_strided(); + }; + KOKKOS_FUNCTION constexpr index_type stride(rank_type r) const { + return m_map.stride(r); + }; + + protected: +#ifndef __NVCC__ + KOKKOS_IMPL_NO_UNIQUE_ADDRESS data_handle_type m_ptr{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS mapping_type m_map{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS accessor_type m_acc{}; +#else + data_handle_type m_ptr{}; + mapping_type m_map{}; + accessor_type m_acc{}; +#endif + + template + friend class BasicView; +}; +} // namespace Kokkos::Impl + +#endif diff --git a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp index 1ade75692f1..eb11630b21b 
100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -26,6 +26,7 @@ static_assert(false, #include #include #include +#include #include #include @@ -41,22 +42,8 @@ bool is_zero_byte(const T& x) { return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } -//---------------------------------------------------------------------------- - -/* - * The construction, assignment to default, and destruction - * are merged into a single functor. - * Primarily to work around an unresolved CUDA back-end bug - * that would lose the destruction cuda device function when - * called from the shared memory tracking destruction. - * Secondarily to have two fewer partial specializations. - */ -template ::value> -struct ViewValueFunctor; - template -struct ViewValueFunctor { +struct ViewValueFunctor { using ExecSpace = typename DeviceType::execution_space; struct DestroyTag {}; @@ -68,20 +55,31 @@ struct ViewValueFunctor { std::string name; bool default_exec_space; - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - operator()(ConstructTag const&, const size_t i) const { + template + KOKKOS_FUNCTION + std::enable_if_t> + operator()(ConstructTag, const size_t i) const { new (ptr + i) ValueType(); } - KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, - const size_t i) const { + KOKKOS_FUNCTION void operator()(DestroyTag, const size_t i) const { + // When instantiating a View on host execution space with a host only + // destructor the workaround for CUDA device symbol instantiation tries to + // still compile a destruction kernel for the device, and issues a warning + // for host from host-device +#ifdef KOKKOS_ENABLE_CUDA + if constexpr (std::is_same_v) { + KOKKOS_IF_ON_DEVICE(((ptr + i)->~ValueType();)) + } else { + KOKKOS_IF_ON_HOST(((ptr + i)->~ValueType();)) + } +#else (ptr + i)->~ValueType(); +#endif } - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; + 
ViewValueFunctor() = default; + ViewValueFunctor(const ViewValueFunctor&) = default; ViewValueFunctor& operator=(const ViewValueFunctor&) = default; ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, @@ -104,49 +102,6 @@ struct ViewValueFunctor { functor_instantiate_workaround(); } - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_dispatch() { - ValueType value{}; -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_dispatch() { - parallel_for_implementation(); - } - template void parallel_for_implementation() { using PolicyType = @@ -172,24 +127,62 @@ struct ViewValueFunctor { const Kokkos::Impl::ParallelFor closure( *this, policy); closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); if (Kokkos::Profiling::profileLibraryLoaded()) { 
Kokkos::Profiling::endParallelFor(kpID); } + if (default_exec_space || std::is_same_v) { + space.fence(std::is_same_v + ? "Kokkos::View::destruction before deallocate" + : "Kokkos::View::initialization"); + } } - void construct_shared_allocation() { construct_dispatch(); } + // Shortcut for zero initialization + void zero_memset_implementation() { + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + // We are not really using parallel_for here but using beginParallelFor + // instead of begin_parallel_for (and adding "via memset") is the best + // we can do to indicate that this is not supposed to be tunable (and + // doesn't really execute a parallel_for). + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "] via memset", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } + + (void)ZeroMemset(space, ptr, n * sizeof(ValueType)); + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space) { + space.fence("Kokkos::View::initialization via memset"); + } + } + + void construct_shared_allocation() { +// On A64FX memset seems to do the wrong thing with regards to first touch +// leading to the significant performance issues +#ifndef KOKKOS_ARCH_A64FX + if constexpr (std::is_trivial_v) { + // value-initialization is equivalent to filling with zeros + zero_memset_implementation(); + } else +#endif + parallel_for_implementation(); + } void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v) { + // do nothing, don't bother calling the destructor + } else { #ifdef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - if constexpr (std::is_same_v) - for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); - else + if constexpr (std::is_same_v) + for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); + else #endif - { - parallel_for_implementation(); + parallel_for_implementation(); } } @@ 
-206,114 +199,6 @@ struct ViewValueFunctor { } }; -template -struct ViewValueFunctor { - using ExecSpace = typename DeviceType::execution_space; - using PolicyType = Kokkos::RangePolicy>; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t i) const { ptr[i] = ValueType(); } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) {} - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) {} - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_shared_allocation() { - // Shortcut for zero initialization -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - ValueType value{}; - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). 
- Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_shared_allocation() { - parallel_for_implementation(); - } - - void parallel_for_implementation() { - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } - - void destroy_shared_allocation() {} -}; - template struct ViewValueFunctorSequentialHostInit { using ExecSpace = typename DeviceType::execution_space; @@ -358,6 +243,63 @@ struct ViewValueFunctorSequentialHostInit { } }; +template +Kokkos::Impl::SharedAllocationRecord* make_shared_allocation_record( + const size_t& required_span_size, std::string_view label, + const MemorySpace& memory_space, + const std::optional exec_space, + std::bool_constant, std::bool_constant) { + static_assert(SpaceAccessibility::accessible); + + // Use this for constructing and destroying 
the view + using device_type = Kokkos::Device; + using functor_type = std::conditional_t< + SequentialInit, + ViewValueFunctorSequentialHostInit, + ViewValueFunctor>; + using record_type = + Kokkos::Impl::SharedAllocationRecord; + + /* Force alignment of allocations on on 8 byte boundaries even for + * element types smaller than 8 bytes */ + static constexpr std::size_t align_mask = 0x7; + + // Calculate the total size of the memory, in bytes, and make sure it is + // byte-aligned + const std::size_t alloc_size = + (required_span_size * sizeof(ElementType) + align_mask) & ~align_mask; + + auto* record = + exec_space + ? record_type::allocate(*exec_space, memory_space, std::string{label}, + alloc_size) + : record_type::allocate(memory_space, std::string{label}, alloc_size); + + auto ptr = static_cast(record->data()); + + auto functor = + exec_space ? functor_type(*exec_space, ptr, required_span_size, + std::string{label}) + : functor_type(ptr, required_span_size, std::string{label}); + + // Only initialize if the allocation is non-zero. + // May be zero if one of the dimensions is zero. + if constexpr (Initialize) { + if (alloc_size) { + // Assume destruction is only required when construction is requested. + // The ViewValueFunctor has both value construction and destruction + // operators. 
+ record->m_destroy = std::move(functor); + + // Construct values + record->m_destroy.construct_shared_allocation(); + } + } + + return record; +} + } // namespace Kokkos::Impl #endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp similarity index 96% rename from lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp index 23d4c2524c7..f77066b70f5 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_ATOMIC_VIEW_HPP -#define KOKKOS_ATOMIC_VIEW_HPP +#ifndef KOKKOS_VIEWATOMIC_HPP +#define KOKKOS_VIEWATOMIC_HPP #include #include @@ -44,10 +44,10 @@ class AtomicDataElement { } KOKKOS_INLINE_FUNCTION - void inc() const { Kokkos::atomic_increment(ptr); } + void inc() const { Kokkos::atomic_inc(ptr); } KOKKOS_INLINE_FUNCTION - void dec() const { Kokkos::atomic_decrement(ptr); } + void dec() const { Kokkos::atomic_dec(ptr); } KOKKOS_INLINE_FUNCTION const_value_type operator++() const { @@ -215,7 +215,7 @@ class AtomicViewDataHandle { } KOKKOS_INLINE_FUNCTION - operator typename ViewTraits::value_type*() const { return ptr; } + operator typename ViewTraits::value_type *() const { return ptr; } }; } // namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp similarity index 84% rename from lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp index 379180ae643..f0804747172 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp @@ -72,8 +72,8 @@ struct ViewCtorProp {}; */ template struct ViewCtorProp> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + 
ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = CommonViewAllocProp; @@ -92,8 +92,8 @@ struct ViewCtorProp || std::is_same_v || std::is_same_v>, P> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = P; @@ -106,14 +106,14 @@ struct ViewCtorProp || /* Map input label type to std::string */ template struct ViewCtorProp::value>, Label> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = std::string; ViewCtorProp(const type &arg) : value(arg) {} - ViewCtorProp(type &&arg) : value(arg) {} + ViewCtorProp(type &&arg) : value(std::move(arg)) {} type value; }; @@ -122,8 +122,8 @@ template struct ViewCtorProp::value || Kokkos::is_execution_space::value>, Space> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = Space; @@ -135,8 +135,8 @@ struct ViewCtorProp::value || template struct ViewCtorProp { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = T *; @@ -213,14 +213,19 @@ struct ViewCtorProp : public ViewCtorProp... { using execution_space = typename var_execution_space::type; using pointer_type = typename var_pointer::type; - /* Copy from a matching argument list. - * Requires std::is_same< P , ViewCtorProp< void , Args >::value ... - */ - template - inline ViewCtorProp(Args const &... args) : ViewCtorProp(args)... 
{} + // Construct from a matching argument list. + // + // Note that if P is empty, this constructor is the default constructor. + // On the other hand, if P is not empty, the constraint implies that + // there is no default constructor. + template , Args &&>...>>> + ViewCtorProp(Args &&...args) + : ViewCtorProp(std::forward(args))... {} template - KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &... args) + KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &...args) : ViewCtorProp(arg0), ViewCtorProp::type>(args)... {} @@ -252,7 +257,7 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop) { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, [[maybe_unused]] const Property &property, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -302,7 +307,7 @@ template struct WithPropertiesIfUnset, Property, Properties...> { static constexpr auto apply_prop(const ViewCtorProp &view_ctor_prop, const Property &prop, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -328,7 +333,7 @@ struct WithPropertiesIfUnset, Property, Properties...> { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, - const Properties &... properties) { + const Properties &...properties) { return WithPropertiesIfUnset, Properties...>::apply_prop( view_ctor_prop, properties...); } @@ -437,6 +442,48 @@ using ViewAllocateWithoutInitializing = Impl::ViewCtorProp; +inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; + +inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; + +inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; + +/** \brief Create View allocation parameter bundle from argument list. 
+ * + * Valid argument list members are: + * 1) label as a "string" or std::string + * 2) memory space instance of the View::memory_space type + * 3) execution space instance compatible with the View::memory_space + * 4) Kokkos::WithoutInitializing to bypass initialization + * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory + * alignment + */ +template +auto view_alloc(Args &&...args) { + using return_type = Impl::ViewCtorProp>::type...>; + + static_assert(!return_type::has_pointer, + "Cannot give pointer-to-memory for view allocation"); + + return return_type(std::forward(args)...); +} + +template +KOKKOS_INLINE_FUNCTION + Impl::ViewCtorProp::type...> + view_wrap(Args const &...args) { + using return_type = + Impl::ViewCtorProp::type...>; + + static_assert(!return_type::has_memory_space && + !return_type::has_execution_space && + !return_type::has_label && return_type::has_pointer, + "Must only give pointer-to-memory for view wrapping"); + + return return_type(args...); +} + } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp similarity index 96% rename from lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp index 04c0c9aeede..37b6e2802fc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp @@ -60,8 +60,8 @@ struct rank_dynamic { static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ }; \ template \ @@ -72,8 +72,8 @@ struct rank_dynamic { struct ViewDimension##R<0u, RD> { \ static constexpr size_t ArgN##R = 0; \ std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ }; \ @@ -149,8 +149,8 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension static constexpr unsigned rank = sizeof...(Vals); static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; ViewDimension& operator=(const ViewDimension&) = default; KOKKOS_INLINE_FUNCTION @@ -370,8 +370,7 @@ struct ViewDataAnalysis { // ValueType is opportunity for partial specialization. // Must match array analysis when this default template is used. static_assert( - std::is_same::value); + std::is_same_v); public: using specialize = void; // No specialization diff --git a/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp b/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp new file mode 100644 index 00000000000..fd406d58cca --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp @@ -0,0 +1,1604 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWLEGACY_HPP +#define KOKKOS_VIEWLEGACY_HPP + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#include +#endif +#include + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. 
+ * + * Valid ways in which template arguments may be specified: + * - View< DataType > + * - View< DataType , Layout > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, double* + * indicates a one-dimensional array of \c double with run-time + * dimension, and int*[3] a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * Space. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. + * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on + * Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. 
+ * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View out, + * View in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View in_rr = in; + * // ... do something with in_rr and out ... + * } + * \endcode + */ + +} // namespace Kokkos + +namespace Kokkos { + +template +struct is_always_assignable_impl; + +template +struct is_always_assignable_impl, + Kokkos::View> { + using mapping_type = Kokkos::Impl::ViewMapping< + typename Kokkos::View::traits, + typename Kokkos::View::traits, + typename Kokkos::View::traits::specialize>; + + constexpr static bool value = + mapping_type::is_assignable && + static_cast(Kokkos::View::rank_dynamic) >= + static_cast(Kokkos::View::rank_dynamic); +}; + +template +using is_always_assignable = is_always_assignable_impl< + std::remove_reference_t, + std::remove_const_t>>; + +template +inline constexpr bool is_always_assignable_v = + is_always_assignable::value; + +template +constexpr bool is_assignable(const Kokkos::View& dst, + const Kokkos::View& src) { + using DstTraits = typename Kokkos::View::traits; + using SrcTraits = typename Kokkos::View::traits; + using mapping_type = + Kokkos::Impl::ViewMapping; + + return is_always_assignable_v, + Kokkos::View> || + (mapping_type::is_assignable && + ((DstTraits::dimension::rank_dynamic >= 1) || + (dst.static_extent(0) == src.extent(0))) && + ((DstTraits::dimension::rank_dynamic >= 
2) || + (dst.static_extent(1) == src.extent(1))) && + ((DstTraits::dimension::rank_dynamic >= 3) || + (dst.static_extent(2) == src.extent(2))) && + ((DstTraits::dimension::rank_dynamic >= 4) || + (dst.static_extent(3) == src.extent(3))) && + ((DstTraits::dimension::rank_dynamic >= 5) || + (dst.static_extent(4) == src.extent(4))) && + ((DstTraits::dimension::rank_dynamic >= 6) || + (dst.static_extent(5) == src.extent(5))) && + ((DstTraits::dimension::rank_dynamic >= 7) || + (dst.static_extent(6) == src.extent(6))) && + ((DstTraits::dimension::rank_dynamic >= 8) || + (dst.static_extent(7) == src.extent(7)))); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +class View; + +template +struct is_view : public std::false_type {}; + +template +struct is_view> : public std::true_type {}; + +template +struct is_view> : public std::true_type {}; + +template +inline constexpr bool is_view_v = is_view::value; + +template +class View : public ViewTraits { + private: + template + friend class View; + template + friend class Kokkos::Impl::ViewMapping; + + using view_tracker_type = Kokkos::Impl::ViewTracker; + + public: + using traits = ViewTraits; + + private: + using map_type = + Kokkos::Impl::ViewMapping; + template + friend struct Kokkos::Impl::ViewTracker; + using hooks_policy = typename traits::hooks_policy; + + view_tracker_type m_track; + map_type m_map; + + public: + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + using array_type = + View; + + /** \brief Compatible view of const data type */ + using const_type = + View; + + /** \brief Compatible view of non-const data type 
*/ + using non_const_type = + View; + + /** \brief Compatible host mirror view */ + using host_mirror_type = + View, + typename traits::hooks_policy>; + + /** \brief Compatible host mirror view */ + using HostMirror = host_mirror_type; + + /** \brief Unified types */ + using uniform_type = typename Impl::ViewUniformType::type; + using uniform_const_type = + typename Impl::ViewUniformType::const_type; + using uniform_runtime_type = + typename Impl::ViewUniformType::runtime_type; + using uniform_runtime_const_type = + typename Impl::ViewUniformType::runtime_const_type; + using uniform_nomemspace_type = + typename Impl::ViewUniformType::nomemspace_type; + using uniform_const_nomemspace_type = + typename Impl::ViewUniformType::const_nomemspace_type; + using uniform_runtime_nomemspace_type = + typename Impl::ViewUniformType::runtime_nomemspace_type; + using uniform_runtime_const_nomemspace_type = + typename Impl::ViewUniformType::runtime_const_nomemspace_type; + + using reference_type = typename map_type::reference_type; + using pointer_type = typename map_type::pointer_type; + + // Typedefs from mdspan + // using extents_type -> not applicable + // Defining layout_type here made MSVC+CUDA fail + // using layout_type = typename traits::array_layout; + // using accessor_type -> not applicable + // using mapping_type -> not applicable + using element_type = typename traits::value_type; + // using value_type -> conflicts with traits::value_type + using index_type = typename traits::memory_space::size_type; + // using size_type -> already from traits::size_type; where it is + // memory_space::size_type + using rank_type = size_t; + using data_handle_type = pointer_type; + using reference = reference_type; + + //---------------------------------------- + // Domain rank and extents + + static constexpr Impl::integral_constant + rank = {}; + static constexpr Impl::integral_constant + rank_dynamic = {}; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + enum {Rank 
KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = + map_type::Rank}; +#endif + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + extent(const iType& r) const noexcept { + return m_map.extent(r); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return map_type::static_extent(r); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> + extent_int(const iType& r) const noexcept { + return static_cast(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. + */ + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * + m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * + m_map.dimension_6() * m_map.dimension_7(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + stride(iType r) const { + return ( + r == 0 + ? m_map.stride_0() + : (r == 1 + ? m_map.stride_1() + : (r == 2 + ? m_map.stride_2() + : (r == 3 + ? 
m_map.stride_3() + : (r == 4 + ? m_map.stride_4() + : (r == 5 + ? m_map.stride_5() + : (r == 6 + ? m_map.stride_6() + : m_map.stride_7()))))))); + } + + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range span is the span which contains all members. + + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference_v + }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { + return m_map.span_is_contiguous(); + } + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return m_map.data() != nullptr; + } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_map.data(); + } + + //---------------------------------------- + // Allow specializations to query their specialized map + + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::ViewMapping& + impl_map() const { + return m_map; + } + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::SharedAllocationTracker& impl_track() const { + return m_track.m_tracker; + } + //---------------------------------------- + + private: + static constexpr bool is_layout_left = + std::is_same_v; + + static constexpr bool is_layout_right = + std::is_same_v; + + static constexpr bool is_layout_stride = + std::is_same_v; + + static constexpr bool is_default_map = + std::is_void_v && + (is_layout_left || is_layout_right || is_layout_stride); + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); \ + Kokkos::Impl::view_verify_operator_bounds( \ + __VA_ARGS__); + +#else + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) 
\ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); + +#endif + + template + static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); + } + + template + static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); + } + + public: + //------------------------------ + // Rank 1 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 1 operator[] + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && !is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return 
m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + operator()(I0 i0, I1 i1) const { + check_operator_parens_valid_args(i0, i1); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which + // have "inlined" versions above + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + ((0 == rank) || !is_default_map)), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.reference(indices...); + } + + //------------------------------ + // Rank 0 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> + access(Is... extra) const { + check_access_member_function_valid_args(extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) + return m_map.reference(); + } + + //------------------------------ + // Rank 1 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && !is_default_map), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (2 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + return m_map.reference(i0, i1); + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + //------------------------------ + // Rank 3 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.reference(i0, i1, i2); + } + + //------------------------------ + // Rank 4 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.reference(i0, i1, i2, i3); + } + + //------------------------------ + // Rank 5 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) + return m_map.reference(i0, i1, i2, i3, i4); + } + + //------------------------------ + // Rank 6 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5); + } + + //------------------------------ + // Rank 7 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + } + + //------------------------------ + // Rank 8 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map + .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) 
+ return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); + } + +#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY + + //---------------------------------------- + // Standard destructor, constructors, and assignment operators + + KOKKOS_DEFAULTED_FUNCTION + ~View() = default; + + KOKKOS_DEFAULTED_FUNCTION + View() = default; + + KOKKOS_FUNCTION + View(const View& other) : m_track(other.m_track), m_map(other.m_map) { + KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View(View&& other) + : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { + KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View& operator=(const View& other) { + m_map = other.m_map; + m_track = other.m_track; + + KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) + + return *this; + } + + KOKKOS_FUNCTION + View& operator=(View&& other) { + m_map = std::move(other.m_map); + m_track = std::move(other.m_track); + + KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) + + return *this; + } + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. 
+ + template + KOKKOS_INLINE_FUNCTION View( + const View& rhs, + std::enable_if_t::traits, + typename traits::specialize>::is_assignable_data_type>* = nullptr) + : m_track(rhs), m_map() { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t< + Kokkos::Impl::ViewMapping< + traits, typename View::traits, + typename traits::specialize>::is_assignable_data_type, + View>& + operator=(const View& rhs) { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + m_track.assign(rhs); + return *this; + } + + //---------------------------------------- + // Compatible subview constructor + // may assign unmanaged from managed. + + template + KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, + Args... 
args) + : m_track(src_view), m_map() { + using SrcType = View; + + using Mapping = Kokkos::Impl::ViewMapping; + + using DstType = typename Mapping::type; + + static_assert( + Kokkos::Impl::ViewMapping::is_assignable, + "Subview construction requires compatible view and subview arguments"); + + Mapping::assign(m_map, src_view.m_map, arg0, args...); + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.m_tracker.use_count(); } + + inline const std::string label() const { + return m_track.m_tracker + .template get_label(); + } + + public: + //---------------------------------------- + // Allocation according to allocation properties and array layout + + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track(), m_map() { + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, typename traits::device_type::memory_space{}, + typename traits::device_type::execution_space{}); + using alloc_prop = decltype(prop_copy); + + static_assert(traits::is_managed, + "View allocation constructor requires managed memory"); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. 
+ Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif + + Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( + prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); + + // Setup and initialization complete, start tracking + m_track.m_tracker.assign_allocated_record_to_uninitialized(record); + } + + KOKKOS_INLINE_FUNCTION + void assign_data(pointer_type arg_data) { + m_track.m_tracker.clear(); + m_map.assign_data(arg_data); + } + + // Wrap memory according to properties and array layout + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track() // No memory tracking + , + m_map(arg_prop, arg_layout) { + static_assert( + std::is_same::pointer_type>::value, + "Constructing View to wrap user memory must supply matching pointer " + "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + 
size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif + } + + // Simple dimension-only layout + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + } + + // Allocate with label and layout + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, + typename traits::array_layout> const& arg_layout) + : View(Impl::ViewCtorProp(arg_label), arg_layout) {} + + // Allocate label and layout, must disambiguate from subview constructor. + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, const size_t> + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_label), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + // Construct view from ViewTracker and map + // This should be the preferred method because future extensions may need to + // use the ViewTracker class. 
+ template + KOKKOS_INLINE_FUNCTION View( + const view_tracker_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track.m_tracker); + } + + // Construct View from internal shared allocation tracker object and map + // This is here for backwards compatibility for classes that derive from + // Kokkos::View + template + KOKKOS_INLINE_FUNCTION View( + const typename view_tracker_type::track_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track); + } + + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + typename traits::array_layout const& layout) { + return map_type::memory_span(layout); + } + + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, + const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, + const size_t arg_N6 = 0, const size_t arg_N7 = 0) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + return map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_ptr), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} + + //---------------------------------------- + // Shared scratch memory constructor + + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + const size_t num_passed_args = Impl::count_valid_integers( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + + if (std::is_void_v && + num_passed_args != rank_dynamic) { + Kokkos::abort( + "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); + } + + return View::shmem_size(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + private: + // Want to be able to align to minimum scratch alignment or sizeof or alignof + // elements + static constexpr size_t scratch_value_alignment = + max({sizeof(typename traits::value_type), + alignof(typename traits::value_type), + static_cast( + traits::execution_space::scratch_memory_space::ALIGN)}); + + public: + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(typename traits::array_layout const& arg_layout) { + return map_type::memory_span(arg_layout) + scratch_value_alignment; + } + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(reinterpret_cast( + arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), + scratch_value_alignment))), + arg_layout) {} + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp( + reinterpret_cast(arg_space.get_shmem_aligned( + map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, 
arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, + arg_N7)), + scratch_value_alignment))), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template ::mdspan_type, + typename = std::enable_if_t, + std::false_type, + std::is_assignable, + ImplNaturalMDSpanType>>::value>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template >, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = + typename Impl::MDSpanViewTraits::accessor_type()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + 
Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN +}; + +template +KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { + return View::rank(); +} + +namespace Impl { + +template +struct RankDataType { + using type = typename RankDataType::type*; +}; + +template +struct RankDataType { + using type = ValueType; +}; + +template +KOKKOS_FUNCTION std::enable_if_t< + N == View::rank() && + std::is_same_v::specialize, void>, + View> +as_view_of_rank_n(View v) { + return v; +} + +// Placeholder implementation to compile generic code for DynRankView; should +// never be called +template +KOKKOS_FUNCTION std::enable_if_t< + N != View::rank() && + std::is_same_v::specialize, void>, + View::value_type, N>::type, + Args...>> +as_view_of_rank_n(View) { + Kokkos::abort("Trying to get at a View of the wrong rank"); + return {}; +} + +template +void apply_to_view_of_static_rank(Function&& f, View a) { + f(a); +} + +} // namespace Impl + +template +KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, + Args... 
args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + static_assert(Kokkos::is_memory_traits::value); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} +#endif + +template +using Subview = decltype(subview(std::declval(), std::declval()...)); + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, + const View& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits; + using rhs_traits = ViewTraits; + + return std::is_same_v && + std::is_same_v && + std::is_same_v && + View::rank() == View::rank() && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); +} + +template +KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, + const View& rhs) { + return !(operator==(lhs, rhs)); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +struct CommonViewValueType; + +template +struct CommonViewValueType { + using value_type = std::common_type_t; +}; + +template +struct CommonViewAllocProp; + +template +struct CommonViewAllocProp { + using value_type = ValueType; + using scalar_array_type = ValueType; + + template + KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const 
Views&...) {} +}; + +template +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. prop_type +template +struct DeduceCommonViewAllocProp { + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view::value }; + + using prop_type = CommonViewAllocProp; +}; + +template +struct DeduceCommonViewAllocProp { + using NextTraits = DeduceCommonViewAllocProp; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error + // out + static_assert(!(!std::is_same_v && + !std::is_void_v && + !std::is_void_v), + "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " + "specialize trait allowed"); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = + std::conditional_t, + first_specialize, + std::conditional_t<(std::is_void_v && + !std::is_void_v), + next_specialize, first_specialize>>; + + using value_type = typename CommonViewValueType::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp; +}; + +} // end namespace Impl + +template +using DeducedCommonPropsType = + typename Impl::DeduceCommonViewAllocProp::prop_type; + +// This function is required in certain scenarios where users customize +// Kokkos View internals. One example are dynamic length embedded ensemble +// types. 
The function is used to propagate necessary information +// (like the ensemble size) when creating new views. +// However, most of the time it is called with a single view. +// Furthermore, the propagated information is not just for view allocations. +// From what I can tell, the type of functionality provided by +// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, +// a mechanism we will eventually use to replace this clunky approach here, when +// we are finally mdspan based. +// TODO: get rid of this when we have mdspan +template +KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( + Views const&... views) { + return DeducedCommonPropsType(views...); +} + +} // namespace Kokkos + +#include +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWLEGACY_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp similarity index 90% rename from lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp index 10aaa63b7c8..ecc19eaf5e2 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp @@ -28,61 +28,41 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include #include #include #include -#include +#include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { - -struct ALL_t { - KOKKOS_INLINE_FUNCTION - constexpr const ALL_t& operator()() const { return *this; } - - KOKKOS_INLINE_FUNCTION - constexpr bool operator==(const ALL_t&) const { return true; } -}; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace Impl { -// TODO This alias 
declaration forces us to fully qualify ALL_t inside the -// Kokkos::Impl namespace to avoid deprecation warnings. Replace the -// fully-qualified name when we remove Kokkos::Impl::ALL_t. -using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = - Kokkos::ALL_t; -} // namespace Impl -#endif -} // namespace Kokkos - namespace Kokkos { namespace Impl { template struct is_integral_extent_type { - enum : bool { value = std::is_same::value ? 1 : 0 }; + enum : bool { value = std::is_same_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; // Assuming '2 == initializer_list::size()' template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 
1 : 0 }; }; template @@ -93,8 +73,7 @@ struct is_integral_extent { enum : bool { value = is_integral_extent_type::value }; - static_assert(value || std::is_integral::value || - std::is_void::value, + static_assert(value || std::is_integral_v || std::is_void_v, "subview argument must be either integral or integral extent"); }; @@ -112,16 +91,16 @@ struct SubviewLegalArgsCompileTime { enum { - value = (((CurrentArg == RankDest - 1) && - (Kokkos::Impl::is_integral_extent_type::value)) || - ((CurrentArg >= RankDest) && (std::is_integral::value)) || - ((CurrentArg < RankDest) && - (std::is_same::value)) || - ((CurrentArg == 0) && - (Kokkos::Impl::is_integral_extent_type::value))) && - (SubviewLegalArgsCompileTime::value) + value = + (((CurrentArg == RankDest - 1) && + (Kokkos::Impl::is_integral_extent_type::value)) || + ((CurrentArg >= RankDest) && (std::is_integral_v)) || + ((CurrentArg < RankDest) && (std::is_same_v)) || + ((CurrentArg == 0) && + (Kokkos::Impl::is_integral_extent_type::value))) && + (SubviewLegalArgsCompileTime::value) }; }; @@ -129,7 +108,7 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankDest - 1) || (std::is_integral::value)) && + value = ((CurrentArg == RankDest - 1) || (std::is_integral_v)) && (CurrentArg == RankSrc - 1) }; }; @@ -144,10 +123,9 @@ struct SubviewLegalArgsCompileTime::value)) || - ((CurrentArg < RankSrc - RankDest) && - (std::is_integral::value)) || + ((CurrentArg < RankSrc - RankDest) && (std::is_integral_v)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same::value))) && + (std::is_same_v))) && (SubviewLegalArgsCompileTime::value) @@ -158,8 +136,8 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankSrc - 1) && - (std::is_same::value)) + value = + ((CurrentArg == RankSrc - 1) && (std::is_same_v)) }; }; @@ -392,7 +370,7 @@ struct SubviewExtents { const int n = snprintf(buffer, LEN, "Kokkos::subview bounds error ("); error(buffer + n, LEN - n, 0, 0, dim, 
args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE(((void)dim; Kokkos::abort("Kokkos::subview bounds error"); @@ -718,8 +696,8 @@ struct ViewOffset< return *this; } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -885,14 +863,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1071,8 +1052,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1086,7 +1067,11 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride(Padding::stride(arg_layout.dimension[0])) {} + m_stride( + arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG + ? arg_layout.stride + : Padding::stride(arg_layout.dimension[0])) { + } template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1407,8 +1392,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1565,14 +1550,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? 
m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1614,8 +1602,8 @@ struct ViewOffset< } KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * - m_dim.N2 * m_dim.N1; + return m_stride == static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * + m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1; } /* Strides of dimensions */ @@ -1624,19 +1612,21 @@ struct ViewOffset< return m_dim.N7; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { - return m_dim.N7 * m_dim.N6; + return static_cast(m_dim.N7) * m_dim.N6; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3 * m_dim.N2; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride; @@ -1749,13 +1739,31 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif /* Enable padding for trivial scalar types with non-zero trivial scalar size. 
*/ + + private: + template + KOKKOS_FUNCTION constexpr size_type compute_stride( + const Kokkos::LayoutRight& arg_layout) { + if (arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG) + return arg_layout.stride; + size_type value = m_dim.N1; + if constexpr (dimension_type::rank > 2) value *= m_dim.N2; + if constexpr (dimension_type::rank > 3) value *= m_dim.N3; + if constexpr (dimension_type::rank > 4) value *= m_dim.N4; + if constexpr (dimension_type::rank > 5) value *= m_dim.N5; + if constexpr (dimension_type::rank > 6) value *= m_dim.N6; + if constexpr (dimension_type::rank > 7) value *= m_dim.N7; + return Padding::stride(value); + } + + public: template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( std::integral_constant const&, @@ -1764,37 +1772,7 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride( - Padding:: - stride(/* 2 <= rank */ - m_dim.N1 * - (dimension_type::rank == 2 - ? size_t(1) - : m_dim.N2 * - (dimension_type::rank == 3 - ? size_t(1) - : m_dim.N3 * - (dimension_type::rank == 4 - ? size_t(1) - : m_dim.N4 * - (dimension_type::rank == - 5 - ? size_t(1) - : m_dim.N5 * - (dimension_type:: - rank == - 6 - ? size_t( - 1) - : m_dim.N6 * - (dimension_type:: - rank == - 7 - ? 
size_t( - 1) - : m_dim - .N7)))))))) { - } + m_stride(compute_stride(arg_layout)) {} template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1886,8 +1864,8 @@ struct ViewStride<0> { static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1901,8 +1879,8 @@ struct ViewStride<1> { static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1916,8 +1894,8 @@ struct ViewStride<2> { size_t S0, S1; static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1931,8 +1909,8 @@ struct ViewStride<3> { size_t S0, S1, S2; static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1946,8 +1924,8 @@ struct ViewStride<4> { size_t S0, S1, S2, S3; static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1961,8 +1939,8 @@ struct ViewStride<5> { size_t S0, S1, S2, S3, S4; static constexpr size_t S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - 
ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1976,8 +1954,8 @@ struct ViewStride<6> { size_t S0, S1, S2, S3, S4, S5; static constexpr size_t S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1991,8 +1969,8 @@ struct ViewStride<7> { size_t S0, S1, S2, S3, S4, S5, S6; static constexpr size_t S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2005,8 +1983,8 @@ template <> struct ViewStride<8> { size_t S0, S1, S2, S3, S4, S5, S6, S7; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2283,8 +2261,8 @@ struct ViewOffset { } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -2398,9 +2376,9 @@ struct ViewDataHandle { template struct ViewDataHandle< Traits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && + std::enable_if_t<(std::is_same_v && + std::is_void_v && Traits::memory_traits::is_atomic)>> { using value_type = typename Traits::value_type; using handle_type = typename Kokkos::Impl::AtomicViewDataHandle; @@ -2422,11 +2400,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - (!Traits::memory_traits::is_aligned) && - Traits::memory_traits::is_restrict && - 
(!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + (!Traits::memory_traits::is_aligned) && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; using handle_type = typename Traits::value_type* KOKKOS_RESTRICT; using return_type = typename Traits::value_type& KOKKOS_RESTRICT; @@ -2446,11 +2423,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - (!Traits::memory_traits::is_restrict) && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + (!Traits::memory_traits::is_restrict) && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2485,11 +2461,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2533,11 +2508,10 @@ namespace Impl { /** \brief View mapping for non-specialized data type and standard layout */ template class ViewMapping< - Traits, - std::enable_if_t<( - std::is_void::value && - ViewOffset::is_mapping_plugin::value)>> { + Traits, std::enable_if_t<(std::is_void_v && + ViewOffset::is_mapping_plugin::value)>> { public: using offset_type = ViewOffset; @@ -2680,28 +2654,26 @@ class ViewMapping< reference_type reference() const { return m_impl_handle[0]; } 
template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if layout is neither stride nor irregular, - // then just use the handle directly - !(std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if layout is neither stride nor irregular, + // then just use the handle directly + !(std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if the layout is strided or irregular, then - // we have to use the offset - (std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if the layout is strided or irregular, then + // we have to use the offset + (std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[m_impl_offset(i0)]; } @@ -2780,7 +2752,7 @@ class ViewMapping< KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(const ViewMapping&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; + KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(ViewMapping&&) = default; //---------------------------------------- @@ -2894,29 +2866,34 @@ template class ViewMapping< DstTraits, SrcTraits, std::enable_if_t<( - !(std::is_same:: - value) && // Added to have a new specialization for SrcType of - // LayoutStride + !(std::is_same_v)&& // Added to have a new + // specialization for + // SrcType of + // LayoutStride // default mappings - std::is_void::value && - std::is_void::value && + std::is_void_v && + std::is_void_v && ( // same layout - std::is_same::value || + std::is_same_v || // known layout - 
((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value))))>> { + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>))))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -2926,10 +2903,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -2939,12 +2916,12 @@ class ViewMapping< }; enum { - is_assignable_layout = - std::is_same::value || - std::is_same::value || - (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1) + is_assignable_layout = std::is_same_v || + std::is_same_v || + (DstTraits::dimension::rank == 0) || + (DstTraits::dimension::rank == 1) }; public: @@ -3032,22 +3009,21 @@ class ViewMapping< template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same::value && - std::is_void::value && - std::is_void::value && - ( - // same layout - std::is_same::value || - // known layout - (std::is_same::value || - std::is_same::value || - std::is_same::value)))>> { + std::enable_if_t<(std::is_same_v && + std::is_void_v && + std::is_void_v && + ( + // same layout + std::is_same_v || + // known layout + (std::is_same_v || + std::is_same_v || + std::is_same_v)))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -3057,10 +3033,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -3091,8 +3067,7 @@ class ViewMapping< bool assignable = true; src.stride(strides); size_t exp_stride = 1; - if (std::is_same::value) { + if (std::is_same_v) { for 
(int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(i - 1); if (strides[i] != exp_stride) { @@ -3100,8 +3075,8 @@ class ViewMapping< break; } } - } else if (std::is_same::value) { + } else if (std::is_same_v) { for (int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(src.Rank - i); if (strides[src.Rank - 1 - i] != exp_stride) { @@ -3197,8 +3172,8 @@ struct SubViewDataTypeImpl> { template struct SubViewDataTypeImpl< - std::enable_if_t>::value>, - ValueType, Kokkos::Experimental::Extents, Integral, Args...> + std::enable_if_t>>, ValueType, + Kokkos::Experimental::Extents, Integral, Args...> : SubViewDataTypeImpl, Args...> {}; @@ -3230,13 +3205,13 @@ struct SubViewDataType : SubViewDataTypeImpl {}; template class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>, + std::enable_if_t<( + std::is_void_v && + (std::is_same_v || + std::is_same_v || + std::is_same_v))>, SrcTraits, Args...> { private: static_assert(SrcTraits::rank == sizeof...(Args), @@ -3292,14 +3267,14 @@ class ViewMapping< // OutputRank 1 or 2, InputLayout Left, Interval 0 // because single stride one or second index has a stride. (rank <= 2 && R0 && - std::is_same::value) // replace with input rank + std::is_same_v) // replace with input rank || // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] // because single stride one or second index has a stride. 
(rank <= 2 && R0_rev && - std::is_same::value) // replace input rank + std::is_same_v) // replace input rank ), typename SrcTraits::array_layout, Kokkos::LayoutStride>; diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp similarity index 100% rename from lib/kokkos/core/src/impl/Kokkos_ViewTracker.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp diff --git a/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp b/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp new file mode 100644 index 00000000000..5eddfc68e07 --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp @@ -0,0 +1,457 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWTRAITS_HPP +#define KOKKOS_VIEWTRAITS_HPP + +#include +#include +#include +#include +#include +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct ALL_t { + KOKKOS_FUNCTION + constexpr const ALL_t& operator()() const { return *this; } + + KOKKOS_FUNCTION + constexpr bool operator==(const ALL_t&) const { return true; } +}; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +// TODO This alias declaration forces us to fully qualify ALL_t inside the +// Kokkos::Impl namespace to avoid deprecation warnings. Replace the +// fully-qualified name when we remove Kokkos::Impl::ALL_t. 
+using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = + Kokkos::ALL_t; +} // namespace Impl +#endif + +// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with +// the OpenMPTarget backend +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp declare target +#endif + +inline constexpr Kokkos::ALL_t ALL{}; + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp end declare target +#endif + +namespace Impl { + +template +struct ViewArrayAnalysis; + +template ::non_const_value_type> +struct ViewDataAnalysis; + +template +class ViewMapping { + public: + enum : bool { is_assignable_data_type = false }; + enum : bool { is_assignable = false }; +}; + +template +constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( + const IntType i0, const IntType i1, const IntType i2, const IntType i3, + const IntType i4, const IntType i5, const IntType i6, const IntType i7) { + static_assert(std::is_integral_v, + "count_valid_integers() must have integer arguments."); + + return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + + (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + + (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + + (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); +} + +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. 
+template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { + (void)(label); + + if (is_void_spec) { + const size_t num_passed_args = + count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } + + if (!n_args_is_dyn_rank && !n_args_is_rank) { + KOKKOS_IF_ON_HOST( + const std::string message = + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. 
The number " + "of arguments = " + + std::to_string(num_passed_args) + + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " + "mismatched number of arguments.");) + } + } +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +// Class to provide a uniform type +namespace Kokkos { +namespace Impl { +template +struct ViewUniformType; +} +} // namespace Kokkos + +namespace Kokkos { + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename LayoutFromArrayLayout::type; + using accessor_type = + SpaceAwareAccessor>; + using mdspan_type = mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. 
+ * + * Template argument options: + * - View< DataType > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , ArrayLayout > + * - View< DataType , ArrayLayout , Space > + * - View< DataType , ArrayLayout , MemoryTraits > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + */ + +template +struct ViewTraits; + +template <> +struct ViewTraits { + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = void; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + // Ignore an extraneous 'void' + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + HooksPolicy, Prop...> { + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = HooksPolicy; +}; + +template +struct ViewTraits::value>, + ArrayLayout, Prop...> { + // Specify layout, keep subsequent space and memory traits arguments + + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = ArrayLayout; + using 
memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits::value>, Space, + Prop...> { + // Specify Space, memory traits should be the only subsequent argument. + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::HostMirrorSpace, + void> && + std::is_same_v::array_layout, + void>, + "Only one View Execution or Memory Space template argument"); + + using execution_space = typename Space::execution_space; + using memory_space = typename Space::memory_space; + using HostMirrorSpace = + typename Kokkos::Impl::HostMirror::Space::memory_space; + using array_layout = typename execution_space::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + MemoryTraits, Prop...> { + // Specify memory trait, should not be any subsequent arguments + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::array_layout, + void> && + std::is_same_v::memory_traits, + void> && + std::is_same_v::hooks_policy, + void>, + "MemoryTrait is the final optional template argument for a View"); + + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = MemoryTraits; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + private: + // Unpack the properties arguments + using prop = ViewTraits; + + using ExecutionSpace = + std::conditional_t, + typename prop::execution_space, + Kokkos::DefaultExecutionSpace>; + + using MemorySpace = + std::conditional_t, + typename prop::memory_space, + typename 
ExecutionSpace::memory_space>; + + using ArrayLayout = + std::conditional_t, + typename prop::array_layout, + typename ExecutionSpace::array_layout>; + + using HostMirrorSpace = std::conditional_t< + !std::is_void_v, + typename prop::HostMirrorSpace, + typename Kokkos::Impl::HostMirror::Space>; + + using MemoryTraits = + std::conditional_t, + typename prop::memory_traits, + typename Kokkos::MemoryManaged>; + + using HooksPolicy = + std::conditional_t, + typename prop::hooks_policy, + Kokkos::Experimental::DefaultViewHooks>; + + // Analyze data type's properties, + // May be specialized based upon the layout and value type + using data_analysis = Kokkos::Impl::ViewDataAnalysis; + + public: + //------------------------------------ + // Data type traits: + + using data_type = typename data_analysis::type; + using const_data_type = typename data_analysis::const_type; + using non_const_data_type = typename data_analysis::non_const_type; + + //------------------------------------ + // Compatible array of trivial type traits: + + using scalar_array_type = typename data_analysis::scalar_array_type; + using const_scalar_array_type = + typename data_analysis::const_scalar_array_type; + using non_const_scalar_array_type = + typename data_analysis::non_const_scalar_array_type; + + //------------------------------------ + // Value type traits: + + using value_type = typename data_analysis::value_type; + using const_value_type = typename data_analysis::const_value_type; + using non_const_value_type = typename data_analysis::non_const_value_type; + + //------------------------------------ + // Mapping traits: + + using array_layout = ArrayLayout; + using dimension = typename data_analysis::dimension; + + using specialize = std::conditional_t< + std::is_void_v, + typename prop::specialize, + typename data_analysis::specialize>; /* mapping specialization tag */ + + static constexpr unsigned rank = dimension::rank; + static constexpr unsigned rank_dynamic = dimension::rank_dynamic; 
+ + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device; + using memory_traits = MemoryTraits; + using host_mirror_space = HostMirrorSpace; + using hooks_policy = HooksPolicy; + + using size_type = typename MemorySpace::size_type; + + enum { is_hostspace = std::is_same_v }; + enum { is_managed = MemoryTraits::is_unmanaged == 0 }; + enum { is_random_access = MemoryTraits::is_random_access == 1 }; + + //------------------------------------ +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Impl { +template +struct TypeListToViewTraits; + +template +struct TypeListToViewTraits> { + using type = ViewTraits; +}; + +// It is not safe to assume that subviews of views with the Aligned memory trait +// are also aligned. Hence, just remove that attribute for subviews. 
+template +struct RemoveAlignedMemoryTrait { + private: + using type_list_in = Kokkos::Impl::type_list; + using memory_traits = typename ViewTraits::memory_traits; + using type_list_in_wo_memory_traits = + typename Kokkos::Impl::type_list_remove_first::type; + using new_memory_traits = + Kokkos::MemoryTraits; + using new_type_list = typename Kokkos::Impl::concat_type_list< + type_list_in_wo_memory_traits, + Kokkos::Impl::type_list>::type; + + public: + using type = typename TypeListToViewTraits::type; +}; +} // namespace Impl + +} /* namespace Kokkos */ + +#endif /* KOKKOS_VIEWTRAITS_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp b/lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp similarity index 88% rename from lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp index 7de2869a0d8..1e476132858 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp @@ -24,11 +24,14 @@ namespace Impl { template struct ViewScalarToDataType { using type = typename ViewScalarToDataType::type *; + using const_type = + typename ViewScalarToDataType::const_type *; }; template struct ViewScalarToDataType { - using type = ScalarType; + using type = ScalarType; + using const_type = const ScalarType; }; template @@ -49,12 +52,13 @@ struct ViewUniformLayout { template struct ViewUniformType { using data_type = typename ViewType::data_type; - using const_data_type = std::add_const_t; + using const_data_type = typename ViewType::const_data_type; using runtime_data_type = typename ViewScalarToDataType::type; - using runtime_const_data_type = typename ViewScalarToDataType< - std::add_const_t, ViewType::rank>::type; + using runtime_const_data_type = + typename ViewScalarToDataType::const_type; using array_layout = typename ViewUniformLayout { } KOKKOS_FUNCTION - constexpr typename offset_policy::data_handle_type offset(data_handle_type p, - 
size_t i) const - noexcept { + constexpr typename offset_policy::data_handle_type offset( + data_handle_type p, size_t i) const noexcept { return nested_acc.offset(p, i); } @@ -214,6 +212,199 @@ struct AtomicAccessorRelaxed { } }; +//===================================================================== +//============= Reference Counted Accessor and DataHandle ============= +//===================================================================== + +template +class ReferenceCountedDataHandle { + public: + using value_type = ElementType; + using pointer = value_type*; + using reference = value_type&; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + ReferenceCountedDataHandle() = default; + + // this only ever works on host + explicit ReferenceCountedDataHandle(SharedAllocationRecord* rec) { + m_tracker.assign_allocated_record_to_uninitialized(rec); + m_handle = static_cast(get_record()->data()); + } + + KOKKOS_FUNCTION + ReferenceCountedDataHandle(const SharedAllocationTracker& tracker, + pointer data_handle) + : m_tracker(tracker), m_handle(data_handle) {} + + // unmanaged ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle(OtherElementType* ptr) + : m_tracker(), m_handle(ptr) {} + + // subview ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other, OtherElementType* ptr) + : m_tracker(other.m_tracker), m_handle(ptr) {} + + // converting ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + template < + class OtherElementType, class OtherSpace, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)>> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + KOKKOS_FUNCTION + pointer get() const noexcept { return m_handle; } + 
KOKKOS_FUNCTION + explicit operator pointer() const noexcept { return m_handle; } + + bool has_record() const { return m_tracker.has_record(); } + auto* get_record() const { return m_tracker.get_record(); } + int use_count() const noexcept { return m_tracker.use_count(); } + + std::string get_label() const { return m_tracker.get_label(); } + KOKKOS_FUNCTION + const SharedAllocationTracker& tracker() const noexcept { return m_tracker; } + + KOKKOS_FUNCTION + friend bool operator==(const ReferenceCountedDataHandle& lhs, + const value_type* rhs) { + return lhs.m_handle == rhs; + } + + KOKKOS_FUNCTION + friend bool operator==(const value_type* lhs, + const ReferenceCountedDataHandle& rhs) { + return lhs == rhs.m_handle; + } + + private: + template + friend class ReferenceCountedDataHandle; + + template + friend class ReferenceCountedAccessor; + + SharedAllocationTracker m_tracker; + pointer m_handle = nullptr; +}; + +template +class ReferenceCountedAccessor; + +template +struct IsReferenceCountedAccessor : std::false_type {}; + +template +struct IsReferenceCountedAccessor< + ReferenceCountedAccessor> + : std::true_type {}; + +template +class ReferenceCountedAccessor { + public: + using element_type = ElementType; + using data_handle_type = ReferenceCountedDataHandle; + using reference = typename NestedAccessor::reference; + using offset_policy = + ReferenceCountedAccessor; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + constexpr ReferenceCountedAccessor() noexcept = default; + + template < + class OtherElementType, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + std::is_constructible_v>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template < + class OtherElementType, class OtherSpace, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)&&std:: + is_constructible_v>> + KOKKOS_FUNCTION constexpr 
ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template >> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const default_accessor&) {} + + template ::value && + std::is_convertible_v>> + KOKKOS_FUNCTION operator DstAccessor() const { + return m_nested_acc; + } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const { + return m_nested_acc.access(p.get(), i); + } + + KOKKOS_FUNCTION + constexpr data_handle_type offset(data_handle_type p, size_t i) const { + return data_handle_type(p, m_nested_acc.offset(p.get(), i)); + } + + KOKKOS_FUNCTION + constexpr auto nested_accessor() const { return m_nested_acc; } + + private: +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor m_nested_acc; +}; + +template +using CheckedReferenceCountedAccessor = + SpaceAwareAccessor>>; + +template +using CheckedRelaxedAtomicAccessor = + SpaceAwareAccessor>; + +template +using CheckedReferenceCountedRelaxedAtomicAccessor = SpaceAwareAccessor< + MemorySpace, ReferenceCountedAccessor>>; + } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp index 089628137d7..f990d158bfa 100644 --- a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp +++ b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -23,7 +23,11 @@ static_assert(false, #define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP #include "Kokkos_MDSpan_Extents.hpp" -#include +#include + +// The difference between a legacy Kokkos array layout and an +// mdspan layout is that the array layouts can have state, but don't have the +// nested mapping. This file provides interoperability helpers. namespace Kokkos::Impl { @@ -77,32 +81,7 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 7 ? 
mapping.stride(7) : 0, }; } else { - // FIXME: Kokkos Layouts don't store stride (it's in the mapping) - // We could conceivably fix this by adding an extra ViewCtorProp for - // an abritrary padding. For now we will check for this. - if constexpr (rank > 1 && - (std::is_same_v> || - std::is_same_v>)) { - [[maybe_unused]] constexpr size_t strided_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 1 - : rank - 2; - [[maybe_unused]] constexpr size_t extent_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 0 - : rank - 1; - KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); - } - - return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + ArrayLayout layout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -110,12 +89,98 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 7 ? 
ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + + if constexpr (rank > 1 && + std::is_same_v>) { + layout.stride = mapping.stride(1); + } + if constexpr (std::is_same_v>) { + if constexpr (rank == 2) { + layout.stride = mapping.stride(0); + } + if constexpr (rank > 2) { + if (mapping.stride(rank - 2) != mapping.extents().extent(rank - 1)) + Kokkos::abort( + "Invalid conversion from layout_right_padded to LayoutRight"); + } + } + return layout; } #ifdef KOKKOS_COMPILER_INTEL __builtin_unreachable(); #endif } +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + ArrayLayout layout, std::index_sequence) { + using index_type = typename MappingType::index_type; + using extents_type = typename MappingType::extents_type; + if constexpr (std::is_same_v || + std::is_same_v) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if (layout.stride == KOKKOS_IMPL_CTOR_DEFAULT_ARG || + extents_type::rank() < 2) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if constexpr (std::is_same_v && + extents_type::rank() > 2) { + size_t product_of_dimensions = 1; + for (size_t r = 1; r < extents_type::rank(); r++) + product_of_dimensions *= layout.dimension[r]; + if (product_of_dimensions != layout.stride) + Kokkos::abort( + "Invalid conversion from LayoutRight to layout_right_padded"); + } else { + return MappingType{ + extents_type{ + dextents{ + layout.dimension[Idx]...}}, + layout.stride}; + } + } + } +} +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride layout, std::index_sequence) { + static_assert( + std::is_same_v); + using index_type = typename MappingType::index_type; + index_type strides[MappingType::extents_type::rank()] = { + layout.stride[Idx]...}; + return MappingType{ + mdspan_non_standard_tag(), + static_cast( + dextents{ + layout.dimension[Idx]...}), + strides}; +} + +// specialization for rank 0 to avoid empty array +template 
+KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride, std::index_sequence<>) { + return MappingType{}; +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout(ArrayLayout layout) { + return mapping_from_array_layout_impl( + layout, std::make_index_sequence()); +} + template KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { using mapping_type = typename MDSpanType::mapping_type; diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp index ebdf2c8211f..79c137bfddd 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -28,7 +28,9 @@ #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp index d13c90825c5..3570ed2b6e1 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -35,6 +35,16 @@ #include #include #include + +namespace Kokkos { +namespace Experimental { +using SYCLDeviceUSMSpace = ::Kokkos::SYCLDeviceUSMSpace; +using SYCLHostUSMSpace = ::Kokkos::SYCLHostUSMSpace; +using SYCLSharedUSMSpace = ::Kokkos::SYCLSharedUSMSpace; +using SYCL = ::Kokkos::SYCL; +} // namespace Experimental +} // namespace Kokkos + #endif #endif diff --git a/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp index 400794f8659..399b986041e 100644 --- a/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp +++ b/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp @@ -19,7 +19,6 @@ #if defined(KOKKOS_ENABLE_SYCL) namespace Kokkos { -namespace Experimental { class SYCLDeviceUSMSpace; ///< Memory space on SYCL device, not accessible from ///< the host class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL @@ -27,7 +26,6 @@ class 
SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL class SYCLHostUSMSpace; ///< Memory space accessible from both the SYCL ///< device and the host (host pinned) class SYCL; ///< Execution space for SYCL -} // namespace Experimental } // namespace Kokkos #endif #endif diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index a44ffefa6b7..a9db2c4cf4a 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -1458,7 +1458,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> { template struct Tile_Loop_Type<1, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1477,7 +1477,7 @@ struct Tile_Loop_Type<1, IsLeft, IType, Tagged, template struct Tile_Loop_Type<2, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1496,7 +1496,7 @@ struct Tile_Loop_Type<2, IsLeft, IType, Tagged, template struct Tile_Loop_Type<3, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1515,7 +1515,7 @@ struct Tile_Loop_Type<3, IsLeft, IType, Tagged, template struct Tile_Loop_Type<4, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1534,7 +1534,7 @@ struct Tile_Loop_Type<4, IsLeft, IType, Tagged, template struct Tile_Loop_Type<5, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool 
cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1553,7 +1553,7 @@ struct Tile_Loop_Type<5, IsLeft, IType, Tagged, template struct Tile_Loop_Type<6, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1572,7 +1572,7 @@ struct Tile_Loop_Type<6, IsLeft, IType, Tagged, template struct Tile_Loop_Type<7, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1591,7 +1591,7 @@ struct Tile_Loop_Type<7, IsLeft, IType, Tagged, template struct Tile_Loop_Type<8, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1616,7 +1616,7 @@ struct HostIterateTile; // For ParallelFor template struct HostIterateTile::value>> { + std::enable_if_t>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -1635,12 +1635,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2000,30 +1999,28 @@ struct HostIterateTile - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void_v), void> apply(Args&&... args) const { m_func(args...); } template - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void_v), void> apply(Args&&... 
args) const { m_func(m_tag, args...); } RP const m_rp; Functor const m_func; - std::conditional_t::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce // ValueType - scalar: For reductions template struct HostIterateTile::value && - !std::is_array::value>> { + std::enable_if_t && + !std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2050,12 +2047,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2430,7 +2426,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce @@ -2438,8 +2434,8 @@ struct HostIterateTile struct HostIterateTile::value && - std::is_array::value>> { + std::enable_if_t && + std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2463,12 +2459,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2842,7 +2837,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // ------------------------------------------------------------------ // diff --git a/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp index e1273ab9e3b..e6b2fcbef4b 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp @@ -41,13 +41,13 @@ struct EmulateCUDADim3 { template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f((Args &&) args...); + f((Args&&)args...); } template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f(Tag{}, (Args &&) args...); + f(Tag{}, (Args&&)args...); } template , Args&&... args) { - _tag_invoke(f, vals[Idxs]..., (Args &&) args...); + _tag_invoke(f, vals[Idxs]..., (Args&&)args...); } template @@ -63,7 +63,7 @@ KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array(Functor const& f, T (&vals)[N], Args&&... args) { _tag_invoke_array_helper(f, vals, std::make_index_sequence{}, - (Args &&) args...); + (Args&&)args...); } // ------------------------------------------------------------------ // diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp index d77ec0c7537..b483653021a 100644 --- a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -143,7 +143,7 @@ struct AnalyzeExecPolicyUseMatcher, Trait, Traits...> { static constexpr auto trigger_error_message = show_name_of_invalid_execution_policy_trait{}; static_assert( - /* always false: */ std::is_void::value, + /* always false: */ std::is_void_v, "Unknown execution policy trait. 
Search compiler output for " "'show_name_of_invalid_execution_policy_trait' to see the type of the " "invalid trait."); diff --git a/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp b/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp index d8ab77b2056..4ea0b8d343b 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp @@ -95,12 +95,12 @@ struct non_owning_variable_size_circular_buffer { non_owning_variable_size_circular_buffer( non_owning_variable_size_circular_buffer const&) = delete; non_owning_variable_size_circular_buffer( - non_owning_variable_size_circular_buffer&&) = default; - non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer&&) = default; + non_owning_variable_size_circular_buffer& operator=( non_owning_variable_size_circular_buffer const&) = delete; - non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer& operator=( non_owning_variable_size_circular_buffer&&) = default; - ~non_owning_variable_size_circular_buffer() = default; + ~non_owning_variable_size_circular_buffer() = default; KOKKOS_FORCEINLINE_FUNCTION constexpr size_type size() const noexcept { return m_size; } @@ -138,7 +138,7 @@ struct ChaseLevDeque { public: template ::value>> + std::is_default_constructible_v>> ChaseLevDeque() : m_array() {} explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {} @@ -165,7 +165,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; @@ -226,7 +226,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { 
return_value = nullptr; diff --git a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp index 6e3d99ebd68..ee53fd8bc6d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -27,8 +27,9 @@ // To use OpenCL(TM) built-in intrinsics inside kernels, we have to // forward-declare their prototype, also see // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md -#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +#if defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) extern SYCL_EXTERNAL unsigned long __attribute__((overloadable)) intel_get_cycle_counter(); #endif @@ -55,8 +56,10 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline uint64_t clock_tic_device() noexcept { // Return value of 64-bit hi-res clock register. return clock64(); -#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +// FIXME_SYCL We can only return something useful for Intel GPUs and with RDC +#elif defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) return intel_get_cycle_counter(); diff --git a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp index e6dd3c63391..d7319e80c87 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp @@ -93,7 +93,7 @@ struct CombinedReducerValueImpl, std::move(arg_values))... 
{} template - KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { + KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { return this->CombinedReducerValueItemImpl::ref(); } template @@ -181,7 +181,7 @@ struct CombinedReducerImpl, Space, KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( - CombinedReducerImpl&&) = default; + CombinedReducerImpl&&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( @@ -192,8 +192,8 @@ struct CombinedReducerImpl, Space, template KOKKOS_FUNCTION constexpr explicit CombinedReducerImpl( value_type& value, ReducersDeduced&&... reducers) noexcept - : CombinedReducerStorageImpl((ReducersDeduced &&) - reducers)..., + : CombinedReducerStorageImpl( + (ReducersDeduced&&)reducers)..., m_value_view(&value) {} KOKKOS_FUNCTION constexpr void join(value_type& dest, @@ -348,8 +348,8 @@ struct CombinedReductionFunctorWrapperImpl< IndexOrMemberOrTagType1&& arg_first, IndexOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl( - (IndexOrMemberOrTagType1 &&) arg_first, - (IndexOrMemberTypesThenValueType &&) args...); + (IndexOrMemberOrTagType1&&)arg_first, + (IndexOrMemberTypesThenValueType&&)args...); } // end call operator }}}2 @@ -369,19 +369,19 @@ struct CombinedReductionFunctorWrapperImpl< template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_same, value_type>::value> + !std::is_same_v, value_type>> _call_op_impl(IdxOrMemberTypes&&... idxs, IdxOrMemberType1&& idx, IdxOrMemberTypesThenValueType&&... 
args) const { this->template _call_op_impl( - (IdxOrMemberTypes &&) idxs..., (IdxOrMemberType1 &&) idx, - (IdxOrMemberTypesThenValueType &&) args...); + (IdxOrMemberTypes&&)idxs..., (IdxOrMemberType1&&)idx, + (IdxOrMemberTypesThenValueType&&)args...); } // base case template KOKKOS_FORCEINLINE_FUNCTION void _call_op_impl(IdxOrMemberTypes&&... idxs, value_type& out) const { - m_functor((IdxOrMemberTypes &&) idxs..., + m_functor((IdxOrMemberTypes&&)idxs..., out.template get()...); } }; @@ -464,8 +464,8 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer_value( typename _reducer_from_arg_t::value_type...>{ // This helper function is now poorly named after refactoring. - _get_value_from_combined_reducer_ctor_arg((ReferencesOrViewsOrReducers &&) - args)...}; + _get_value_from_combined_reducer_ctor_arg( + (ReferencesOrViewsOrReducers&&)args)...}; //---------------------------------------- } @@ -480,7 +480,7 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer( Space, _reducer_from_arg_t...>; return reducer_type(value, _reducer_from_arg_t{ - (ReferencesOrViewsOrReducers &&) args}...); + (ReferencesOrViewsOrReducers&&)args}...); //---------------------------------------- } diff --git a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp index ca4edce5c38..9bde2f72a3f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp @@ -110,15 +110,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). 
- const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? type(-2, -2) : type(-1, -1); } @@ -132,7 +132,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -194,15 +195,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? 
type(-2, -2) : type(-1, -1); } @@ -216,7 +217,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -262,8 +264,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_and(buffer + (bit >> bits_per_int_lg2) + 1, ~mask); + const uint32_t prev = Kokkos::atomic_fetch_and( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, ~mask); if (!(prev & mask)) { return -1; @@ -273,7 +275,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); // Flush the store-release Kokkos::memory_fence(); @@ -299,8 +301,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_or(buffer + (bit >> bits_per_int_lg2) + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, mask); if (!(prev & mask)) { return -1; @@ -310,7 +312,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return (count & state_used_mask) - 1; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index 532709aa989..72f33ffaab9 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -138,7 +138,7 @@ int get_device_count() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); return count; #elif defined(KOKKOS_ENABLE_SYCL) - return 
Kokkos::Experimental::Impl::get_sycl_devices().size(); + return Kokkos::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -183,7 +183,7 @@ std::vector const& Kokkos::Impl::get_visible_devices() { #elif defined(KOKKOS_ENABLE_OPENMPTARGET) int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - int device = Experimental::Impl::SYCLInternal::m_syclDev; + int device = Impl::SYCLInternal::m_syclDev; #else int device = -1; return device; @@ -271,7 +271,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: local rank " << local_rank << " is outside the bounds of resource groups provided by CTest. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the resource types allocated to this resource group @@ -284,7 +284,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_name << " is not specified. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Look for the device type specified in CTEST_KOKKOS_DEVICE_TYPE @@ -308,7 +308,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: device type '" << ctest_kokkos_device_type << "' not included in " << ctest_resource_group_name << ". Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the device ID @@ -324,7 +324,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_id_name << " is not specified. 
Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } auto const* comma = std::strchr(resource_str, ','); @@ -332,7 +332,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: invalid value of " << ctest_resource_group_id_name << ": '" << resource_str << "'. Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } std::string id(resource_str + 3, comma - resource_str - 3); @@ -613,7 +613,7 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #endif declare_configuration_metadata("architecture", "Default Device", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); #if defined(KOKKOS_ARCH_A64FX) declare_configuration_metadata("architecture", "CPU architecture", "A64FX"); @@ -666,6 +666,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_RISCV_SG2042) declare_configuration_metadata("architecture", "CPU architecture", "SG2042 (RISC-V)") +#elif defined(KOKKOS_ARCH_RISCV_RVA22V) + declare_configuration_metadata("architecture", "CPU architecture", + "RVA22V (RISC-V)") #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -738,8 +741,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "GPU architecture", "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX906"); + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX906"); #elif defined(KOKKOS_ARCH_AMD_GFX908) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX908"); @@ -976,7 +979,7 @@ void Kokkos::Impl::parse_environment_variables( Tools::Impl::parse_environment_variables(tools_init_arguments); if (init_result.result 
== Tools::Impl::InitializationStatus::environment_argument_mismatch) { - Impl::throw_runtime_exception(init_result.error_message); + Kokkos::abort(init_result.error_message.c_str()); } combine(settings, tools_init_arguments); diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp index c71c21d2ac9..cd00fdadeba 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp @@ -36,15 +36,22 @@ struct GraphNodeKernelDefaultImpl { // TODO @graphs decide if this should use vtable or intrusive erasure via // function pointers like in the rest of the graph interface virtual void execute_kernel() = 0; + + GraphNodeKernelDefaultImpl() = default; + + explicit GraphNodeKernelDefaultImpl(ExecutionSpace exec) + : m_execution_space(std::move(exec)) {} + + ExecutionSpace m_execution_space; }; // TODO Indicate that this kernel specialization is only for the Host somehow? template class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag::type, - public GraphNodeKernelDefaultImpl { + : public GraphNodeKernelDefaultImpl, + public PatternImplSpecializationFromTag::type { public: using base_t = typename PatternImplSpecializationFromTag - GraphNodeKernelImpl(std::string const&, ExecutionSpace const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...), - execute_kernel_vtable_base_t() {} + GraphNodeKernelImpl(std::string const &, ExecutionSpace const &, + Functor arg_functor, PolicyDeduced &&arg_policy, + ArgsDeduced &&...args) + : execute_kernel_vtable_base_t(arg_policy.space()), + base_t(std::move(arg_functor), (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) 
{} // FIXME @graph Forward through the instance once that works in the backends template - GraphNodeKernelImpl(ExecutionSpace const& ex, Functor arg_functor, - PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + GraphNodeKernelImpl(ExecutionSpace const &ex, Functor arg_functor, + PolicyDeduced &&arg_policy, ArgsDeduced &&...args) : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) { + // FIXME This constructor seem unused. + } - void execute_kernel() final { this->base_t::execute(); } + void execute_kernel() override final { this->base_t::execute(); } }; // end GraphNodeKernelImpl }}}1 @@ -88,7 +97,7 @@ struct GraphNodeAggregateKernelDefaultImpl using is_graph_kernel = std::true_type; }; using graph_kernel = GraphNodeAggregateKernelDefaultImpl; - void execute_kernel() final {} + void execute_kernel() override final {} }; } // end namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp index 223ae391ab4..31d147ea894 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp @@ -69,10 +69,10 @@ struct GraphNodeBackendSpecificDetails { GraphNodeBackendSpecificDetails(GraphNodeBackendSpecificDetails&&) noexcept = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails const&) = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails&&) noexcept = delete; ~GraphNodeBackendSpecificDetails() = default; @@ -92,6 +92,18 @@ struct GraphNodeBackendSpecificDetails { m_is_aggregate = true; } + // A node is awaitable if it can execute a kernel. 
+ // A root node or an aggregate node cannot be waited for, because it does + // not launch anything. + bool awaitable() const { return (!m_is_root) && (!m_is_aggregate); } + + // Retrieve the execution space instance that has been passed to + // the kernel at construction phase. + const ExecutionSpace& get_execution_space() const { + KOKKOS_EXPECTS(m_kernel_ptr != nullptr) + return m_kernel_ptr->m_execution_space; + } + void set_predecessor( std::shared_ptr> arg_pred_impl) { @@ -104,7 +116,7 @@ struct GraphNodeBackendSpecificDetails { m_predecessors.push_back(std::move(arg_pred_impl)); } - void execute_node() { + void execute_node(const ExecutionSpace& exec) { // This node could have already been executed as the predecessor of some // other KOKKOS_EXPECTS(bool(m_kernel_ptr) || m_has_executed) @@ -115,8 +127,18 @@ struct GraphNodeBackendSpecificDetails { // supported semantics, but instinct I have feels like it should be... m_has_executed = true; for (auto const& predecessor : m_predecessors) { - predecessor->execute_node(); + predecessor->execute_node(exec); } + + // Before executing the kernel, be sure to fence the execution space + // instance of predecessors. 
+ for (const auto& predecessor : m_predecessors) { + if (predecessor->awaitable() && + predecessor->get_execution_space() != this->get_execution_space()) + predecessor->get_execution_space().fence( + "Kokkos::DefaultGraphNode::execute_node: sync with predecessors"); + } + m_kernel_ptr->execute_kernel(); } KOKKOS_ENSURES(m_has_executed) diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 05d48549193..8dfa19a178c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -58,12 +58,12 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { // Not movable or copyable; it spends its whole live as a shared_ptr in the // Graph object - GraphImpl() = default; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = default; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; - ~GraphImpl() = default; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() = default; explicit GraphImpl(ExecutionSpace arg_space) : execution_space_instance_storage_base_t(std::move(arg_space)) {} @@ -136,17 +136,40 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { return rv; } - void submit() { + void instantiate() { + KOKKOS_EXPECTS(!m_has_been_instantiated); + m_has_been_instantiated = true; + } + + void submit(const ExecutionSpace& exec) { + if (!m_has_been_instantiated) instantiate(); // This reset is gross, but for the purposes of our simple host // implementation... for (auto& sink : m_sinks) { sink->reset_has_executed(); } + + // We don't know where the nodes will execute, so we need to fence the given + // execution space instance before proceeding. This is the simplest way + // of guaranteeing that the kernels in the graph are correctly "enqueued". 
+ exec.fence( + "Kokkos::DefaultGraph::submit: fencing before launching graph nodes"); + for (auto& sink : m_sinks) { - sink->execute_node(); + sink->execute_node(exec); + } + + // Once all sinks have been executed, we need to fence them. + for (const auto& sink : m_sinks) { + if (sink->awaitable() && sink->get_execution_space() != exec) + sink->get_execution_space().fence( + "Kokkos::DefaultGraph::submit: fencing before ending graph submit"); } } + private: + bool m_has_been_instantiated = false; + // end required customizations }}}2 //---------------------------------------------------------------------------- }; diff --git a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp index 8ba94ba4ccc..a8a4d6617bc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp @@ -52,16 +52,16 @@ struct EBOBaseImpl; template class CtorNotOnDevice> struct EBOBaseImpl { template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && !CtorNotOnDevice::value, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&...) noexcept {} template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && CtorNotOnDevice::value, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {} @@ -110,18 +110,18 @@ struct EBOBaseImpl { T m_ebo_object; template ::value && + std::enable_if_t && !CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&... args) noexcept(noexcept(T(std::forward(args)...))) : m_ebo_object(std::forward(args)...) {} template ::value && + std::enable_if_t && CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&... 
args) noexcept( noexcept(T(std::forward(args)...))) @@ -167,9 +167,9 @@ struct EBOBaseImpl { template class CtorsNotOnDevice = NoCtorsNotOnDevice> struct StandardLayoutNoUniqueAddressMemberEmulation - : EBOBaseImpl::value, CtorsNotOnDevice> { + : EBOBaseImpl, CtorsNotOnDevice> { private: - using ebo_base_t = EBOBaseImpl::value, CtorsNotOnDevice>; + using ebo_base_t = EBOBaseImpl, CtorsNotOnDevice>; public: using ebo_base_t::ebo_base_t; diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp index 04c5e0bd22a..58a5de2aa62 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp @@ -41,7 +41,7 @@ void team_policy_check_valid_storage_level_argument(int level) { std::stringstream ss; ss << "TeamPolicy::set_scratch_size(/*level*/ " << level << ", ...) storage level argument must be 0 or 1 to be valid\n"; - Impl::throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } } diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp b/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp index 58ed54275a6..5805b78ee75 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp @@ -123,14 +123,14 @@ template struct ExecSpaceDerived : ExecSpaceBase { static_assert(check_valid_execution_space()); static_assert(check_is_regular()); - void initialize(InitializationSettings const& settings) final { + void initialize(InitializationSettings const& settings) override final { ExecutionSpace::impl_initialize(settings); } - void finalize() final { ExecutionSpace::impl_finalize(); } - void static_fence(std::string const& label) final { + void finalize() override final { ExecutionSpace::impl_finalize(); } + void static_fence(std::string const& label) override final { ExecutionSpace::impl_static_fence(label); } - void print_configuration(std::ostream& os, bool verbose) final { + void 
print_configuration(std::ostream& os, bool verbose) override final { ExecutionSpace().print_configuration(os, verbose); } }; diff --git a/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp b/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp deleted file mode 100644 index 4726a87b97c..00000000000 --- a/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp +++ /dev/null @@ -1,279 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP -#define KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP - -#include -#include - -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class FixedBlockSizeMemoryPool - : private MemorySpaceInstanceStorage { - public: - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - - private: - using memory_space_storage_base = - MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { - union { - char ignore; - char data[Size]; - }; - }; - - static constexpr auto actual_size = sizeof(Block); - - // TODO shared allocation tracker - // TODO @optimization put the index values on different cache lines (CPU) or - // pages (GPU)? 
- - tracker_type m_tracker = {}; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - Kokkos::OwningRawPtr m_first_block = nullptr; - Kokkos::OwningRawPtr m_free_indices = nullptr; - - enum : size_type { IndexInUse = ~size_type(0) }; - - public: - FixedBlockSizeMemoryPool(memory_space const& mem_space, size_type num_blocks) - : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block)); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = - record_type::allocate(mem_space, "Kokkos::FixedBlockSizeMemPool_blocks", - num_blocks * sizeof(size_type)); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for (size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool(memory_space const& mem_space, - size_t mempool_capacity, unsigned, unsigned, - unsigned) - : FixedBlockSizeMemoryPool( - mem_space, mempool_capacity / - actual_size) { /* forwarding ctor, must be empty */ - } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - 
FixedBlockSizeMemoryPool const&) = default; - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept { - (void)alloc_size; - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], - current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if (free_idx == IndexInUse) { - return nullptr; - } else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type /*alloc_size*/) const noexcept { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && - offset / actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } -}; - -#if 0 -template < - class DeviceType, - size_t Size, - size_t Align=1, - class SizeType = typename DeviceType::execution_space::size_type -> -class FixedBlockSizeChaseLevMemoryPool - : private MemorySpaceInstanceStorage -{ -public: - - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - -private: - - using memory_space_storage_base = MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { union { char ignore; char data[Size]; }; }; - - 
static constexpr auto actual_size = sizeof(Block); - - tracker_type m_tracker = { }; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - - - enum : size_type { IndexInUse = ~size_type(0) }; - -public: - - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_type num_blocks - ) : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) - { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block) - ); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(size_type) - ); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for(size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_t mempool_capacity, - unsigned, unsigned, unsigned - ) : FixedBlockSizeMemoryPool(mem_space, mempool_capacity / actual_size) - { /* forwarding ctor, must be empty */ } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool const&) = default; - - - KOKKOS_INLINE_FUNCTION 
- void* allocate(size_type alloc_size) const noexcept - { - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add((volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = - Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if(free_idx == IndexInUse) { - return nullptr; - } - else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type alloc_size) const noexcept - { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && offset/actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add((volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } - -}; -#endif - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp index e844a5295e5..29a365e6e41 100644 --- a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -118,8 +118,8 @@ struct FunctorAnalysis { using functor_has_space = has_execution_space; static_assert(!policy_has_space::value || !functor_has_space::value || - std::is_same::value, + std::is_same_v, "Execution Policy and Functor execution space must match"); //---------------------------------------- @@ -136,9 
+136,8 @@ struct FunctorAnalysis { typename std::is_void::type> { using type = typename F::value_type; - static_assert(!std::is_reference::value && - std::rank::value <= 1 && - std::extent::value == 0, + static_assert(!std::is_reference_v && std::rank_v <= 1 && + std::extent_v == 0, "Kokkos Functor::value_type is T or T[]"); }; @@ -149,7 +148,7 @@ struct FunctorAnalysis { template ::type, - bool T = std::is_void::value> + bool T = std::is_void_v> struct deduce_value_type { using type = V; }; @@ -290,8 +289,8 @@ struct FunctorAnalysis { using candidate_type = typename deduce_value_type::type; enum { - candidate_is_void = std::is_void::value, - candidate_is_array = std::rank::value == 1 + candidate_is_void = std::is_void_v, + candidate_is_array = std::rank_v == 1 }; //---------------------------------------- @@ -306,7 +305,7 @@ struct FunctorAnalysis { using value_type = std::remove_extent_t; - static_assert(!std::is_const::value, + static_assert(!std::is_const_v, "Kokkos functor operator reduce argument cannot be const"); private: @@ -614,21 +613,20 @@ struct FunctorAnalysis { }; template - struct DeduceJoinNoTag::value || - (!is_reducer::value && - std::is_void::value)) && - detected_join_no_tag::value>> + struct DeduceJoinNoTag< + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + detected_join_no_tag::value>> : public has_join_no_tag_function { enum : bool { value = true }; }; template struct DeduceJoinNoTag< - F, - std::enable_if_t<(is_reducer::value || - (!is_reducer::value && std::is_void::value)) && - (!detected_join_no_tag::value && - detected_volatile_join_no_tag::value)>> + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + (!detected_join_no_tag::value && + detected_volatile_join_no_tag::value)>> : public has_volatile_join_no_tag_function { enum : bool { value = true }; static_assert(Impl::dependent_false_v, @@ -735,8 +733,8 @@ struct FunctorAnalysis { template struct 
DeduceInitNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_init_no_tag_function::enable_if( &F::init))>> : public has_init_no_tag_function { @@ -835,8 +833,8 @@ struct FunctorAnalysis { template struct DeduceFinalNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_final_no_tag_function::enable_if( &F::final))>> : public has_final_no_tag_function { @@ -906,14 +904,14 @@ struct FunctorAnalysis { Functor m_functor; template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return m_functor.value_count; } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return candidate_is_void ? 
0 : 1; } @@ -973,8 +971,8 @@ struct FunctorAnalysis { DeduceJoin<>::join(&m_functor, dst, src); } - KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const - noexcept { + KOKKOS_INLINE_FUNCTION reference_type + init(ValueType* const dst) const noexcept { DeduceInit<>::init(&m_functor, dst); return reference(dst); } @@ -987,11 +985,11 @@ struct FunctorAnalysis { KOKKOS_INLINE_FUNCTION const Functor& get_functor() const { return m_functor; } - Reducer(Reducer const&) = default; - Reducer(Reducer&&) = default; + Reducer(Reducer const&) = default; + Reducer(Reducer&&) = default; Reducer& operator=(Reducer const&) = delete; - Reducer& operator=(Reducer&&) = delete; - ~Reducer() = default; + Reducer& operator=(Reducer&&) = delete; + ~Reducer() = default; KOKKOS_INLINE_FUNCTION explicit constexpr Reducer( Functor const& arg_functor) noexcept diff --git a/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp b/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp index 56f95c814d8..6d3ebf64bef 100644 --- a/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp @@ -56,7 +56,7 @@ struct GraphAccess { static_assert( Kokkos::Impl::is_specialization_of::value, "Kokkos Internal Error in graph interface"); - return std::make_shared((Args &&) args...); + return std::make_shared((Args&&)args...); } template ::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_node_ptr()`)"); - return ((NodeRef &&) node_ref).get_node_ptr(); + return ((NodeRef&&)node_ref).get_node_ptr(); } template @@ -93,7 +93,7 @@ struct GraphAccess { Kokkos::Experimental::GraphNodeRef>::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_graph_weak_ptr()`)"); - return ((NodeRef &&) node_ref).get_graph_weak_ptr(); + return ((NodeRef&&)node_ref).get_graph_weak_ptr(); } // end accessors for private members of public interface }}}2 diff --git a/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp 
b/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp index 2ab05cb8e43..b02a2654722 100644 --- a/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp @@ -54,9 +54,9 @@ template