Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for cooperative groups and grid synchronization #2307

Draft
wants to merge 42 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
cadb1af
Add CreateTaskCooperativeKernel, grid sync and HelloWorldGridSyncExam…
MichaelVarvarin Jul 1, 2024
0453b9f
Add comment about issue with grid sync on CUDA Clang
MichaelVarvarin Jul 26, 2024
df5d4fd
Add cooperative kernel launch and grid sync support for HIP
MichaelVarvarin Jul 26, 2024
978f195
Add m_cooperativeLaunch device prop and runtime check for CG support …
MichaelVarvarin Jul 29, 2024
b7aee7a
Clean errors in previous commit
MichaelVarvarin Aug 1, 2024
3acaf11
Clean formatting
MichaelVarvarin Aug 2, 2024
5e9c5ce
Add getMaxActiveBlocks to get the maximum allowed block count for lau…
MichaelVarvarin Aug 7, 2024
a7a9b03
Rename maxActiveBlocks trait
MichaelVarvarin Aug 10, 2024
08e18cb
Fix issues from bad rebase
MichaelVarvarin Aug 12, 2024
c6dc462
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for A…
MichaelVarvarin Aug 12, 2024
e88e0c1
Clean formatting
MichaelVarvarin Aug 13, 2024
bf9ddf0
Correct the comment
MichaelVarvarin Aug 13, 2024
148c521
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for O…
MichaelVarvarin Aug 20, 2024
db43ec9
Clean formatting
MichaelVarvarin Aug 20, 2024
5f0bfa8
Update comments
MichaelVarvarin Aug 20, 2024
9247156
Add include gridSync OMP to alpaka.hpp
MichaelVarvarin Aug 27, 2024
907c0b9
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for s…
MichaelVarvarin Aug 27, 2024
0455158
Clean warnings for CPU accelerators
MichaelVarvarin Sep 9, 2024
f7da2fe
Clean warnings for the HIP accelerator
MichaelVarvarin Sep 9, 2024
b20ddf2
Merge SYCL changes (#2)
MichaelVarvarin Nov 11, 2024
7cf652e
Revert "Merge SYCL changes (#2)" (#3)
MichaelVarvarin Nov 11, 2024
aaed855
Add cooperative groups and grid sync functionality to SYCL
MichaelVarvarin Nov 11, 2024
fab1f24
Rewrite example to use executeForEachAccTag
MichaelVarvarin Nov 12, 2024
1222309
Change from using concepts to interface due to rebase
MichaelVarvarin Nov 12, 2024
f2af59a
Implement grid sync and cooperative kernel functionality for Intel TBB
MichaelVarvarin Nov 16, 2024
215a292
BUGFIX: Change m_cooperativeLaunch prop for SYCL to true m_cooperativ…
MichaelVarvarin Nov 16, 2024
c95b136
Change AccCpuTbbBlocks object back to being local for each thread
MichaelVarvarin Nov 17, 2024
53e51c2
BUGFIX: add includes to achieve compilation
MichaelVarvarin Nov 17, 2024
0ff0779
Add gridSyncTest to test syncGridThreads()
MichaelVarvarin Nov 17, 2024
1070851
Clearer code and comments for helloWorldGridSync
MichaelVarvarin Nov 18, 2024
d83dfed
Add check if grid sync was successful
MichaelVarvarin Nov 18, 2024
0115af2
Remove template from example kernel
MichaelVarvarin Nov 18, 2024
07aed98
Try different parameter expansion
MichaelVarvarin Nov 18, 2024
62a58cc
Revert "Try different parameter expansion"
MichaelVarvarin Nov 18, 2024
4d914bd
Formatting changes
MichaelVarvarin Nov 18, 2024
ceac5d3
Formatting changes
MichaelVarvarin Nov 18, 2024
0a990d4
BUGFIX: Add explicit conversions
MichaelVarvarin Nov 19, 2024
dfcfb28
BUGFIX: add missing argument in GridSyncTest
MichaelVarvarin Nov 25, 2024
1203c09
BUGFIX: fix SYCL compilation error by passing additional template arg…
MichaelVarvarin Nov 25, 2024
310ad88
BUGFIX: add runtime check for cooperative launch support
MichaelVarvarin Nov 26, 2024
05effa5
Suppress GCC warning about pointer to object conversion
MichaelVarvarin Nov 26, 2024
9568256
Change format specifiers
MichaelVarvarin Nov 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/alpakaCommon.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -735,7 +735,7 @@ if(alpaka_ACC_SYCL_ENABLE)

#-----------------------------------------------------------------------------------------------------------------
# Generic SYCL options
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-unnamed-lambda") # Compiler default but made explicit here
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-unnamed-lambda") # Compiler default but made explicit here

if(alpaka_RELOCATABLE_DEVICE_CODE STREQUAL ON)
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-rdc")
Expand Down
1 change: 1 addition & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_subdirectory("heatEquation/")
add_subdirectory("heatEquation2D/")
add_subdirectory("helloWorld/")
add_subdirectory("helloWorldLambda/")
add_subdirectory("helloWorldGridSync/")
add_subdirectory("kernelSpecialization/")
add_subdirectory("ls/")
add_subdirectory("matrixMulWithMdspan/")
Expand Down
47 changes: 47 additions & 0 deletions example/helloWorldGridSync/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Copyright 2024 Mykhailo Varvarin
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.22)

# Group targets into folders in IDEs that support it (see FOLDER property below).
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME helloWorldGridSync)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.

# Skip the lookup entirely when the example is built as part of the alpaka tree,
# where the alpaka::alpaka target already exists.
if(NOT TARGET alpaka::alpaka)
    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

    if(alpaka_USE_SOURCE_TREE)
        # Don't build the examples recursively
        set(alpaka_BUILD_EXAMPLES OFF)
        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
    else()
        find_package(alpaka REQUIRED)
    endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
    ${_TARGET_NAME}
    src/helloWorldGridSync.cpp)
target_link_libraries(
    ${_TARGET_NAME}
    PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

# Register the example as a CTest test so it runs in the test suite.
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
165 changes: 165 additions & 0 deletions example/helloWorldGridSync/src/helloWorldGridSync.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExecuteForEachAccTag.hpp>

#include <cstdint>
#include <iostream>

//! Hello world kernel, utilizing grid synchronization.
//! Prints hello world from a thread, performs a grid sync,
//! and prints the sum of indices of this thread and the opposite thread (the sums have to be the same).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] Could you explain what is the opposite thread here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thread that has the same distance from the end of the grid dimension as this one has from the start. So, if the IDs range from 0 to 9, the pairs are 0 and 9, 1 and 8, 2 and 7, and so on. Their sum is constant, so we can check whether the grid sync was performed successfully.

//! Prints an error if sum is incorrect.
struct HelloWorldKernel
{
    //! Each grid thread writes its index into \p array, performs a grid
    //! synchronization, and then verifies that the thread at the mirrored
    //! position of the 1D array has written its index as well. For a valid
    //! sync, index i and index (extent - 1 - i) always sum to (extent - 1).
    //!
    //! \param acc The accelerator the kernel runs on.
    //! \param array Device buffer with one slot per grid thread.
    //! \param success Set to false when the post-sync verification fails.
    template<typename Acc>
    ALPAKA_FN_ACC void operator()(Acc const& acc, size_t* array, bool* success) const
    {
        // Get index of the current thread in the grid and the total number of threads.
        size_t gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
        size_t gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];

        if(gridThreadIdx == 0)
            printf("Hello, World from alpaka thread %zu!\n", gridThreadIdx);

        // Write the index of the thread to array.
        array[gridThreadIdx] = gridThreadIdx;

        // Perform grid synchronization.
        alpaka::syncGridThreads(acc);

        // Get the index of the thread from the opposite side of 1D array.
        size_t gridThreadIdxOpposite = array[gridThreadExtent - gridThreadIdx - 1];

        // Sum them.
        size_t sum = gridThreadIdx + gridThreadIdxOpposite;

        // Get the expected sum.
        size_t expectedSum = gridThreadExtent - 1;

        // Report an error if the grid synchronization failed.
        if(sum != expectedSum)
        {
            *success = false;
            // BUGFIX: this branch only runs when sum != expectedSum, so the former
            // "%s" ternary selecting between "\n" and the error suffix was dead
            // code - the error suffix is always the correct choice here. The
            // emitted text is unchanged.
            printf(
                "After grid sync, this thread is %zu, thread on the opposite side is %zu. Their sum is %zu, expected: "
                "%zu. ERROR: the sum is incorrect.\n",
                gridThreadIdx,
                gridThreadIdxOpposite,
                sum,
                expectedSum);
        }
    }
};

// In standard projects, you typically do not execute the code with any available accelerator.
// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
// selected accelerator only. If you use the example as the starting point for your project, you can rename the
// example() function to main() and move the accelerator tag to the function body.
template<typename TAccTag>
auto example(TAccTag const&) -> int
{
    // Map the tag to a concrete accelerator type.
    // For simplicity this example always uses 1 dimensional indexing and index type size_t.
    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, std::size_t>;
    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;

    // Dimensionality and index type used for the kernel launch.
    using Dim = alpaka::DimInt<1>;
    using Idx = size_t;


    // Pick the first device of the chosen accelerator platform.
    auto const platformAcc = alpaka::Platform<Acc>{};
    auto const devAcc = getDevByIdx(platformAcc, 0u);

    // Pick the first CPU device as the host side.
    constexpr auto platformHost = alpaka::Platform<alpaka::DevCpu>{};
    auto const devHost = getDevByIdx(platformHost, 0u);

    // A blocking queue on the accelerator device.
    using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
    auto queue = Queue{devAcc};

    // Requested launch configuration: blocks per grid, threads per block,
    // and elements per thread.
    Idx blocksPerGrid = 100;
    Idx threadsPerBlock = 1;
    Idx elementsPerThread = 1;

    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    // Device buffer holding one slot per grid thread.
    alpaka::Vec<Dim, Idx> bufferExtent{blocksPerGrid * threadsPerBlock};
    auto deviceMemory = alpaka::allocBuf<Idx, Idx>(devAcc, bufferExtent);

    // Device-side success flag, initialized to true (byte-wise).
    auto bufAccResult = alpaka::allocBuf<bool, Idx>(devAcc, static_cast<Idx>(1u));
    memset(queue, bufAccResult, static_cast<std::uint8_t>(true));


    // The kernel instance to be launched.
    HelloWorldKernel helloWorldKernel;

    // Ask the device how many blocks a cooperative launch of this kernel may use.
    int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
        devAcc,
        helloWorldKernel,
        threadsPerBlock,
        elementsPerThread,
        getPtrNative(deviceMemory),
        getPtrNative(bufAccResult));
    std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;

    // Clamp the requested block count to the device limit and build the work division.
    blocksPerGrid = std::min(static_cast<Idx>(maxBlocks), blocksPerGrid);
    auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};

    // Build the kernel task via the cooperative launch API.
    // Only cooperative kernels are allowed to perform grid synchronization.
    auto taskRunKernel = alpaka::createTaskCooperativeKernel<Acc>(
        workDiv,
        helloWorldKernel,
        getPtrNative(deviceMemory),
        getPtrNative(bufAccResult));

    // Run the kernel.
    alpaka::enqueue(queue, taskRunKernel);

    // Fetch the success flag back to the host.
    auto bufHostResult = alpaka::allocBuf<bool, Idx>(devHost, static_cast<Idx>(1u));
    memcpy(queue, bufHostResult, bufAccResult);
    wait(queue);

    bool const gridSyncSucceeded = *getPtrNative(bufHostResult);

    return gridSyncSucceeded ? EXIT_SUCCESS : EXIT_FAILURE;
}

auto main() -> int
{
    // Run the example once for every accelerator enabled in this build.
    // To run it for one specific accelerator instead, use:
    // \code{.cpp}
    // auto tag = TagCpuSerial;
    // return example(tag);
    // \endcode
    //
    // valid tags:
    // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
    // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads,
    // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
    return alpaka::executeForEachAccTag([](auto const& accTag) { return example(accTag); });
}
33 changes: 32 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Blocks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
#include "alpaka/block/sync/BlockSyncNoOp.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncBarrierCpuOmp.hpp"
#include "alpaka/idx/bt/IdxBtZero.hpp"
#include "alpaka/idx/gb/IdxGbRef.hpp"
#include "alpaka/intrinsic/IntrinsicCpu.hpp"
Expand Down Expand Up @@ -68,6 +69,7 @@ namespace alpaka
, public BlockSharedMemDynMember<>
, public BlockSharedMemStMember<>
, public BlockSyncNoOp
, public GridSyncOmp
, public IntrinsicCpu
, public MemFenceOmp2Blocks
# ifdef ALPAKA_DISABLE_VENDOR_RNG
Expand Down Expand Up @@ -152,7 +154,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
true};
}
};

Expand Down Expand Up @@ -203,6 +207,33 @@ namespace alpaka
}
};

//! The CPU OpenMP 2.0 block accelerator execution cooperative task type trait specialization.
template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
struct CreateTaskCooperativeKernel<AccCpuOmp2Blocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
    //! Creates a cooperative-kernel task for the OpenMP 2.0 blocks accelerator.
    //!
    //! \param workDiv The work division for the launch.
    //! \param kernelFnObj The kernel function object.
    //! \param args,... The kernel invocation arguments.
    //! \throw std::runtime_error If the grid block count does not equal
    //!        omp_get_max_threads() — presumably a grid barrier here only works
    //!        when every block runs on its own OpenMP thread (TODO confirm).
    ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
        TWorkDiv const& workDiv,
        TKernelFnObj const& kernelFnObj,
        TArgs&&... args)
    {
        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
        auto const maxBlocks = omp_get_max_threads();
        // NOTE(review): the check rejects any mismatch, not only "larger than".
        if(gridBlockExtent.prod() != static_cast<TIdx>(maxBlocks))
        {
            // BUGFIX: fixed the typo "maximuma" and reworded the message to match
            // the actual condition, which requires equality.
            throw std::runtime_error(
                "The number of requested blocks is not equal to the maximum of the device for OpenMP 2.0 blocks "
                "accelerator. Requested: "
                + std::to_string(gridBlockExtent.prod()) + ", maximum allowed: " + std::to_string(maxBlocks)
                + ". Use getMaxActiveBlocks().");
        }

        return TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>(
            workDiv,
            kernelFnObj,
            std::forward<TArgs>(args)...);
    }
};

//! The CPU OpenMP 2.0 block execution task platform type trait specialization.
template<typename TDim, typename TIdx>
struct PlatformType<AccCpuOmp2Blocks<TDim, TIdx>>
Expand Down
30 changes: 29 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Threads.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp"
#include "alpaka/block/sync/BlockSyncBarrierOmp.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncBarrierCpuOmp.hpp"
#include "alpaka/idx/bt/IdxBtOmp.hpp"
#include "alpaka/idx/gb/IdxGbRef.hpp"
#include "alpaka/intrinsic/IntrinsicCpu.hpp"
Expand Down Expand Up @@ -68,6 +69,7 @@ namespace alpaka
, public BlockSharedMemDynMember<>
, public BlockSharedMemStMemberMasterSync<>
, public BlockSyncBarrierOmp
, public GridSyncOmp
, public IntrinsicCpu
, public MemFenceOmp2Threads
# ifdef ALPAKA_DISABLE_VENDOR_RNG
Expand Down Expand Up @@ -162,7 +164,9 @@ namespace alpaka
// m_sharedMemSizeBytes
memBytes,
// m_globalMemSizeBytes
memBytes};
memBytes,
// m_cooperativeLaunch
true};
}
};

Expand Down Expand Up @@ -206,6 +210,30 @@ namespace alpaka
}
};

//! The CPU OpenMP 2.0 thread accelerator execution cooperative task type trait specialization.
template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
struct CreateTaskCooperativeKernel<AccCpuOmp2Threads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
    //! Creates a cooperative-kernel task for the OpenMP 2.0 threads accelerator.
    //!
    //! \param workDiv The work division; cooperative launches on this accelerator
    //!                are restricted to exactly one block (see the check below).
    //! \param kernelFnObj The kernel function object.
    //! \param args,... The kernel invocation arguments.
    //! \throw std::runtime_error If more than one grid block is requested.
    ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
        TWorkDiv const& workDiv,
        TKernelFnObj const& kernelFnObj,
        TArgs&&... args)
    {
        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
        if(gridBlockExtent.prod() != static_cast<TIdx>(1u))
        {
            // BUGFIX: fixed the typo "useing" in the error message.
            throw std::runtime_error("OpenMP 2.0 thread accelerator supports only a single block operation "
                                     "with cooperative kernel!\n"
                                     "Consider using a different CPU accelerator.");
        }

        return TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>(
            workDiv,
            kernelFnObj,
            std::forward<TArgs>(args)...);
    }
};

//! The CPU OpenMP 2.0 thread execution task platform type trait specialization.
template<typename TDim, typename TIdx>
struct PlatformType<AccCpuOmp2Threads<TDim, TIdx>>
Expand Down
Loading
Loading