Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for cooperative groups and grid synchronization #2307

Draft
wants to merge 42 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
cadb1af
Add CreateTaskCooperativeKernel, grid sync and HelloWorldGridSyncExam…
MichaelVarvarin Jul 1, 2024
0453b9f
Add comment about issue with grid sync on CUDA Clang
MichaelVarvarin Jul 26, 2024
df5d4fd
Add cooperative kernel launch and grid sync support for HIP
MichaelVarvarin Jul 26, 2024
978f195
Add m_cooperativeLaunch device prop and runtime check for CG support …
MichaelVarvarin Jul 29, 2024
b7aee7a
Clean errors in previous commit
MichaelVarvarin Aug 1, 2024
3acaf11
Clean formatting
MichaelVarvarin Aug 2, 2024
5e9c5ce
Add getMaxActiveBlocks to get the maximum allowed block count for lau…
MichaelVarvarin Aug 7, 2024
a7a9b03
Rename maxActiveBlocks trait
MichaelVarvarin Aug 10, 2024
08e18cb
Fix issues from bad rebase
MichaelVarvarin Aug 12, 2024
c6dc462
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for A…
MichaelVarvarin Aug 12, 2024
e88e0c1
Clean formatting
MichaelVarvarin Aug 13, 2024
bf9ddf0
Correct the comment
MichaelVarvarin Aug 13, 2024
148c521
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for O…
MichaelVarvarin Aug 20, 2024
db43ec9
Clean formatting
MichaelVarvarin Aug 20, 2024
5f0bfa8
Update comments
MichaelVarvarin Aug 20, 2024
9247156
Add include gridSync OMP to alpaka.hpp
MichaelVarvarin Aug 27, 2024
907c0b9
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for s…
MichaelVarvarin Aug 27, 2024
0455158
Clean warnings for CPU accelerators
MichaelVarvarin Sep 9, 2024
f7da2fe
Clean warnings for the HIP accelerator
MichaelVarvarin Sep 9, 2024
b20ddf2
Merge SYCL changes (#2)
MichaelVarvarin Nov 11, 2024
7cf652e
Revert "Merge SYCL changes (#2)" (#3)
MichaelVarvarin Nov 11, 2024
aaed855
Add cooperative groups and grid sync functionality to SYCL
MichaelVarvarin Nov 11, 2024
fab1f24
Rewrite example to use executeForEachAccTag
MichaelVarvarin Nov 12, 2024
1222309
Change from using concepts to interface due to rebase
MichaelVarvarin Nov 12, 2024
f2af59a
Implement grid sync and cooperative kernel functionality for Intel TBB
MichaelVarvarin Nov 16, 2024
215a292
BUGFIX: Change m_cooperativeLaunch prop for SYCL to true m_cooperativ…
MichaelVarvarin Nov 16, 2024
c95b136
Change AccCpuTbbBlocks object back to being local for each thread
MichaelVarvarin Nov 17, 2024
53e51c2
BUGFIX: add includes to achieve compilation
MichaelVarvarin Nov 17, 2024
0ff0779
Add gridSyncTest to test syncGridThreads()
MichaelVarvarin Nov 17, 2024
1070851
Clearer code and comments for helloWorldGridSync
MichaelVarvarin Nov 18, 2024
d83dfed
Add check if grid sync was successful
MichaelVarvarin Nov 18, 2024
0115af2
Remove template from example kernel
MichaelVarvarin Nov 18, 2024
07aed98
Try different parameter expansion
MichaelVarvarin Nov 18, 2024
62a58cc
Revert "Try different parameter expansion"
MichaelVarvarin Nov 18, 2024
4d914bd
Formatting changes
MichaelVarvarin Nov 18, 2024
ceac5d3
Formatting changes
MichaelVarvarin Nov 18, 2024
0a990d4
BUGFIX: Add explicit conversions
MichaelVarvarin Nov 19, 2024
dfcfb28
BUGFIX: add missing argument in GridSyncTest
MichaelVarvarin Nov 25, 2024
1203c09
BUGFIX: fix SYCL compilation error by passing additional template arg…
MichaelVarvarin Nov 25, 2024
310ad88
BUGFIX: add runtime check for cooperative launch support
MichaelVarvarin Nov 26, 2024
05effa5
Suppress GCC warning about pointer to object conversion
MichaelVarvarin Nov 26, 2024
9568256
Change format specifiers
MichaelVarvarin Nov 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/alpakaCommon.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -735,7 +735,7 @@ if(alpaka_ACC_SYCL_ENABLE)

#-----------------------------------------------------------------------------------------------------------------
# Generic SYCL options
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-unnamed-lambda") # Compiler default but made explicit here
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-unnamed-lambda") # Compiler default but made explicit here

if(alpaka_RELOCATABLE_DEVICE_CODE STREQUAL ON)
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-rdc")
Expand Down
1 change: 1 addition & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_subdirectory("heatEquation/")
add_subdirectory("heatEquation2D/")
add_subdirectory("helloWorld/")
add_subdirectory("helloWorldLambda/")
add_subdirectory("helloWorldGridSync/")
add_subdirectory("kernelSpecialization/")
add_subdirectory("ls/")
add_subdirectory("matrixMulWithMdspan/")
Expand Down
47 changes: 47 additions & 0 deletions example/helloWorldGridSync/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Copyright 2024 Mykhailo Varvarin
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.22)

# Group targets into folders in IDEs that support it (see FOLDER property below).
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME helloWorldGridSync)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.

# Skip the lookup entirely when the example is built as part of the alpaka tree,
# where the alpaka::alpaka target already exists.
if(NOT TARGET alpaka::alpaka)
    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

    if(alpaka_USE_SOURCE_TREE)
        # Don't build the examples recursively
        set(alpaka_BUILD_EXAMPLES OFF)
        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
    else()
        find_package(alpaka REQUIRED)
    endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
    ${_TARGET_NAME}
    src/helloWorldGridSync.cpp)
target_link_libraries(
    ${_TARGET_NAME}
    PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

# Register the example as a CTest test so it runs in the test suite.
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
165 changes: 165 additions & 0 deletions example/helloWorldGridSync/src/helloWorldGridSync.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExecuteForEachAccTag.hpp>

#include <cstdint>
#include <iostream>

//! Hello world kernel, utilizing grid synchronization.
//! Prints hello world from a thread, performs a grid sync,
//! and prints the sum of indices of this thread and the opposite thread (the sums have to be the same).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] Could you explain what is the opposite thread here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thread that has the same distance from the end of the grid dimension as this one has from the start. So, if the IDs range from 0 to 9, the pairs are 0 and 9, 1 and 8, 2 and 7, and so on. Their sum is constant, so we can check whether the grid sync was performed successfully.

//! Prints an error if sum is incorrect.
struct HelloWorldKernel
{
    //! Each grid thread writes its index into \p array, performs a grid
    //! synchronization, and then verifies that the thread at the mirrored
    //! position of the 1D array has written its index as well. For a valid
    //! sync, index i and index (extent - 1 - i) always sum to (extent - 1).
    //!
    //! \param acc The accelerator the kernel runs on.
    //! \param array Device buffer with one slot per grid thread.
    //! \param success Set to false when the post-sync verification fails.
    template<typename Acc>
    ALPAKA_FN_ACC void operator()(Acc const& acc, size_t* array, bool* success) const
    {
        // Get index of the current thread in the grid and the total number of threads.
        size_t gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
        size_t gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];

        if(gridThreadIdx == 0)
            printf("Hello, World from alpaka thread %zu!\n", gridThreadIdx);

        // Write the index of the thread to array.
        array[gridThreadIdx] = gridThreadIdx;

        // Perform grid synchronization.
        alpaka::syncGridThreads(acc);

        // Get the index of the thread from the opposite side of 1D array.
        size_t gridThreadIdxOpposite = array[gridThreadExtent - gridThreadIdx - 1];

        // Sum them.
        size_t sum = gridThreadIdx + gridThreadIdxOpposite;

        // Get the expected sum.
        size_t expectedSum = gridThreadExtent - 1;

        // Report an error if the grid synchronization failed.
        if(sum != expectedSum)
        {
            *success = false;
            // BUGFIX: this branch only runs when sum != expectedSum, so the former
            // "%s" ternary selecting between "\n" and the error suffix was dead
            // code - the error suffix is always the correct choice here. The
            // emitted text is unchanged.
            printf(
                "After grid sync, this thread is %zu, thread on the opposite side is %zu. Their sum is %zu, expected: "
                "%zu. ERROR: the sum is incorrect.\n",
                gridThreadIdx,
                gridThreadIdxOpposite,
                sum,
                expectedSum);
        }
    }
};

// In standard projects, you typically do not execute the code with any available accelerator.
// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
// selected accelerator only. If you use the example as the starting point for your project, you can rename the
// example() function to main() and move the accelerator tag to the function body.
template<typename TAccTag>
auto example(TAccTag const&) -> int
{
    // Map the tag to a concrete accelerator type.
    // For simplicity this example always uses 1 dimensional indexing and index type size_t.
    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, std::size_t>;
    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;

    // Dimensionality and index type used for the kernel launch.
    using Dim = alpaka::DimInt<1>;
    using Idx = size_t;


    // Pick the first device of the chosen accelerator platform.
    auto const platformAcc = alpaka::Platform<Acc>{};
    auto const devAcc = getDevByIdx(platformAcc, 0u);

    // Pick the first CPU device as the host side.
    constexpr auto platformHost = alpaka::Platform<alpaka::DevCpu>{};
    auto const devHost = getDevByIdx(platformHost, 0u);

    // A blocking queue on the accelerator device.
    using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
    auto queue = Queue{devAcc};

    // Requested launch configuration: blocks per grid, threads per block,
    // and elements per thread.
    Idx blocksPerGrid = 100;
    Idx threadsPerBlock = 1;
    Idx elementsPerThread = 1;

    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    // Device buffer holding one slot per grid thread.
    alpaka::Vec<Dim, Idx> bufferExtent{blocksPerGrid * threadsPerBlock};
    auto deviceMemory = alpaka::allocBuf<Idx, Idx>(devAcc, bufferExtent);

    // Device-side success flag, initialized to true (byte-wise).
    auto bufAccResult = alpaka::allocBuf<bool, Idx>(devAcc, static_cast<Idx>(1u));
    memset(queue, bufAccResult, static_cast<std::uint8_t>(true));


    // The kernel instance to be launched.
    HelloWorldKernel helloWorldKernel;

    // Ask the device how many blocks a cooperative launch of this kernel may use.
    int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
        devAcc,
        helloWorldKernel,
        threadsPerBlock,
        elementsPerThread,
        getPtrNative(deviceMemory),
        getPtrNative(bufAccResult));
    std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;

    // Clamp the requested block count to the device limit and build the work division.
    blocksPerGrid = std::min(static_cast<Idx>(maxBlocks), blocksPerGrid);
    auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};

    // Build the kernel task via the cooperative launch API.
    // Only cooperative kernels are allowed to perform grid synchronization.
    auto taskRunKernel = alpaka::createTaskCooperativeKernel<Acc>(
        workDiv,
        helloWorldKernel,
        getPtrNative(deviceMemory),
        getPtrNative(bufAccResult));

    // Run the kernel.
    alpaka::enqueue(queue, taskRunKernel);

    // Fetch the success flag back to the host.
    auto bufHostResult = alpaka::allocBuf<bool, Idx>(devHost, static_cast<Idx>(1u));
    memcpy(queue, bufHostResult, bufAccResult);
    wait(queue);

    bool const gridSyncSucceeded = *getPtrNative(bufHostResult);

    return gridSyncSucceeded ? EXIT_SUCCESS : EXIT_FAILURE;
}

auto main() -> int
{
    // Run the example once for every accelerator enabled in this build.
    // To run it for one specific accelerator instead, use:
    // \code{.cpp}
    // auto tag = TagCpuSerial;
    // return example(tag);
    // \endcode
    //
    // valid tags:
    // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
    // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads,
    // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
    return alpaka::executeForEachAccTag([](auto const& accTag) { return example(accTag); });
}
33 changes: 32 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Blocks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
#include "alpaka/block/sync/BlockSyncNoOp.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncBarrierCpuOmp.hpp"
#include "alpaka/idx/bt/IdxBtZero.hpp"
#include "alpaka/idx/gb/IdxGbRef.hpp"
#include "alpaka/intrinsic/IntrinsicCpu.hpp"
Expand Down Expand Up @@ -68,6 +69,7 @@ namespace alpaka
, public BlockSharedMemDynMember<>
, public BlockSharedMemStMember<>
, public BlockSyncNoOp
, public GridSyncOmp
, public IntrinsicCpu
, public MemFenceOmp2Blocks
# ifdef ALPAKA_DISABLE_VENDOR_RNG
Expand Down Expand Up @@ -152,7 +154,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
true};
}
};

Expand Down Expand Up @@ -203,6 +207,33 @@ namespace alpaka
}
};

//! The CPU OpenMP 2.0 block accelerator execution cooperative task type trait specialization.
template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
struct CreateTaskCooperativeKernel<AccCpuOmp2Blocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
    //! Creates a cooperative-kernel task for the OpenMP 2.0 blocks accelerator.
    //!
    //! \param workDiv The work division for the launch.
    //! \param kernelFnObj The kernel function object.
    //! \param args,... The kernel invocation arguments.
    //! \throw std::runtime_error If the grid block count does not equal
    //!        omp_get_max_threads() — presumably a grid barrier here only works
    //!        when every block runs on its own OpenMP thread (TODO confirm).
    ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
        TWorkDiv const& workDiv,
        TKernelFnObj const& kernelFnObj,
        TArgs&&... args)
    {
        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
        auto const maxBlocks = omp_get_max_threads();
        // NOTE(review): the check rejects any mismatch, not only "larger than".
        if(gridBlockExtent.prod() != static_cast<TIdx>(maxBlocks))
        {
            // BUGFIX: fixed the typo "maximuma" and reworded the message to match
            // the actual condition, which requires equality.
            throw std::runtime_error(
                "The number of requested blocks is not equal to the maximum of the device for OpenMP 2.0 blocks "
                "accelerator. Requested: "
                + std::to_string(gridBlockExtent.prod()) + ", maximum allowed: " + std::to_string(maxBlocks)
                + ". Use getMaxActiveBlocks().");
        }

        return TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>(
            workDiv,
            kernelFnObj,
            std::forward<TArgs>(args)...);
    }
};

//! The CPU OpenMP 2.0 block execution task platform type trait specialization.
template<typename TDim, typename TIdx>
struct PlatformType<AccCpuOmp2Blocks<TDim, TIdx>>
Expand Down
30 changes: 29 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Threads.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp"
#include "alpaka/block/sync/BlockSyncBarrierOmp.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncBarrierCpuOmp.hpp"
#include "alpaka/idx/bt/IdxBtOmp.hpp"
#include "alpaka/idx/gb/IdxGbRef.hpp"
#include "alpaka/intrinsic/IntrinsicCpu.hpp"
Expand Down Expand Up @@ -68,6 +69,7 @@ namespace alpaka
, public BlockSharedMemDynMember<>
, public BlockSharedMemStMemberMasterSync<>
, public BlockSyncBarrierOmp
, public GridSyncOmp
, public IntrinsicCpu
, public MemFenceOmp2Threads
# ifdef ALPAKA_DISABLE_VENDOR_RNG
Expand Down Expand Up @@ -162,7 +164,9 @@ namespace alpaka
// m_sharedMemSizeBytes
memBytes,
// m_globalMemSizeBytes
memBytes};
memBytes,
// m_cooperativeLaunch
true};
}
};

Expand Down Expand Up @@ -206,6 +210,30 @@ namespace alpaka
}
};

//! The CPU OpenMP 2.0 thread accelerator execution cooperative task type trait specialization.
template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
struct CreateTaskCooperativeKernel<AccCpuOmp2Threads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
    //! Creates a cooperative-kernel task for the OpenMP 2.0 threads accelerator.
    //!
    //! \param workDiv The work division; cooperative launches on this accelerator
    //!                are restricted to exactly one block (see the check below).
    //! \param kernelFnObj The kernel function object.
    //! \param args,... The kernel invocation arguments.
    //! \throw std::runtime_error If more than one grid block is requested.
    ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
        TWorkDiv const& workDiv,
        TKernelFnObj const& kernelFnObj,
        TArgs&&... args)
    {
        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
        if(gridBlockExtent.prod() != static_cast<TIdx>(1u))
        {
            // BUGFIX: fixed the typo "useing" in the error message.
            throw std::runtime_error("OpenMP 2.0 thread accelerator supports only a single block operation "
                                     "with cooperative kernel!\n"
                                     "Consider using a different CPU accelerator.");
        }

        return TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>(
            workDiv,
            kernelFnObj,
            std::forward<TArgs>(args)...);
    }
};

//! The CPU OpenMP 2.0 thread execution task platform type trait specialization.
template<typename TDim, typename TIdx>
struct PlatformType<AccCpuOmp2Threads<TDim, TIdx>>
Expand Down
Loading
Loading