From 98f9524fbaa9356385f2f80188cc055385018519 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 6 Jan 2025 12:03:59 -0600 Subject: [PATCH] [libc] Switch to using the generic `` implementations Summary: This patch switches the GPU utility helpers to wrapping around the gpuintrin.h ones with a C++ flavor. --- libc/src/__support/GPU/CMakeLists.txt | 10 +- libc/src/__support/GPU/amdgpu/CMakeLists.txt | 7 - libc/src/__support/GPU/amdgpu/utils.h | 183 ------------------ libc/src/__support/GPU/generic/CMakeLists.txt | 7 - libc/src/__support/GPU/generic/utils.h | 84 -------- libc/src/__support/GPU/nvptx/CMakeLists.txt | 7 - libc/src/__support/GPU/nvptx/utils.h | 160 --------------- libc/src/__support/GPU/utils.h | 108 ++++++++--- libc/src/time/gpu/clock.cpp | 2 + libc/src/time/gpu/nanosleep.cpp | 1 + 10 files changed, 90 insertions(+), 479 deletions(-) delete mode 100644 libc/src/__support/GPU/amdgpu/CMakeLists.txt delete mode 100644 libc/src/__support/GPU/amdgpu/utils.h delete mode 100644 libc/src/__support/GPU/generic/CMakeLists.txt delete mode 100644 libc/src/__support/GPU/generic/utils.h delete mode 100644 libc/src/__support/GPU/nvptx/CMakeLists.txt delete mode 100644 libc/src/__support/GPU/nvptx/utils.h diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt index 28fd9a1ebcc97e..9b359f65cdb332 100644 --- a/libc/src/__support/GPU/CMakeLists.txt +++ b/libc/src/__support/GPU/CMakeLists.txt @@ -1,16 +1,12 @@ -if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) +# These utilities are GPU only. +if(NOT LIBC_TARGET_OS_IS_GPU) return() endif() -add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) -set(target_gpu_utils libc.src.__support.GPU.${LIBC_TARGET_ARCHITECTURE}.${LIBC_TARGET_ARCHITECTURE}_utils) - add_header_library( utils HDRS utils.h - DEPENDS - ${target_gpu_utils} ) add_object_library( @@ -21,6 +17,6 @@ add_object_library( allocator.h DEPENDS libc.src.__support.common - libc.src.__support.GPU.utils libc.src.__support.RPC.rpc_client + .utils ) diff --git a/libc/src/__support/GPU/amdgpu/CMakeLists.txt b/libc/src/__support/GPU/amdgpu/CMakeLists.txt deleted file mode 100644 index f2b98fc03b218d..00000000000000 --- a/libc/src/__support/GPU/amdgpu/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - amdgpu_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h deleted file mode 100644 index 6ab95403ca3890..00000000000000 --- a/libc/src/__support/GPU/amdgpu/utils.h +++ /dev/null @@ -1,183 +0,0 @@ -//===-------------- AMDGPU implementation of GPU utils ----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -/// Type aliases to the address spaces used by the AMDGPU backend. -template using Private = [[clang::opencl_private]] T; -template using Constant = [[clang::opencl_constant]] T; -template using Local = [[clang::opencl_local]] T; -template using Global = [[clang::opencl_global]] T; - -/// Returns the number of workgroups in the 'x' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_x() { - return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); -} - -/// Returns the number of workgroups in the 'y' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_y() { - return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); -} - -/// Returns the number of workgroups in the 'z' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_z() { - return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); -} - -/// Returns the total number of workgruops in the grid. -LIBC_INLINE uint64_t get_num_blocks() { - return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); -} - -/// Returns the 'x' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_x() { - return __builtin_amdgcn_workgroup_id_x(); -} - -/// Returns the 'y' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_y() { - return __builtin_amdgcn_workgroup_id_y(); -} - -/// Returns the 'z' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_z() { - return __builtin_amdgcn_workgroup_id_z(); -} - -/// Returns the absolute id of the AMD workgroup. -LIBC_INLINE uint64_t get_block_id() { - return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + - get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); -} - -/// Returns the number of workitems in the 'x' dimension. -LIBC_INLINE uint32_t get_num_threads_x() { - return __builtin_amdgcn_workgroup_size_x(); -} - -/// Returns the number of workitems in the 'y' dimension. -LIBC_INLINE uint32_t get_num_threads_y() { - return __builtin_amdgcn_workgroup_size_y(); -} - -/// Returns the number of workitems in the 'z' dimension. -LIBC_INLINE uint32_t get_num_threads_z() { - return __builtin_amdgcn_workgroup_size_z(); -} - -/// Returns the total number of workitems in the workgroup. -LIBC_INLINE uint64_t get_num_threads() { - return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); -} - -/// Returns the 'x' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_x() { - return __builtin_amdgcn_workitem_id_x(); -} - -/// Returns the 'y' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_y() { - return __builtin_amdgcn_workitem_id_y(); -} - -/// Returns the 'z' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_z() { - return __builtin_amdgcn_workitem_id_z(); -} - -/// Returns the absolute id of the thread in the current AMD workgroup. -LIBC_INLINE uint64_t get_thread_id() { - return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + - get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); -} - -/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware -/// and compilation options. -LIBC_INLINE uint32_t get_lane_size() { - return __builtin_amdgcn_wavefrontsize(); -} - -/// Returns the id of the thread inside of an AMD wavefront executing together. -[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} - -/// Returns the bit-mask of active threads in the current wavefront. -[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() { - return __builtin_amdgcn_read_exec(); -} - -/// Copies the value from the first active thread in the wavefront to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t, - uint32_t x) { - return __builtin_amdgcn_readfirstlane(x); -} - -/// Returns a bitmask of threads in the current lane for which \p x is true. -[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - // the lane_mask & gives the nvptx semantics when lane_mask is a subset of - // the active threads - return lane_mask & __builtin_amdgcn_ballot_w64(x); -} - -/// Waits for all the threads in the block to converge and issues a fence. -[[clang::convergent]] LIBC_INLINE void sync_threads() { - __builtin_amdgcn_s_barrier(); - __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); -} - -/// Waits for all pending memory operations to complete in program order. -[[clang::convergent]] LIBC_INLINE void memory_fence() { - __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); -} - -/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU. -[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) { - __builtin_amdgcn_wave_barrier(); -} - -/// Shuffles the the lanes inside the wavefront according to the given index. -[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx, - uint32_t x) { - return __builtin_amdgcn_ds_bpermute(idx << 2, x); -} - -/// Returns the current value of the GPU's processor clock. -/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter. -LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } - -/// Returns a fixed-frequency timestamp. The actual frequency is dependent on -/// the card and can only be queried via the driver. -LIBC_INLINE uint64_t fixed_frequency_clock() { - return __builtin_readsteadycounter(); -} - -/// Terminates execution of the associated wavefront. -[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); } - -/// Returns a unique identifier for the process cluster the current wavefront is -/// executing on. Here we use the identifier for the compute unit (CU) and -/// shader engine. -/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface -/// than the one at -/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899 -LIBC_INLINE uint32_t get_cluster_id() { return 0; } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif diff --git a/libc/src/__support/GPU/generic/CMakeLists.txt b/libc/src/__support/GPU/generic/CMakeLists.txt deleted file mode 100644 index 68ba7d1ec80e95..00000000000000 --- a/libc/src/__support/GPU/generic/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - generic_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h deleted file mode 100644 index 9461ef0aa245b7..00000000000000 --- a/libc/src/__support/GPU/generic/utils.h +++ /dev/null @@ -1,84 +0,0 @@ -//===-------------- Generic implementation of GPU utils ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -template using Private = T; -template using Constant = T; -template using Shared = T; -template using Global = T; - -LIBC_INLINE uint32_t get_num_blocks_x() { return 1; } - -LIBC_INLINE uint32_t get_num_blocks_y() { return 1; } - -LIBC_INLINE uint32_t get_num_blocks_z() { return 1; } - -LIBC_INLINE uint64_t get_num_blocks() { return 1; } - -LIBC_INLINE uint32_t get_block_id_x() { return 0; } - -LIBC_INLINE uint32_t get_block_id_y() { return 0; } - -LIBC_INLINE uint32_t get_block_id_z() { return 0; } - -LIBC_INLINE uint64_t get_block_id() { return 0; } - -LIBC_INLINE uint32_t get_num_threads_x() { return 1; } - -LIBC_INLINE uint32_t get_num_threads_y() { return 1; } - -LIBC_INLINE uint32_t get_num_threads_z() { return 1; } - -LIBC_INLINE uint64_t get_num_threads() { return 1; } - -LIBC_INLINE uint32_t get_thread_id_x() { return 0; } - -LIBC_INLINE uint32_t get_thread_id_y() { return 0; } - -LIBC_INLINE uint32_t get_thread_id_z() { return 0; } - -LIBC_INLINE uint64_t get_thread_id() { return 0; } - -LIBC_INLINE uint32_t get_lane_size() { return 1; } - -LIBC_INLINE uint32_t get_lane_id() { return 0; } - -LIBC_INLINE uint64_t get_lane_mask() { return 1; } - -LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; } - -LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; } - -LIBC_INLINE void sync_threads() {} - -LIBC_INLINE void sync_lane(uint64_t) {} - -LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; } - -LIBC_INLINE uint64_t processor_clock() { return 0; } - -LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; } - -[[noreturn]] LIBC_INLINE void end_program() { __builtin_unreachable(); } - -LIBC_INLINE uint32_t get_cluster_id() { return 0; } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H diff --git a/libc/src/__support/GPU/nvptx/CMakeLists.txt b/libc/src/__support/GPU/nvptx/CMakeLists.txt deleted file mode 100644 index 0d3f8c7933c86c..00000000000000 --- a/libc/src/__support/GPU/nvptx/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - nvptx_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h deleted file mode 100644 index 1a43a839a9ce46..00000000000000 --- a/libc/src/__support/GPU/nvptx/utils.h +++ /dev/null @@ -1,160 +0,0 @@ -//===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-id: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -/// Type aliases to the address spaces used by the NVPTX backend. -template using Private = [[clang::opencl_private]] T; -template using Constant = [[clang::opencl_constant]] T; -template using Local = [[clang::opencl_local]] T; -template using Global = [[clang::opencl_global]] T; - -/// Returns the number of CUDA blocks in the 'x' dimension. -LIBC_INLINE uint32_t get_num_blocks_x() { - return __nvvm_read_ptx_sreg_nctaid_x(); -} - -/// Returns the number of CUDA blocks in the 'y' dimension. -LIBC_INLINE uint32_t get_num_blocks_y() { - return __nvvm_read_ptx_sreg_nctaid_y(); -} - -/// Returns the number of CUDA blocks in the 'z' dimension. -LIBC_INLINE uint32_t get_num_blocks_z() { - return __nvvm_read_ptx_sreg_nctaid_z(); -} - -/// Returns the total number of CUDA blocks. -LIBC_INLINE uint64_t get_num_blocks() { - return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); -} - -/// Returns the 'x' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); } - -/// Returns the 'y' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); } - -/// Returns the 'z' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); } - -/// Returns the absolute id of the CUDA block. -LIBC_INLINE uint64_t get_block_id() { - return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + - get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); -} - -/// Returns the number of CUDA threads in the 'x' dimension. -LIBC_INLINE uint32_t get_num_threads_x() { - return __nvvm_read_ptx_sreg_ntid_x(); -} - -/// Returns the number of CUDA threads in the 'y' dimension. -LIBC_INLINE uint32_t get_num_threads_y() { - return __nvvm_read_ptx_sreg_ntid_y(); -} - -/// Returns the number of CUDA threads in the 'z' dimension. -LIBC_INLINE uint32_t get_num_threads_z() { - return __nvvm_read_ptx_sreg_ntid_z(); -} - -/// Returns the total number of threads in the block. -LIBC_INLINE uint64_t get_num_threads() { - return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); -} - -/// Returns the 'x' dimension id of the thread in the current CUDA block. -LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); } - -/// Returns the 'y' dimension id of the thread in the current CUDA block. -LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); } - -/// Returns the 'z' dimension id of the thread in the current CUDA block. -LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); } - -/// Returns the absolute id of the thread in the current CUDA block. -LIBC_INLINE uint64_t get_thread_id() { - return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + - get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); -} - -/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware. -LIBC_INLINE uint32_t get_lane_size() { return 32; } - -/// Returns the id of the thread inside of a CUDA warp executing together. -[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - return __nvvm_read_ptx_sreg_laneid(); -} - -/// Returns the bit-mask of active threads in the current warp. -[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() { - return __nvvm_activemask(); -} - -/// Copies the value from the first active thread in the warp to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, - uint32_t x) { - uint32_t mask = static_cast(lane_mask); - uint32_t id = __builtin_ffs(mask) - 1; - return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1); -} - -/// Returns a bitmask of threads in the current lane for which \p x is true. -[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - uint32_t mask = static_cast(lane_mask); - return __nvvm_vote_ballot_sync(mask, x); -} - -/// Waits for all the threads in the block to converge and issues a fence. -[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); } - -/// Waits for all pending memory operations to complete in program order. -[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); } - -/// Waits for all threads in the warp to reconverge for independent scheduling. -[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) { - __nvvm_bar_warp_sync(static_cast(mask)); -} - -/// Shuffles the the lanes inside the warp according to the given index. -[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, - uint32_t idx, uint32_t x) { - uint32_t mask = static_cast(lane_mask); - uint32_t bitmask = (mask >> idx) & 1; - return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1); -} - -/// Returns the current value of the GPU's processor clock. -LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } - -/// Returns a global fixed-frequency timer at nanosecond frequency. -LIBC_INLINE uint64_t fixed_frequency_clock() { - return __builtin_readsteadycounter(); -} - -/// Terminates execution of the calling thread. -[[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); } - -/// Returns a unique identifier for the process cluster the current warp is -/// executing on. Here we use the identifier for the symmetric multiprocessor. -LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h index ae52e7a088ad51..e138c84c0cb22d 100644 --- a/libc/src/__support/GPU/utils.h +++ b/libc/src/__support/GPU/utils.h @@ -9,48 +9,108 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H +#include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" -#if defined(LIBC_TARGET_ARCH_IS_AMDGPU) -#include "amdgpu/utils.h" -#elif defined(LIBC_TARGET_ARCH_IS_NVPTX) -#include "nvptx/utils.h" -#else -#include "generic/utils.h" +#if !__has_include() +#error "Unsupported compiler" #endif +#include + namespace LIBC_NAMESPACE_DECL { namespace gpu { -/// Get the first active thread inside the lane. -LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) { - return __builtin_ffsll(lane_mask) - 1; + +template using Private = __gpu_private T; +template using Constant = __gpu_constant T; +template using Local = __gpu_local T; +template using Global = __gpu_local T; + +LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); } + +LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); } + +LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); } + +LIBC_INLINE uint64_t get_num_blocks() { + return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); +} + +LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); } + +LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); } + +LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); } + +LIBC_INLINE uint64_t get_block_id() { + return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + + get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); +} + +LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); } + +LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); } + +LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); } + +LIBC_INLINE uint64_t get_num_threads() { + return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); +} + +LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); } + +LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); } + +LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); } + +LIBC_INLINE uint64_t get_thread_id() { + return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + + get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); +} + +LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); } + +LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); } + +LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); } + +LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) { + return __gpu_read_first_lane_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { + return __gpu_ballot(lane_mask, x); +} + +LIBC_INLINE void sync_threads() { __gpu_sync_threads(); } + +LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); } + +LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) { + return __gpu_shuffle_idx_u32(lane_mask, idx, x); } -/// Conditional that is only true for a single thread in a lane. +[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); } + LIBC_INLINE bool is_first_lane(uint64_t lane_mask) { - return gpu::get_lane_id() == get_first_lane_id(lane_mask); + return __gpu_is_first_in_lane(lane_mask); } -/// Gets the sum of all lanes inside the warp or wavefront. LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) { - for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) { - uint32_t index = step + gpu::get_lane_id(); - x += gpu::shuffle(lane_mask, index, x); - } - return gpu::broadcast_value(lane_mask, x); + return __gpu_lane_sum_u32(lane_mask, x); } -/// Gets the accumulator scan of the threads in the warp or wavefront. LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) { - for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) { - uint32_t index = gpu::get_lane_id() - step; - uint32_t bitmask = gpu::get_lane_id() >= step; - x += -bitmask & gpu::shuffle(lane_mask, index, x); - } - return x; + return __gpu_lane_scan_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t fixed_frequency_clock() { + return __builtin_readsteadycounter(); } +LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } + } // namespace gpu } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/gpu/clock.cpp b/libc/src/time/gpu/clock.cpp index add5b2725ef8f0..8609c5cd6b6b73 100644 --- a/libc/src/time/gpu/clock.cpp +++ b/libc/src/time/gpu/clock.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/time/clock.h" + +#include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/time/gpu/time_utils.h" diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp index a92f660f225cb1..d22d9d6bd8d792 100644 --- a/libc/src/time/gpu/nanosleep.cpp +++ b/libc/src/time/gpu/nanosleep.cpp @@ -8,6 +8,7 @@ #include "src/time/nanosleep.h" +#include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/time/gpu/time_utils.h"