From 60e9cb89cabb36d507bdeea199c97a11f41787dd Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:35:54 +0000 Subject: [PATCH 01/12] Enable runtime hipification --- cupy_backends/cuda/api/runtime.pyx | 2106 +++++++++++++------------- cupy_backends/hip/cupy_hip_runtime.h | 493 +----- install/cupy_builder/_features.py | 1 + 3 files changed, 1072 insertions(+), 1528 deletions(-) diff --git a/cupy_backends/cuda/api/runtime.pyx b/cupy_backends/cuda/api/runtime.pyx index 63b11edc8c5..34f12afecb0 100644 --- a/cupy_backends/cuda/api/runtime.pyx +++ b/cupy_backends/cuda/api/runtime.pyx @@ -20,1079 +20,1109 @@ cimport cython # NOQA from cupy_backends.cuda.api cimport driver # NOQA from cupy_backends.cuda.libs cimport nvrtc # no-cython-lint +IF CUPY_USE_GEN_HIP_CODE: + from cupy_backends.cuda.api.runtime_hip import * +ELSE: + ########################################################################### + # Classes + ########################################################################### -############################################################################### -# Classes -############################################################################### + cdef class PointerAttributes: -cdef class PointerAttributes: + def __init__(self, int device, intptr_t devicePointer, + intptr_t hostPointer, int type=-1): + self.type = type + self.device = device + self.devicePointer = devicePointer + self.hostPointer = hostPointer - def __init__(self, int device, intptr_t devicePointer, - intptr_t hostPointer, int type=-1): - self.type = type - self.device = device - self.devicePointer = devicePointer - self.hostPointer = hostPointer + cdef class MemPoolProps: -cdef class MemPoolProps: + def __init__(self, int allocType, int handleType, + int locationType, int devId): + self.allocType = allocType + self.handleType = handleType + self.locationType = locationType + self.devId = devId - def __init__( - self, int allocType, int handleType, int locationType, int devId): - self.allocType = allocType - self.handleType = handleType - self.locationType = locationType - self.devId = devId + ########################################################################### + # Thread-local storage + ########################################################################### -############################################################################### -# Thread-local storage -############################################################################### + cdef object _thread_local = _threading.local() -cdef object _thread_local = _threading.local() + cdef class _ThreadLocal: -cdef class _ThreadLocal: + cdef list context_initialized - cdef list context_initialized + def __init__(self): + cdef int i + self.context_initialized = [False for i in range(getDeviceCount())] - def __init__(self): - cdef int i - self.context_initialized = [False for i in range(getDeviceCount())] + @staticmethod + cdef _ThreadLocal get(): + try: + tls = _thread_local.tls + except AttributeError: + tls = _thread_local.tls = _ThreadLocal() + return <_ThreadLocal>tls - @staticmethod - cdef _ThreadLocal get(): - try: - tls = _thread_local.tls - except AttributeError: - tls = _thread_local.tls = _ThreadLocal() - return <_ThreadLocal>tls + ########################################################################### + # Extern + ########################################################################### -############################################################################### -# Extern 
-############################################################################### + IF CUPY_USE_CUDA_PYTHON: + from cuda.ccudart cimport * + ELSE: + IF CUPY_HIP_VERSION > 0: + include '_runtime_extern_hip.pxi' + ELSE: + include '_runtime_extern.pxi' + pass # for cython-lint -IF CUPY_USE_CUDA_PYTHON: - from cuda.ccudart cimport * -ELSE: - include '_runtime_extern.pxi' - pass # for cython-lint + cdef extern from '../../cupy_backend_runtime.h' nogil: + bint hip_environment -cdef extern from '../../cupy_backend_runtime.h' nogil: - bint hip_environment + ########################################################################### + # Constants + ########################################################################### -############################################################################### -# Constants -############################################################################### + # TODO(kmaehashi): Deprecate these aliases and use `cuda*`. + errorInvalidValue = cudaErrorInvalidValue + errorMemoryAllocation = cudaErrorMemoryAllocation + errorPeerAccessAlreadyEnabled = cudaErrorPeerAccessAlreadyEnabled + errorContextIsDestroyed = cudaErrorContextIsDestroyed + errorInvalidResourceHandle = cudaErrorInvalidResourceHandle + deviceAttributeComputeCapabilityMajor = cudaDevAttrComputeCapabilityMajor + deviceAttributeComputeCapabilityMinor = cudaDevAttrComputeCapabilityMinor -# TODO(kmaehashi): Deprecate these aliases and use `cuda*`. -errorInvalidValue = cudaErrorInvalidValue -errorMemoryAllocation = cudaErrorMemoryAllocation -errorPeerAccessAlreadyEnabled = cudaErrorPeerAccessAlreadyEnabled -errorContextIsDestroyed = cudaErrorContextIsDestroyed -errorInvalidResourceHandle = cudaErrorInvalidResourceHandle -deviceAttributeComputeCapabilityMajor = cudaDevAttrComputeCapabilityMajor -deviceAttributeComputeCapabilityMinor = cudaDevAttrComputeCapabilityMinor + # Provide access to constants from Python. + # TODO(kmaehashi): Deprecate aliases above so that we can just do: + # from cupy_backends.cuda.api._runtime_enum import * + def _export_enum(): + import sys + import cupy_backends.cuda.api._runtime_enum as _runtime_enum + this = sys.modules[__name__] + for key in dir(_runtime_enum): + if not key.startswith('_'): + setattr(this, key, getattr(_runtime_enum, key)) -# Provide access to constants from Python. 
-# TODO(kmaehashi): Deprecate aliases above so that we can just do: -# from cupy_backends.cuda.api._runtime_enum import * -def _export_enum(): - import sys - import cupy_backends.cuda.api._runtime_enum as _runtime_enum - this = sys.modules[__name__] - for key in dir(_runtime_enum): - if not key.startswith('_'): - setattr(this, key, getattr(_runtime_enum, key)) + _export_enum() -_export_enum() + ########################################################################### + # Constants (CuPy) + ########################################################################### -############################################################################### -# Constants (CuPy) -############################################################################### + _is_hip_environment = hip_environment # for runtime being cimport'd + is_hip = hip_environment # for runtime being import'd -_is_hip_environment = hip_environment # for runtime being cimport'd -is_hip = hip_environment # for runtime being import'd + ########################################################################### + # Error handling + ########################################################################### -############################################################################### -# Error handling -############################################################################### + class CUDARuntimeError(RuntimeError): -class CUDARuntimeError(RuntimeError): + def __init__(self, status): + self.status = status + cdef bytes name = cudaGetErrorName(status) + cdef bytes msg = cudaGetErrorString(status) + super(CUDARuntimeError, self).__init__( + '%s: %s' % (name.decode(), msg.decode())) - def __init__(self, status): - self.status = status - cdef bytes name = cudaGetErrorName(status) - cdef bytes msg = cudaGetErrorString(status) - super(CUDARuntimeError, self).__init__( - '%s: %s' % (name.decode(), msg.decode())) + def __reduce__(self): + return (type(self), (self.status,)) - def __reduce__(self): - return (type(self), (self.status,)) + @cython.profile(False) + cpdef inline check_status(int status): + if status != 0: + # to reset error status + cudaGetLastError() + raise CUDARuntimeError(status) -@cython.profile(False) -cpdef inline check_status(int status): - if status != 0: - # to reset error status - cudaGetLastError() - raise CUDARuntimeError(status) + ########################################################################### + # Initialization + ########################################################################### -############################################################################### -# Initialization -############################################################################### + cpdef int driverGetVersion() except? -1: + cdef int version + status = cudaDriverGetVersion(&version) + check_status(status) + return version + + cpdef int runtimeGetVersion() except? -1: + cdef int version + IF CUPY_USE_CUDA_PYTHON: + # Workarounds an issue that cuda-python returns its version instead + # of the real runtime version. + # https://github.com/NVIDIA/cuda-python/issues/16 + cdef int major, minor + (major, minor) = nvrtc.getVersion() + version = major * 1000 + minor * 10 + ELSE: + status = cudaRuntimeGetVersion(&version) + check_status(status) + return version + + + ########################################################################### + # Device and context operations + ########################################################################### + + cpdef int getDevice() except? 
-1: + cdef int device + status = cudaGetDevice(&device) + check_status(status) + return device -cpdef int driverGetVersion() except? -1: - cdef int version - status = cudaDriverGetVersion(&version) - check_status(status) - return version + cpdef int deviceGetAttribute(int attrib, int device) except? -1: + cdef int ret + status = cudaDeviceGetAttribute(&ret, attrib, device) + check_status(status) + return ret -cpdef int runtimeGetVersion() except? -1: - cdef int version - IF CUPY_USE_CUDA_PYTHON: - # Workarounds an issue that cuda-python returns its version instead of - # the real runtime version. - # https://github.com/NVIDIA/cuda-python/issues/16 - cdef int major, minor - (major, minor) = nvrtc.getVersion() - version = major * 1000 + minor * 10 - ELSE: - status = cudaRuntimeGetVersion(&version) - check_status(status) - return version - - -############################################################################### -# Device and context operations -############################################################################### - -cpdef int getDevice() except? -1: - cdef int device - status = cudaGetDevice(&device) - check_status(status) - return device - -cpdef int deviceGetAttribute(int attrib, int device) except? -1: - cdef int ret - status = cudaDeviceGetAttribute(&ret, attrib, device) - check_status(status) - return ret - -cpdef getDeviceProperties(int device): - cdef DeviceProp props - cdef int status = cudaGetDeviceProperties(&props, device) - check_status(status) - - cdef dict properties = {'name': b'UNAVAILABLE'} # for RTD - - # Common properties to CUDA 9.0, 9.2, 10.x, 11.x, and HIP - IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 0: - properties = { - 'name': props.name, - 'totalGlobalMem': props.totalGlobalMem, - 'sharedMemPerBlock': props.sharedMemPerBlock, - 'regsPerBlock': props.regsPerBlock, - 'warpSize': props.warpSize, - 'maxThreadsPerBlock': props.maxThreadsPerBlock, - 'maxThreadsDim': tuple(props.maxThreadsDim), - 'maxGridSize': tuple(props.maxGridSize), - 'clockRate': props.clockRate, - 'totalConstMem': props.totalConstMem, - 'major': props.major, - 'minor': props.minor, - 'textureAlignment': props.textureAlignment, - 'texturePitchAlignment': props.texturePitchAlignment, - 'multiProcessorCount': props.multiProcessorCount, - 'kernelExecTimeoutEnabled': props.kernelExecTimeoutEnabled, - 'integrated': props.integrated, - 'canMapHostMemory': props.canMapHostMemory, - 'computeMode': props.computeMode, - 'maxTexture1D': props.maxTexture1D, - 'maxTexture2D': tuple(props.maxTexture2D), - 'maxTexture3D': tuple(props.maxTexture3D), - 'concurrentKernels': props.concurrentKernels, - 'ECCEnabled': props.ECCEnabled, - 'pciBusID': props.pciBusID, - 'pciDeviceID': props.pciDeviceID, - 'pciDomainID': props.pciDomainID, - 'tccDriver': props.tccDriver, - 'memoryClockRate': props.memoryClockRate, - 'memoryBusWidth': props.memoryBusWidth, - 'l2CacheSize': props.l2CacheSize, - 'maxThreadsPerMultiProcessor': props.maxThreadsPerMultiProcessor, - 'isMultiGpuBoard': props.isMultiGpuBoard, - 'cooperativeLaunch': props.cooperativeLaunch, - 'cooperativeMultiDeviceLaunch': props.cooperativeMultiDeviceLaunch, - } - IF CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 9020: - properties['deviceOverlap'] = props.deviceOverlap - properties['maxTexture1DMipmap'] = props.maxTexture1DMipmap - properties['maxTexture1DLinear'] = props.maxTexture1DLinear - properties['maxTexture1DLayered'] = tuple(props.maxTexture1DLayered) - properties['maxTexture2DMipmap'] = tuple(props.maxTexture2DMipmap) - 
properties['maxTexture2DLinear'] = tuple(props.maxTexture2DLinear) - properties['maxTexture2DLayered'] = tuple(props.maxTexture2DLayered) - properties['maxTexture2DGather'] = tuple(props.maxTexture2DGather) - properties['maxTexture3DAlt'] = tuple(props.maxTexture3DAlt) - properties['maxTextureCubemap'] = props.maxTextureCubemap - properties['maxTextureCubemapLayered'] = tuple( - props.maxTextureCubemapLayered) - properties['maxSurface1D'] = props.maxSurface1D - properties['maxSurface1DLayered'] = tuple(props.maxSurface1DLayered) - properties['maxSurface2D'] = tuple(props.maxSurface2D) - properties['maxSurface2DLayered'] = tuple(props.maxSurface2DLayered) - properties['maxSurface3D'] = tuple(props.maxSurface3D) - properties['maxSurfaceCubemap'] = props.maxSurfaceCubemap - properties['maxSurfaceCubemapLayered'] = tuple( - props.maxSurfaceCubemapLayered) - properties['surfaceAlignment'] = props.surfaceAlignment - properties['asyncEngineCount'] = props.asyncEngineCount - properties['unifiedAddressing'] = props.unifiedAddressing - properties['streamPrioritiesSupported'] = ( - props.streamPrioritiesSupported) - properties['globalL1CacheSupported'] = props.globalL1CacheSupported - properties['localL1CacheSupported'] = props.localL1CacheSupported - properties['sharedMemPerMultiprocessor'] = ( - props.sharedMemPerMultiprocessor) - properties['regsPerMultiprocessor'] = props.regsPerMultiprocessor - properties['managedMemory'] = props.managedMemory - properties['multiGpuBoardGroupID'] = props.multiGpuBoardGroupID - properties['hostNativeAtomicSupported'] = ( - props.hostNativeAtomicSupported) - properties['singleToDoublePrecisionPerfRatio'] = ( - props.singleToDoublePrecisionPerfRatio) - properties['pageableMemoryAccess'] = props.pageableMemoryAccess - properties['concurrentManagedAccess'] = props.concurrentManagedAccess - properties['computePreemptionSupported'] = ( - props.computePreemptionSupported) - properties['canUseHostPointerForRegisteredMem'] = ( - props.canUseHostPointerForRegisteredMem) - properties['sharedMemPerBlockOptin'] = props.sharedMemPerBlockOptin - properties['pageableMemoryAccessUsesHostPageTables'] = ( - props.pageableMemoryAccessUsesHostPageTables) - properties['directManagedMemAccessFromHost'] = ( - props.directManagedMemAccessFromHost) - if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >=10000: - properties['uuid'] = props.uuid.bytes - properties['luid'] = props.luid - properties['luidDeviceNodeMask'] = props.luidDeviceNodeMask - if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 11000: - properties['persistingL2CacheMaxSize'] = props.persistingL2CacheMaxSize - properties['maxBlocksPerMultiProcessor'] = ( - props.maxBlocksPerMultiProcessor) - properties['accessPolicyMaxWindowSize'] = ( - props.accessPolicyMaxWindowSize) - properties['reservedSharedMemPerBlock'] = ( - props.reservedSharedMemPerBlock) - IF CUPY_HIP_VERSION > 0: # HIP-only props - properties['clockInstructionRate'] = props.clockInstructionRate - properties['maxSharedMemoryPerMultiProcessor'] = ( - props.maxSharedMemoryPerMultiProcessor) - properties['hdpMemFlushCntl'] = (props.hdpMemFlushCntl) - properties['hdpRegFlushCntl'] = (props.hdpRegFlushCntl) - properties['memPitch'] = props.memPitch - properties['cooperativeMultiDeviceUnmatchedFunc'] = ( - props.cooperativeMultiDeviceUnmatchedFunc) - properties['cooperativeMultiDeviceUnmatchedGridDim'] = ( - props.cooperativeMultiDeviceUnmatchedGridDim) - properties['cooperativeMultiDeviceUnmatchedBlockDim'] = ( - props.cooperativeMultiDeviceUnmatchedBlockDim) - 
properties['cooperativeMultiDeviceUnmatchedSharedMem'] = ( - props.cooperativeMultiDeviceUnmatchedSharedMem) - properties['isLargeBar'] = props.isLargeBar - - cdef dict arch = {} # for hipDeviceArch_t - arch['hasGlobalInt32Atomics'] = props.arch.hasGlobalInt32Atomics - arch['hasGlobalFloatAtomicExch'] = props.arch.hasGlobalFloatAtomicExch - arch['hasSharedInt32Atomics'] = props.arch.hasSharedInt32Atomics - arch['hasSharedFloatAtomicExch'] = props.arch.hasSharedFloatAtomicExch - arch['hasFloatAtomicAdd'] = props.arch.hasFloatAtomicAdd - arch['hasGlobalInt64Atomics'] = props.arch.hasGlobalInt64Atomics - arch['hasSharedInt64Atomics'] = props.arch.hasSharedInt64Atomics - arch['hasDoubles'] = props.arch.hasDoubles - arch['hasWarpVote'] = props.arch.hasWarpVote - arch['hasWarpBallot'] = props.arch.hasWarpBallot - arch['hasWarpShuffle'] = props.arch.hasWarpShuffle - arch['hasFunnelShift'] = props.arch.hasFunnelShift - arch['hasThreadFenceSystem'] = props.arch.hasThreadFenceSystem - arch['hasSyncThreadsExt'] = props.arch.hasSyncThreadsExt - arch['hasSurfaceFuncs'] = props.arch.hasSurfaceFuncs - arch['has3dGrid'] = props.arch.has3dGrid - arch['hasDynamicParallelism'] = props.arch.hasDynamicParallelism - properties['arch'] = arch - IF 0 < CUPY_HIP_VERSION < 310: # gcnArchName used after ROCm 3.1+ - properties['gcnArch'] = props.gcnArch - IF CUPY_HIP_VERSION >= 310: - properties['gcnArchName'] = props.gcnArchName - properties['asicRevision'] = props.asicRevision - properties['managedMemory'] = props.managedMemory - properties['directManagedMemAccessFromHost'] = ( - props.directManagedMemAccessFromHost) - properties['concurrentManagedAccess'] = props.concurrentManagedAccess - properties['pageableMemoryAccess'] = props.pageableMemoryAccess - properties['pageableMemoryAccessUsesHostPageTables'] = ( - props.pageableMemoryAccessUsesHostPageTables) - return properties - -cpdef int deviceGetByPCIBusId(str pci_bus_id) except? -1: - # Encode the python string before passing to native code - byte_pci_bus_id = pci_bus_id.encode('ascii') - cdef const char* c_pci_bus_id = byte_pci_bus_id - - cdef int device = -1 - cdef int status - status = cudaDeviceGetByPCIBusId(&device, c_pci_bus_id) - check_status(status) - # on ROCm, it might fail silently, so we also need to check if the - # device is meaningful or not - if hip_environment and device == -1: - check_status(cudaErrorInvalidValue) - return device - -cpdef str deviceGetPCIBusId(int device): - # The PCI Bus ID string must be able to store 13 characters including the - # NULL-terminator according to the CUDA documentation. - # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html - cdef char pci_bus_id[13] - status = cudaDeviceGetPCIBusId(pci_bus_id, 13, device) - check_status(status) - return pci_bus_id.decode('ascii') - -cpdef int getDeviceCount() except? -1: - cdef int count - status = cudaGetDeviceCount(&count) - check_status(status) - return count - -cpdef setDevice(int device): - status = cudaSetDevice(device) - check_status(status) - -cpdef deviceSynchronize(): - with nogil: - status = cudaDeviceSynchronize() - check_status(status) - -cpdef int deviceCanAccessPeer(int device, int peerDevice) except? 
-1: - cdef int ret - status = cudaDeviceCanAccessPeer(&ret, device, peerDevice) - check_status(status) - return ret - -cpdef deviceEnablePeerAccess(int peerDevice): - status = cudaDeviceEnablePeerAccess(peerDevice, 0) - check_status(status) - -cpdef deviceDisablePeerAccess(int peerDevice): - status = cudaDeviceDisablePeerAccess(peerDevice) - check_status(status) - -cpdef _deviceEnsurePeerAccess(int peerDevice): - status = cudaDeviceEnablePeerAccess(peerDevice, 0) - if status == 0: - return - elif status == errorPeerAccessAlreadyEnabled: - cudaGetLastError() # clear error status - return - check_status(status) - -cpdef size_t deviceGetLimit(int limit) except? -1: - cdef size_t value - status = cudaDeviceGetLimit(&value, limit) - check_status(status) - return value - -cpdef deviceSetLimit(int limit, size_t value): - status = cudaDeviceSetLimit(limit, value) - check_status(status) - - -############################################################################### -# IPC operations -############################################################################### - -cpdef ipcCloseMemHandle(intptr_t devPtr): - status = cudaIpcCloseMemHandle(devPtr) - check_status(status) - -cpdef ipcGetEventHandle(intptr_t event): - cdef IpcEventHandle handle - status = cudaIpcGetEventHandle(&handle, event) - check_status(status) - # We need to do this due to a bug in Cython that - # cuts out the 0 bytes in an array of chars when - # constructing the python object - # resulting in different sizes assignment errors - # when recreating the struct from the python - # array of bytes - reserved = [handle.reserved[i] for i in range(64)] - return bytes(reserved) - -cpdef ipcGetMemHandle(intptr_t devPtr): - cdef IpcMemHandle handle - status = cudaIpcGetMemHandle(&handle, devPtr) - check_status(status) - # We need to do this due to a bug in Cython that - # when converting an array of chars in C to a python object - # it discards the data after the first 0 value - # resulting in a loss of data, as this is not a string - # but a buffer of bytes - reserved = [handle.reserved[i] for i in range(64)] - return bytes(reserved) - -cpdef ipcOpenEventHandle(bytes handle): - cdef driver.Event event - cdef IpcEventHandle handle_ - handle_.reserved = handle - status = cudaIpcOpenEventHandle(&event, handle_) - check_status(status) - return event - -cpdef ipcOpenMemHandle(bytes handle, - unsigned int flags=cudaIpcMemLazyEnablePeerAccess): - cdef void* devPtr - cdef IpcMemHandle handle_ - handle_.reserved = handle - status = cudaIpcOpenMemHandle(&devPtr, handle_, flags) - check_status(status) - return devPtr - - -############################################################################### -# Memory management -############################################################################### - -cpdef intptr_t malloc(size_t size) except? 0: - cdef void* ptr - with nogil: - status = cudaMalloc(&ptr, size) - check_status(status) - return ptr - -cpdef intptr_t mallocManaged( - size_t size, unsigned int flags=cudaMemAttachGlobal) except? 0: - if 0 < CUPY_HIP_VERSION < 40300000: - raise RuntimeError('Managed memory requires ROCm 4.3+') - cdef void* ptr - with nogil: - status = cudaMallocManaged(&ptr, size, flags) - check_status(status) - return ptr - -cpdef intptr_t malloc3DArray(intptr_t descPtr, size_t width, size_t height, - size_t depth, unsigned int flags=0) except? 
0: - cdef Array ptr - cdef Extent extent = make_cudaExtent(width, height, depth) - with nogil: - status = cudaMalloc3DArray(&ptr, descPtr, extent, - flags) - check_status(status) - return ptr - -cpdef intptr_t mallocArray(intptr_t descPtr, size_t width, size_t height, - unsigned int flags=0) except? 0: - cdef Array ptr - with nogil: - status = cudaMallocArray(&ptr, descPtr, width, - height, flags) - check_status(status) - return ptr - -cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 0: - cdef void* ptr - if _is_hip_environment: - raise RuntimeError('HIP does not support mallocAsync') - if runtimeGetVersion() < 11020: - raise RuntimeError('mallocAsync is supported since CUDA 11.2') - with nogil: - status = cudaMallocAsync(&ptr, size, stream) - check_status(status) - return ptr - -cpdef intptr_t mallocFromPoolAsync( - size_t size, intptr_t pool, intptr_t stream) except? 0: - cdef void* ptr - if _is_hip_environment: - raise RuntimeError('HIP does not support mallocFromPoolAsync') - if runtimeGetVersion() < 11020: - raise RuntimeError('mallocFromPoolAsync is supported since CUDA 11.2') - with nogil: - status = cudaMallocFromPoolAsync( - &ptr, size, pool, stream) - check_status(status) - return ptr - -cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 0: - cdef void* ptr - with nogil: - status = cudaHostAlloc(&ptr, size, flags) - check_status(status) - return ptr - -cpdef hostRegister(intptr_t ptr, size_t size, unsigned int flags): - with nogil: - status = cudaHostRegister(ptr, size, flags) - check_status(status) - -cpdef hostUnregister(intptr_t ptr): - with nogil: - status = cudaHostUnregister(ptr) - check_status(status) - -cpdef free(intptr_t ptr): - with nogil: - status = cudaFree(ptr) - check_status(status) - -cpdef freeHost(intptr_t ptr): - with nogil: - status = cudaFreeHost(ptr) - check_status(status) - -cpdef freeArray(intptr_t ptr): - with nogil: - status = cudaFreeArray(ptr) - check_status(status) - -cpdef freeAsync(intptr_t ptr, intptr_t stream): - if _is_hip_environment: - raise RuntimeError('HIP does not support freeAsync') - if runtimeGetVersion() < 11020: - raise RuntimeError('freeAsync is supported since CUDA 11.2') - with nogil: - status = cudaFreeAsync(ptr, stream) - check_status(status) - -cpdef memGetInfo(): - cdef size_t free, total - status = cudaMemGetInfo(&free, &total) - check_status(status) - return free, total - -cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind): - with nogil: - status = cudaMemcpy(dst, src, size, kind) - check_status(status) - -cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, - intptr_t stream): - with nogil: - status = cudaMemcpyAsync( - dst, src, size, kind, - stream) - check_status(status) - -cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, - size_t size): - with nogil: - status = cudaMemcpyPeer(dst, dstDevice, src, srcDevice, - size) - check_status(status) - -cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, - size_t size, intptr_t stream): - with nogil: - status = cudaMemcpyPeerAsync(dst, dstDevice, src, - srcDevice, size, stream) - check_status(status) - -cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind): - with nogil: - status = cudaMemcpy2D(dst, dpitch, src, spitch, width, - height, kind) - check_status(status) - -cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind, - intptr_t 
stream): - with nogil: - status = cudaMemcpy2DAsync(dst, dpitch, src, spitch, - width, height, kind, stream) - check_status(status) - -cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind): - with nogil: - status = cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, - hOffset, width, height, - kind) - check_status(status) - -cpdef memcpy2DFromArrayAsync(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind, intptr_t stream): - with nogil: - status = cudaMemcpy2DFromArrayAsync(dst, dpitch, src, - wOffset, hOffset, width, height, - kind, - stream) - check_status(status) - -cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, size_t height, - int kind): - with nogil: - status = cudaMemcpy2DToArray(dst, wOffset, hOffset, src, - spitch, width, height, kind) - check_status(status) - -cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, - size_t height, int kind, intptr_t stream): - with nogil: - status = cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, - src, spitch, width, height, - kind, - stream) - check_status(status) - -cpdef memcpy3D(intptr_t Memcpy3DParmsPtr): - with nogil: - status = cudaMemcpy3D(Memcpy3DParmsPtr) - check_status(status) - -cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream): - with nogil: - status = cudaMemcpy3DAsync(Memcpy3DParmsPtr, - stream) - check_status(status) - -cpdef memset(intptr_t ptr, int value, size_t size): - with nogil: - status = cudaMemset(ptr, value, size) - check_status(status) - -cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream): - with nogil: - status = cudaMemsetAsync(ptr, value, size, - stream) - check_status(status) - -cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, - intptr_t stream): - if 0 < CUPY_HIP_VERSION < 40300000: - raise RuntimeError('Managed memory requires ROCm 4.3+') - with nogil: - status = cudaMemPrefetchAsync(devPtr, count, dstDevice, - stream) - check_status(status) - -cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device): - if 0 < CUPY_HIP_VERSION < 40300000: - raise RuntimeError('Managed memory requires ROCm 4.3+') - with nogil: - status = cudaMemAdvise(devPtr, count, - advice, device) - check_status(status) - -cpdef PointerAttributes pointerGetAttributes(intptr_t ptr): - cdef _PointerAttributes attrs - status = cudaPointerGetAttributes(&attrs, ptr) - check_status(status) - IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 60000000: - return PointerAttributes( - attrs.device, - attrs.devicePointer, - attrs.hostPointer, - attrs.type) - ELIF 0 < CUPY_HIP_VERSION < 60000000: - return PointerAttributes( - attrs.device, - attrs.devicePointer, - attrs.hostPointer, - attrs.memoryType) - ELSE: # for RTD - return None - -cpdef intptr_t deviceGetDefaultMemPool(int device) except? 0: - '''Get the default mempool on the current device.''' - if _is_hip_environment: - raise RuntimeError('HIP does not support deviceGetDefaultMemPool') - if runtimeGetVersion() < 11020: - raise RuntimeError('deviceGetDefaultMemPool is supported since ' - 'CUDA 11.2') - cdef MemPool pool - with nogil: - status = cudaDeviceGetDefaultMemPool(&pool, device) - check_status(status) - return (pool) - -cpdef intptr_t deviceGetMemPool(int device) except? 
0: - '''Get the current mempool on the current device.''' - if _is_hip_environment: - raise RuntimeError('HIP does not support deviceGetMemPool') - if runtimeGetVersion() < 11020: - raise RuntimeError('deviceGetMemPool is supported since ' - 'CUDA 11.2') - cdef MemPool pool - with nogil: - status = cudaDeviceGetMemPool(&pool, device) - check_status(status) - return (pool) - -cpdef deviceSetMemPool(int device, intptr_t pool): - '''Set the current mempool on the current device to pool.''' - if _is_hip_environment: - raise RuntimeError('HIP does not support deviceSetMemPool') - if runtimeGetVersion() < 11020: - raise RuntimeError('deviceSetMemPool is supported since ' - 'CUDA 11.2') - with nogil: - status = cudaDeviceSetMemPool(device, pool) - check_status(status) - -cpdef intptr_t memPoolCreate(MemPoolProps props) except? 0: - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolCreate') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolCreate is supported since CUDA 11.2') - - cdef MemPool pool - cdef _MemPoolProps props_c - c_memset(&props_c, 0, sizeof(_MemPoolProps)) - props_c.allocType = props.allocType - props_c.handleTypes = props.handleType - props_c.location.type = props.locationType - props_c.location.id = props.devId - - with nogil: - status = cudaMemPoolCreate(&pool, &props_c) - check_status(status) - return pool - -cpdef memPoolDestroy(intptr_t pool): - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolDestroy') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolDestroy is supported since CUDA 11.2') - with nogil: - status = cudaMemPoolDestroy(pool) - check_status(status) - -cpdef memPoolTrimTo(intptr_t pool, size_t size): - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolTrimTo') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolTrimTo is supported since CUDA 11.2') - with nogil: - status = cudaMemPoolTrimTo(pool, size) - check_status(status) - -cpdef memPoolGetAttribute(intptr_t pool, int attr): - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolGetAttribute') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolGetAttribute is supported since CUDA 11.2') - cdef int val1 - cdef uint64_t val2 - cdef void* out - # TODO(leofang): check this hack when more cudaMemPoolAttr are added! - out = (&val1) if attr <= 0x3 else (&val2) - with nogil: - status = cudaMemPoolGetAttribute(pool, attr, out) - check_status(status) - # TODO(leofang): check this hack when more cudaMemPoolAttr are added! - # cast to Python int regardless of C types - return val1 if attr <= 0x3 else val2 - -cpdef memPoolSetAttribute(intptr_t pool, int attr, object value): - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolSetAttribute') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolSetAttribute is supported since CUDA 11.2') - cdef int val1 - cdef uint64_t val2 - cdef void* out - # TODO(leofang): check this hack when more cudaMemPoolAttr are added! - if attr <= 0x3: - val1 = value - out = (&val1) - else: - val2 = value - out = (&val2) - with nogil: - status = cudaMemPoolSetAttribute(pool, attr, out) - check_status(status) - - -############################################################################### -# Stream and Event -############################################################################### - -cpdef intptr_t streamCreate() except? 
0: - cdef driver.Stream stream - status = cudaStreamCreate(&stream) - check_status(status) - return stream - - -cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 0: - cdef driver.Stream stream - status = cudaStreamCreateWithFlags(&stream, flags) - check_status(status) - return stream - - -cpdef streamDestroy(intptr_t stream): - status = cudaStreamDestroy(stream) - check_status(status) - - -cpdef streamSynchronize(intptr_t stream): - with nogil: - status = cudaStreamSynchronize(stream) - check_status(status) - - -cdef _streamCallbackFunc(driver.Stream hStream, int status, - void* func_arg) with gil: - obj = func_arg - func, arg = obj - func(hStream, status, arg) - cpython.Py_DECREF(obj) - - -cdef _HostFnFunc(void* func_arg) with gil: - obj = func_arg - func, arg = obj - func(arg) - cpython.Py_DECREF(obj) - - -cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, - unsigned int flags=0): - if _is_hip_environment and stream == 0: - raise RuntimeError('HIP does not allow adding callbacks to the ' - 'default (null) stream') - func_arg = (callback, arg) - cpython.Py_INCREF(func_arg) - with nogil: - status = cudaStreamAddCallback( - stream, _streamCallbackFunc, - func_arg, flags) - check_status(status) - - -cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg): - if _is_hip_environment: - raise RuntimeError('This feature is not supported on HIP') - - func_arg = (callback, arg) - cpython.Py_INCREF(func_arg) - with nogil: - status = cudaLaunchHostFunc( - stream, _HostFnFunc, - func_arg) - check_status(status) - - -cpdef streamQuery(intptr_t stream): - return cudaStreamQuery(stream) - - -cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=0): - with nogil: - status = cudaStreamWaitEvent(stream, - event, flags) - check_status(status) - - -cpdef streamBeginCapture(intptr_t stream, int mode=streamCaptureModeRelaxed): - if _is_hip_environment: - raise RuntimeError('streamBeginCapture is not supported in ROCm') - # TODO(leofang): check and raise if stream == 0? - with nogil: - status = cudaStreamBeginCapture(stream, - mode) - check_status(status) - - -cpdef intptr_t streamEndCapture(intptr_t stream) except? 0: - # TODO(leofang): check and raise if stream == 0? - cdef Graph g - if _is_hip_environment: - raise RuntimeError('streamEndCapture is not supported in ROCm') - with nogil: - status = cudaStreamEndCapture(stream, &g) - check_status(status) - return g - - -cpdef bint streamIsCapturing(intptr_t stream) except*: - cdef StreamCaptureStatus s - if _is_hip_environment: - raise RuntimeError('streamIsCapturing is not supported in ROCm') - with nogil: - status = cudaStreamIsCapturing(stream, &s) - check_status(status) # cudaErrorStreamCaptureImplicit could be raised here - if s == streamCaptureStatusInvalidated: - raise RuntimeError('the stream was capturing, but an error has ' - 'invalidated the capture sequence') - return s - - -cpdef intptr_t eventCreate() except? 0: - cdef driver.Event event - status = cudaEventCreate(&event) - check_status(status) - return event - -cpdef intptr_t eventCreateWithFlags(unsigned int flags) except? 0: - cdef driver.Event event - status = cudaEventCreateWithFlags(&event, flags) - check_status(status) - return event - - -cpdef eventDestroy(intptr_t event): - status = cudaEventDestroy(event) - check_status(status) - - -cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 
0: - cdef float ms - status = cudaEventElapsedTime(&ms, start, end) - check_status(status) - return ms - - -cpdef eventQuery(intptr_t event): - return cudaEventQuery(event) - - -cpdef eventRecord(intptr_t event, intptr_t stream): - status = cudaEventRecord(event, stream) - check_status(status) - - -cpdef eventSynchronize(intptr_t event): - with nogil: - status = cudaEventSynchronize(event) - check_status(status) - - -############################################################################## -# util -############################################################################## - -cdef _ensure_context(): - """Ensure that CUcontext bound to the calling host thread exists. - - See discussion on https://github.com/cupy/cupy/issues/72 for details. - """ - tls = _ThreadLocal.get() - cdef int dev = getDevice() - if not tls.context_initialized[dev]: - # Call Runtime API to establish context on this host thread. - memGetInfo() - tls.context_initialized[dev] = True - - -############################################################################## -# Texture -############################################################################## - -cpdef uintmax_t createTextureObject( - intptr_t ResDescPtr, intptr_t TexDescPtr) except? 0: - cdef uintmax_t texobj = 0 - with nogil: - status = cudaCreateTextureObject((&texobj), - ResDescPtr, - TexDescPtr, - NULL) - check_status(status) - return texobj - -cpdef destroyTextureObject(uintmax_t texObject): - with nogil: - status = cudaDestroyTextureObject(texObject) - check_status(status) - -cpdef uintmax_t createSurfaceObject(intptr_t ResDescPtr) except? 0: - cdef uintmax_t surfobj = 0 - with nogil: - status = cudaCreateSurfaceObject((&surfobj), - ResDescPtr) - check_status(status) - return surfobj - -cpdef destroySurfaceObject(uintmax_t surfObject): - with nogil: - status = cudaDestroySurfaceObject(surfObject) - check_status(status) - -cdef ChannelFormatDesc getChannelDesc(intptr_t array) except*: - cdef ChannelFormatDesc desc - with nogil: - status = cudaGetChannelDesc(&desc, array) - check_status(status) - return desc - -cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t obj) except*: - cdef ResourceDesc desc - with nogil: - status = cudaGetTextureObjectResourceDesc(&desc, obj) - check_status(status) - return desc - -cdef TextureDesc getTextureObjectTextureDesc(uintmax_t obj) except*: - cdef TextureDesc desc - with nogil: - status = cudaGetTextureObjectTextureDesc(&desc, obj) - check_status(status) - return desc - -cdef Extent make_Extent(size_t w, size_t h, size_t d) except*: - return make_cudaExtent(w, h, d) - -cdef Pos make_Pos(size_t x, size_t y, size_t z) except*: - return make_cudaPos(x, y, z) - -cdef PitchedPtr make_PitchedPtr( - intptr_t d, size_t p, size_t xsz, size_t ysz) except*: - return make_cudaPitchedPtr(d, p, xsz, ysz) - - -############################################################################## -# Graph -############################################################################## - -cpdef graphDestroy(intptr_t graph): - with nogil: - status = cudaGraphDestroy(graph) - check_status(status) - -cpdef graphExecDestroy(intptr_t graphExec): - with nogil: - status = cudaGraphExecDestroy(graphExec) - check_status(status) - -cpdef intptr_t graphInstantiate(intptr_t graph) except? 0: - # TODO(leofang): support reporting error log? 
- cdef GraphExec ge - with nogil: - status = cudaGraphInstantiate((&ge), graph, - NULL, NULL, 0) - check_status(status) - return ge - -cpdef graphLaunch(intptr_t graphExec, intptr_t stream): - with nogil: - status = cudaGraphLaunch((graphExec), stream) - check_status(status) - -cpdef graphUpload(intptr_t graphExec, intptr_t stream): - if runtimeGetVersion() < 11010: - raise RuntimeError('graphUpload is supported since CUDA 11.1+') - with nogil: - status = cudaGraphUpload((graphExec), stream) - check_status(status) + cpdef getDeviceProperties(int device): + cdef DeviceProp props + cdef int status = cudaGetDeviceProperties(&props, device) + check_status(status) + + cdef dict properties = {'name': b'UNAVAILABLE'} # for RTD + + # Common properties to CUDA 9.0, 9.2, 10.x, 11.x, and HIP + IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 0: + properties = { + 'name': props.name, + 'totalGlobalMem': props.totalGlobalMem, + 'sharedMemPerBlock': props.sharedMemPerBlock, + 'regsPerBlock': props.regsPerBlock, + 'warpSize': props.warpSize, + 'maxThreadsPerBlock': props.maxThreadsPerBlock, + 'maxThreadsDim': tuple(props.maxThreadsDim), + 'maxGridSize': tuple(props.maxGridSize), + 'clockRate': props.clockRate, + 'totalConstMem': props.totalConstMem, + 'major': props.major, + 'minor': props.minor, + 'textureAlignment': props.textureAlignment, + 'texturePitchAlignment': props.texturePitchAlignment, + 'multiProcessorCount': props.multiProcessorCount, + 'kernelExecTimeoutEnabled': props.kernelExecTimeoutEnabled, + 'integrated': props.integrated, + 'canMapHostMemory': props.canMapHostMemory, + 'computeMode': props.computeMode, + 'maxTexture1D': props.maxTexture1D, + 'maxTexture2D': tuple(props.maxTexture2D), + 'maxTexture3D': tuple(props.maxTexture3D), + 'concurrentKernels': props.concurrentKernels, + 'ECCEnabled': props.ECCEnabled, + 'pciBusID': props.pciBusID, + 'pciDeviceID': props.pciDeviceID, + 'pciDomainID': props.pciDomainID, + 'tccDriver': props.tccDriver, + 'memoryClockRate': props.memoryClockRate, + 'memoryBusWidth': props.memoryBusWidth, + 'l2CacheSize': props.l2CacheSize, + 'maxThreadsPerMultiProcessor': props.maxThreadsPerMultiProcessor, + 'isMultiGpuBoard': props.isMultiGpuBoard, + 'cooperativeLaunch': props.cooperativeLaunch, + 'cooperativeMultiDeviceLaunch': props.cooperativeMultiDeviceLaunch, + } + IF CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 9020: + properties['deviceOverlap'] = props.deviceOverlap + properties['maxTexture1DMipmap'] = props.maxTexture1DMipmap + properties['maxTexture1DLinear'] = props.maxTexture1DLinear + properties['maxTexture1DLayered'] = tuple(props.maxTexture1DLayered) + properties['maxTexture2DMipmap'] = tuple(props.maxTexture2DMipmap) + properties['maxTexture2DLinear'] = tuple(props.maxTexture2DLinear) + properties['maxTexture2DLayered'] = tuple(props.maxTexture2DLayered) + properties['maxTexture2DGather'] = tuple(props.maxTexture2DGather) + properties['maxTexture3DAlt'] = tuple(props.maxTexture3DAlt) + properties['maxTextureCubemap'] = props.maxTextureCubemap + properties['maxTextureCubemapLayered'] = tuple( + props.maxTextureCubemapLayered) + properties['maxSurface1D'] = props.maxSurface1D + properties['maxSurface1DLayered'] = tuple(props.maxSurface1DLayered) + properties['maxSurface2D'] = tuple(props.maxSurface2D) + properties['maxSurface2DLayered'] = tuple(props.maxSurface2DLayered) + properties['maxSurface3D'] = tuple(props.maxSurface3D) + properties['maxSurfaceCubemap'] = props.maxSurfaceCubemap + properties['maxSurfaceCubemapLayered'] = tuple( + 
props.maxSurfaceCubemapLayered) + properties['surfaceAlignment'] = props.surfaceAlignment + properties['asyncEngineCount'] = props.asyncEngineCount + properties['unifiedAddressing'] = props.unifiedAddressing + properties['streamPrioritiesSupported'] = ( + props.streamPrioritiesSupported) + properties['globalL1CacheSupported'] = props.globalL1CacheSupported + properties['localL1CacheSupported'] = props.localL1CacheSupported + properties['sharedMemPerMultiprocessor'] = ( + props.sharedMemPerMultiprocessor) + properties['regsPerMultiprocessor'] = props.regsPerMultiprocessor + properties['managedMemory'] = props.managedMemory + properties['multiGpuBoardGroupID'] = props.multiGpuBoardGroupID + properties['hostNativeAtomicSupported'] = ( + props.hostNativeAtomicSupported) + properties['singleToDoublePrecisionPerfRatio'] = ( + props.singleToDoublePrecisionPerfRatio) + properties['pageableMemoryAccess'] = props.pageableMemoryAccess + properties['concurrentManagedAccess'] = props.concurrentManagedAccess + properties['computePreemptionSupported'] = ( + props.computePreemptionSupported) + properties['canUseHostPointerForRegisteredMem'] = ( + props.canUseHostPointerForRegisteredMem) + properties['sharedMemPerBlockOptin'] = props.sharedMemPerBlockOptin + properties['pageableMemoryAccessUsesHostPageTables'] = ( + props.pageableMemoryAccessUsesHostPageTables) + properties['directManagedMemAccessFromHost'] = ( + props.directManagedMemAccessFromHost) + if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >=10000: + properties['uuid'] = props.uuid.bytes + properties['luid'] = props.luid + properties['luidDeviceNodeMask'] = props.luidDeviceNodeMask + if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 11000: + properties['persistingL2CacheMaxSize'] = props.persistingL2CacheMaxSize + properties['maxBlocksPerMultiProcessor'] = ( + props.maxBlocksPerMultiProcessor) + properties['accessPolicyMaxWindowSize'] = ( + props.accessPolicyMaxWindowSize) + properties['reservedSharedMemPerBlock'] = ( + props.reservedSharedMemPerBlock) + IF CUPY_HIP_VERSION > 0: # HIP-only props + properties['clockInstructionRate'] = props.clockInstructionRate + properties['maxSharedMemoryPerMultiProcessor'] = ( + props.maxSharedMemoryPerMultiProcessor) + properties['hdpMemFlushCntl'] = (props.hdpMemFlushCntl) + properties['hdpRegFlushCntl'] = (props.hdpRegFlushCntl) + properties['memPitch'] = props.memPitch + properties['cooperativeMultiDeviceUnmatchedFunc'] = ( + props.cooperativeMultiDeviceUnmatchedFunc) + properties['cooperativeMultiDeviceUnmatchedGridDim'] = ( + props.cooperativeMultiDeviceUnmatchedGridDim) + properties['cooperativeMultiDeviceUnmatchedBlockDim'] = ( + props.cooperativeMultiDeviceUnmatchedBlockDim) + properties['cooperativeMultiDeviceUnmatchedSharedMem'] = ( + props.cooperativeMultiDeviceUnmatchedSharedMem) + properties['isLargeBar'] = props.isLargeBar + + cdef dict arch = {} # for hipDeviceArch_t + arch['hasGlobalInt32Atomics'] = props.arch.hasGlobalInt32Atomics + arch['hasGlobalFloatAtomicExch'] = props.arch.hasGlobalFloatAtomicExch + arch['hasSharedInt32Atomics'] = props.arch.hasSharedInt32Atomics + arch['hasSharedFloatAtomicExch'] = props.arch.hasSharedFloatAtomicExch + arch['hasFloatAtomicAdd'] = props.arch.hasFloatAtomicAdd + arch['hasGlobalInt64Atomics'] = props.arch.hasGlobalInt64Atomics + arch['hasSharedInt64Atomics'] = props.arch.hasSharedInt64Atomics + arch['hasDoubles'] = props.arch.hasDoubles + arch['hasWarpVote'] = props.arch.hasWarpVote + arch['hasWarpBallot'] = props.arch.hasWarpBallot + arch['hasWarpShuffle'] = 
props.arch.hasWarpShuffle + arch['hasFunnelShift'] = props.arch.hasFunnelShift + arch['hasThreadFenceSystem'] = props.arch.hasThreadFenceSystem + arch['hasSyncThreadsExt'] = props.arch.hasSyncThreadsExt + arch['hasSurfaceFuncs'] = props.arch.hasSurfaceFuncs + arch['has3dGrid'] = props.arch.has3dGrid + arch['hasDynamicParallelism'] = props.arch.hasDynamicParallelism + properties['arch'] = arch + IF 0 < CUPY_HIP_VERSION < 310: # gcnArchName used after ROCm 3.1+ + properties['gcnArch'] = props.gcnArch + IF CUPY_HIP_VERSION >= 310: + properties['gcnArchName'] = props.gcnArchName + properties['asicRevision'] = props.asicRevision + properties['managedMemory'] = props.managedMemory + properties['directManagedMemAccessFromHost'] = ( + props.directManagedMemAccessFromHost) + properties['concurrentManagedAccess'] = props.concurrentManagedAccess + properties['pageableMemoryAccess'] = props.pageableMemoryAccess + properties['pageableMemoryAccessUsesHostPageTables'] = ( + props.pageableMemoryAccessUsesHostPageTables) + return properties + + cpdef int deviceGetByPCIBusId(str pci_bus_id) except? -1: + # Encode the python string before passing to native code + byte_pci_bus_id = pci_bus_id.encode('ascii') + cdef const char* c_pci_bus_id = byte_pci_bus_id + + cdef int device = -1 + cdef int status + status = cudaDeviceGetByPCIBusId(&device, c_pci_bus_id) + check_status(status) + # on ROCm, it might fail silently, so we also need to check if the + # device is meaningful or not + if hip_environment and device == -1: + check_status(cudaErrorInvalidValue) + return device + + cpdef str deviceGetPCIBusId(int device): + # The PCI Bus ID string must be able to store 13 characters including + # the NULL-terminator according to the CUDA documentation. + # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html + cdef char pci_bus_id[13] + status = cudaDeviceGetPCIBusId(pci_bus_id, 13, device) + check_status(status) + return pci_bus_id.decode('ascii') + + cpdef int getDeviceCount() except? -1: + cdef int count + status = cudaGetDeviceCount(&count) + check_status(status) + return count + + cpdef setDevice(int device): + status = cudaSetDevice(device) + check_status(status) + + cpdef deviceSynchronize(): + with nogil: + status = cudaDeviceSynchronize() + check_status(status) + + cpdef int deviceCanAccessPeer(int device, int peerDevice) except? -1: + cdef int ret + status = cudaDeviceCanAccessPeer(&ret, device, peerDevice) + check_status(status) + return ret + + cpdef deviceEnablePeerAccess(int peerDevice): + status = cudaDeviceEnablePeerAccess(peerDevice, 0) + check_status(status) + + cpdef deviceDisablePeerAccess(int peerDevice): + status = cudaDeviceDisablePeerAccess(peerDevice) + check_status(status) + + cpdef _deviceEnsurePeerAccess(int peerDevice): + status = cudaDeviceEnablePeerAccess(peerDevice, 0) + if status == 0: + return + elif status == errorPeerAccessAlreadyEnabled: + cudaGetLastError() # clear error status + return + check_status(status) + + cpdef size_t deviceGetLimit(int limit) except? 
-1: + cdef size_t value + status = cudaDeviceGetLimit(&value, limit) + check_status(status) + return value + + cpdef deviceSetLimit(int limit, size_t value): + status = cudaDeviceSetLimit(limit, value) + check_status(status) + + + ########################################################################### + # IPC operations + ########################################################################### + + cpdef ipcCloseMemHandle(intptr_t devPtr): + status = cudaIpcCloseMemHandle(devPtr) + check_status(status) + + cpdef ipcGetEventHandle(intptr_t event): + cdef IpcEventHandle handle + status = cudaIpcGetEventHandle(&handle, event) + check_status(status) + # We need to do this due to a bug in Cython that + # cuts out the 0 bytes in an array of chars when + # constructing the python object + # resulting in different sizes assignment errors + # when recreating the struct from the python + # array of bytes + reserved = [handle.reserved[i] for i in range(64)] + return bytes(reserved) + + cpdef ipcGetMemHandle(intptr_t devPtr): + cdef IpcMemHandle handle + status = cudaIpcGetMemHandle(&handle, devPtr) + check_status(status) + # We need to do this due to a bug in Cython that + # when converting an array of chars in C to a python object + # it discards the data after the first 0 value + # resulting in a loss of data, as this is not a string + # but a buffer of bytes + reserved = [handle.reserved[i] for i in range(64)] + return bytes(reserved) + + cpdef ipcOpenEventHandle(bytes handle): + cdef driver.Event event + cdef IpcEventHandle handle_ + handle_.reserved = handle + status = cudaIpcOpenEventHandle(&event, handle_) + check_status(status) + return event + + cpdef ipcOpenMemHandle(bytes handle, + unsigned int flags=cudaIpcMemLazyEnablePeerAccess): + cdef void* devPtr + cdef IpcMemHandle handle_ + handle_.reserved = handle + status = cudaIpcOpenMemHandle(&devPtr, handle_, flags) + check_status(status) + return devPtr + + + ########################################################################### + # Memory management + ########################################################################### + + cpdef intptr_t malloc(size_t size) except? 0: + cdef void* ptr + with nogil: + status = cudaMalloc(&ptr, size) + check_status(status) + return ptr + + cpdef intptr_t mallocManaged( + size_t size, unsigned int flags=cudaMemAttachGlobal) except? 0: + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('Managed memory requires ROCm 4.3+') + cdef void* ptr + with nogil: + status = cudaMallocManaged(&ptr, size, flags) + check_status(status) + return ptr + + cpdef intptr_t malloc3DArray(intptr_t descPtr, size_t width, size_t height, + size_t depth, unsigned int flags=0) except? 0: + cdef Array ptr + cdef Extent extent = make_cudaExtent(width, height, depth) + with nogil: + status = cudaMalloc3DArray(&ptr, descPtr, + extent, flags) + check_status(status) + return ptr + + cpdef intptr_t mallocArray(intptr_t descPtr, size_t width, size_t height, + unsigned int flags=0) except? 0: + cdef Array ptr + with nogil: + status = cudaMallocArray(&ptr, descPtr, width, + height, flags) + check_status(status) + return ptr + + cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 
0: + cdef void* ptr + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('mallocAsync requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('mallocAsync is supported since CUDA 11.2') + with nogil: + status = cudaMallocAsync(&ptr, size, stream) + check_status(status) + return ptr + + cpdef intptr_t mallocFromPoolAsync( + size_t size, intptr_t pool, intptr_t stream) except? 0: + cdef void* ptr + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('mallocFromPoolAsync require ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('mallocFromPoolAsync is supported ' + 'since CUDA 11.2') + with nogil: + status = cudaMallocFromPoolAsync( + &ptr, size, pool, stream) + check_status(status) + return ptr + + cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 0: + cdef void* ptr + with nogil: + status = cudaHostAlloc(&ptr, size, flags) + check_status(status) + return ptr + + cpdef hostRegister(intptr_t ptr, size_t size, unsigned int flags): + with nogil: + status = cudaHostRegister(ptr, size, flags) + check_status(status) + + cpdef hostUnregister(intptr_t ptr): + with nogil: + status = cudaHostUnregister(ptr) + check_status(status) + + cpdef free(intptr_t ptr): + with nogil: + status = cudaFree(ptr) + check_status(status) + + cpdef freeHost(intptr_t ptr): + with nogil: + status = cudaFreeHost(ptr) + check_status(status) + + cpdef freeArray(intptr_t ptr): + with nogil: + status = cudaFreeArray(ptr) + check_status(status) + + cpdef freeAsync(intptr_t ptr, intptr_t stream): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('freeAsync requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('freeAsync is supported since CUDA 11.2') + with nogil: + status = cudaFreeAsync(ptr, stream) + check_status(status) + + cpdef memGetInfo(): + cdef size_t free, total + status = cudaMemGetInfo(&free, &total) + check_status(status) + return free, total + + cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind): + with nogil: + status = cudaMemcpy(dst, src, size, kind) + check_status(status) + + cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, + intptr_t stream): + with nogil: + status = cudaMemcpyAsync( + dst, src, size, kind, + stream) + check_status(status) + + cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, + size_t size): + with nogil: + status = cudaMemcpyPeer(dst, dstDevice, src, + srcDevice, size) + check_status(status) + + cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, intptr_t src, + int srcDevice, size_t size, intptr_t stream): + with nogil: + status = cudaMemcpyPeerAsync(dst, dstDevice, src, + srcDevice, size, + stream) + check_status(status) + + cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, + size_t width, size_t height, MemoryKind kind): + with nogil: + status = cudaMemcpy2D(dst, dpitch, src, spitch, + width, height, kind) + check_status(status) + + cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, + size_t spitch, size_t width, size_t height, + MemoryKind kind, intptr_t stream): + with nogil: + status = cudaMemcpy2DAsync(dst, dpitch, src, spitch, + width, height, kind, + stream) + check_status(status) + + cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind): + with nogil: + status = cudaMemcpy2DFromArray(dst, dpitch, src, + wOffset, hOffset, width, height, + kind) + check_status(status) + + cpdef memcpy2DFromArrayAsync(intptr_t dst, 
size_t dpitch, intptr_t src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind, intptr_t stream): + with nogil: + status = cudaMemcpy2DFromArrayAsync(dst, dpitch, src, + wOffset, hOffset, width, + height, kind, + stream) + check_status(status) + + cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, + intptr_t src, size_t spitch, size_t width, + size_t height, int kind): + with nogil: + status = cudaMemcpy2DToArray(dst, wOffset, hOffset, + src, spitch, width, height, + kind) + check_status(status) + + cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, + intptr_t src, size_t spitch, size_t width, + size_t height, int kind, intptr_t stream): + with nogil: + status = cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, + src, spitch, width, + height, kind, + stream) + check_status(status) + + cpdef memcpy3D(intptr_t Memcpy3DParmsPtr): + with nogil: + status = cudaMemcpy3D(Memcpy3DParmsPtr) + check_status(status) + + cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream): + with nogil: + status = cudaMemcpy3DAsync(Memcpy3DParmsPtr, + stream) + check_status(status) + + cpdef memset(intptr_t ptr, int value, size_t size): + with nogil: + status = cudaMemset(ptr, value, size) + check_status(status) + + cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream): + with nogil: + status = cudaMemsetAsync(ptr, value, size, + stream) + check_status(status) + + cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, + intptr_t stream): + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('Managed memory requires ROCm 4.3+') + with nogil: + status = cudaMemPrefetchAsync(devPtr, count, dstDevice, + stream) + check_status(status) + + cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device): + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('Managed memory requires ROCm 4.3+') + with nogil: + status = cudaMemAdvise(devPtr, count, + advice, device) + check_status(status) + + cpdef PointerAttributes pointerGetAttributes(intptr_t ptr): + cdef _PointerAttributes attrs + status = cudaPointerGetAttributes(&attrs, ptr) + check_status(status) + IF CUPY_HIP_VERSION >= 60000000: + if attrs.type == 0: # hipMemoryTypeHost + attrs.type = 1 # cudaMemoryTypeHost + elif attrs.type == 1: # hipMemoryTypeDevice + attrs.type = 2 # cudaMemoryTypeDevice + ELIF CUPY_HIP_VERSION > 0: + if attrs.memoryType == 0: # hipMemoryTypeHost + attrs.memoryType = 1 # cudaMemoryTypeHost + elif attrs.memoryType == 1: # hipMemoryTypeDevice + attrs.memoryType = 2 # cudaMemoryTypeDevice + IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 60000000: + return PointerAttributes( + attrs.device, + attrs.devicePointer, + attrs.hostPointer, + attrs.type) + ELIF 0 < CUPY_HIP_VERSION < 60000000: + return PointerAttributes( + attrs.device, + attrs.devicePointer, + attrs.hostPointer, + attrs.memoryType) + ELSE: # for RTD + return None + + cpdef intptr_t deviceGetDefaultMemPool(int device) except? 0: + '''Get the default mempool on the current device.''' + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('deviceGetDefaultMemPool requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('deviceGetDefaultMemPool is supported since ' + 'CUDA 11.2') + cdef MemPool pool + with nogil: + status = cudaDeviceGetDefaultMemPool(&pool, device) + check_status(status) + return (pool) + + cpdef intptr_t deviceGetMemPool(int device) except? 
0: + '''Get the current mempool on the current device.''' + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('deviceGetMemPool requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('deviceGetMemPool is supported since ' + 'CUDA 11.2') + cdef MemPool pool + with nogil: + status = cudaDeviceGetMemPool(&pool, device) + check_status(status) + return (pool) + + cpdef deviceSetMemPool(int device, intptr_t pool): + '''Set the current mempool on the current device to pool.''' + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('deviceSetMemPool requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('deviceSetMemPool is supported since ' + 'CUDA 11.2') + with nogil: + status = cudaDeviceSetMemPool(device, pool) + check_status(status) + + cpdef intptr_t memPoolCreate(MemPoolProps props) except? 0: + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolCreate requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolCreate is supported since CUDA 11.2') + + cdef MemPool pool + cdef _MemPoolProps props_c + c_memset(&props_c, 0, sizeof(_MemPoolProps)) + props_c.allocType = props.allocType + props_c.handleTypes = props.handleType + props_c.location.type = props.locationType + props_c.location.id = props.devId + + with nogil: + status = cudaMemPoolCreate(&pool, &props_c) + check_status(status) + return pool + + cpdef memPoolDestroy(intptr_t pool): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolDestroy requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolDestroy is supported since CUDA 11.2') + with nogil: + status = cudaMemPoolDestroy(pool) + check_status(status) + + cpdef memPoolTrimTo(intptr_t pool, size_t size): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolTrimTo requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolTrimTo is supported since CUDA 11.2') + with nogil: + status = cudaMemPoolTrimTo(pool, size) + check_status(status) + + cpdef memPoolGetAttribute(intptr_t pool, int attr): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolGetAttribute requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolGetAttribute is supported ' + 'since CUDA 11.2') + cdef int val1 + cdef uint64_t val2 + cdef void* out + # TODO(leofang): check this hack when more cudaMemPoolAttr are added! + out = (&val1) if attr <= 0x3 else (&val2) + with nogil: + status = cudaMemPoolGetAttribute(pool, attr, + out) + check_status(status) + # TODO(leofang): check this hack when more cudaMemPoolAttr are added! + # cast to Python int regardless of C types + return val1 if attr <= 0x3 else val2 + + cpdef memPoolSetAttribute(intptr_t pool, int attr, object value): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolSetAttribute requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolSetAttribute is supported ' + 'since CUDA 11.2') + cdef int val1 + cdef uint64_t val2 + cdef void* out + # TODO(leofang): check this hack when more cudaMemPoolAttr are added! + if attr <= 0x3: + val1 = value + out = (&val1) + else: + val2 = value + out = (&val2) + with nogil: + status = cudaMemPoolSetAttribute(pool, attr, + out) + check_status(status) + + + ########################################################################### + # Stream and Event + ########################################################################### + + cpdef intptr_t streamCreate() except? 
0: + cdef driver.Stream stream + status = cudaStreamCreate(&stream) + check_status(status) + return stream + + + cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 0: + cdef driver.Stream stream + status = cudaStreamCreateWithFlags(&stream, flags) + check_status(status) + return stream + + + cpdef streamDestroy(intptr_t stream): + status = cudaStreamDestroy(stream) + check_status(status) + + + cpdef streamSynchronize(intptr_t stream): + with nogil: + status = cudaStreamSynchronize(stream) + check_status(status) + + + cdef _streamCallbackFunc(driver.Stream hStream, int status, + void* func_arg) with gil: + obj = func_arg + func, arg = obj + func(hStream, status, arg) + cpython.Py_DECREF(obj) + + + cdef _HostFnFunc(void* func_arg) with gil: + obj = func_arg + func, arg = obj + func(arg) + cpython.Py_DECREF(obj) + + + cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, + unsigned int flags=0): + if _is_hip_environment and stream == 0: + raise RuntimeError('HIP does not allow adding callbacks to the ' + 'default (null) stream') + func_arg = (callback, arg) + cpython.Py_INCREF(func_arg) + with nogil: + status = cudaStreamAddCallback( + stream, _streamCallbackFunc, + func_arg, flags) + check_status(status) + + + cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('This feature is supported on HIP since ROCm 5.2') + + func_arg = (callback, arg) + cpython.Py_INCREF(func_arg) + with nogil: + status = cudaLaunchHostFunc( + stream, _HostFnFunc, + func_arg) + check_status(status) + + + cpdef streamQuery(intptr_t stream): + return cudaStreamQuery(stream) + + + cpdef streamWaitEvent(intptr_t stream, intptr_t event, + unsigned int flags=0): + with nogil: + status = cudaStreamWaitEvent(stream, + event, flags) + check_status(status) + + + cpdef streamBeginCapture(intptr_t stream, + int mode=streamCaptureModeRelaxed): + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('streamBeginCapture is not supported in ROCm') + # TODO(leofang): check and raise if stream == 0? + with nogil: + status = cudaStreamBeginCapture(stream, + mode) + check_status(status) + + + cpdef intptr_t streamEndCapture(intptr_t stream) except? 0: + # TODO(leofang): check and raise if stream == 0? + cdef Graph g + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('streamEndCapture is not supported in ROCm') + with nogil: + status = cudaStreamEndCapture(stream, &g) + check_status(status) + return g + + + cpdef bint streamIsCapturing(intptr_t stream) except*: + cdef StreamCaptureStatus s + if 0 < CUPY_HIP_VERSION < 50000000: + raise RuntimeError('streamIsCapturing is not supported in ROCm') + with nogil: + status = cudaStreamIsCapturing(stream, &s) + check_status(status) # cudaErrorStreamCaptureImplicit could be + # raised here + if s == streamCaptureStatusInvalidated: + raise RuntimeError('the stream was capturing, but an error has ' + 'invalidated the capture sequence') + return s + + + cpdef intptr_t eventCreate() except? 0: + cdef driver.Event event + status = cudaEventCreate(&event) + check_status(status) + return event + + cpdef intptr_t eventCreateWithFlags(unsigned int flags) except? 0: + cdef driver.Event event + status = cudaEventCreateWithFlags(&event, flags) + check_status(status) + return event + + + cpdef eventDestroy(intptr_t event): + status = cudaEventDestroy(event) + check_status(status) + + + cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 
0: + cdef float ms + status = cudaEventElapsedTime(&ms, start, + end) + check_status(status) + return ms + + + cpdef eventQuery(intptr_t event): + return cudaEventQuery(event) + + + cpdef eventRecord(intptr_t event, intptr_t stream): + status = cudaEventRecord(event, stream) + check_status(status) + + + cpdef eventSynchronize(intptr_t event): + with nogil: + status = cudaEventSynchronize(event) + check_status(status) + + + ########################################################################### + # util + ########################################################################### + + cdef _ensure_context(): + """Ensure that CUcontext bound to the calling host thread exists. + + See discussion on https://github.com/cupy/cupy/issues/72 for details. + """ + tls = _ThreadLocal.get() + cdef int dev = getDevice() + if not tls.context_initialized[dev]: + # Call Runtime API to establish context on this host thread. + memGetInfo() + tls.context_initialized[dev] = True + + + ########################################################################### + # Texture + ########################################################################### + + cpdef uintmax_t createTextureObject( + intptr_t ResDescPtr, intptr_t TexDescPtr) except? 0: + cdef uintmax_t texobj = 0 + with nogil: + status = cudaCreateTextureObject((&texobj), + ResDescPtr, + TexDescPtr, + NULL) + check_status(status) + return texobj + + cpdef destroyTextureObject(uintmax_t texObject): + with nogil: + status = cudaDestroyTextureObject(texObject) + check_status(status) + + cpdef uintmax_t createSurfaceObject(intptr_t ResDescPtr) except? 0: + cdef uintmax_t surfobj = 0 + with nogil: + status = cudaCreateSurfaceObject((&surfobj), + ResDescPtr) + check_status(status) + return surfobj + + cpdef destroySurfaceObject(uintmax_t surfObject): + with nogil: + status = cudaDestroySurfaceObject(surfObject) + check_status(status) + + cdef ChannelFormatDesc getChannelDesc(intptr_t array) except*: + cdef ChannelFormatDesc desc + with nogil: + status = cudaGetChannelDesc(&desc, array) + check_status(status) + return desc + + cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t obj) except*: + cdef ResourceDesc desc + with nogil: + status = cudaGetTextureObjectResourceDesc(&desc, + obj) + check_status(status) + return desc + + cdef TextureDesc getTextureObjectTextureDesc(uintmax_t obj) except*: + cdef TextureDesc desc + with nogil: + status = cudaGetTextureObjectTextureDesc(&desc, obj) + check_status(status) + return desc + + cdef Extent make_Extent(size_t w, size_t h, size_t d) except*: + return make_cudaExtent(w, h, d) + + cdef Pos make_Pos(size_t x, size_t y, size_t z) except*: + return make_cudaPos(x, y, z) + + cdef PitchedPtr make_PitchedPtr( + intptr_t d, size_t p, size_t xsz, size_t ysz) except*: + return make_cudaPitchedPtr(d, p, xsz, ysz) + + + ########################################################################### + # Graph + ########################################################################### + + cpdef graphDestroy(intptr_t graph): + with nogil: + status = cudaGraphDestroy(graph) + check_status(status) + + cpdef graphExecDestroy(intptr_t graphExec): + with nogil: + status = cudaGraphExecDestroy(graphExec) + check_status(status) + + cpdef intptr_t graphInstantiate(intptr_t graph) except? 0: + # TODO(leofang): support reporting error log? 
+ cdef GraphExec ge + with nogil: + status = cudaGraphInstantiate((&ge), graph, + NULL, NULL, 0) + check_status(status) + return ge + + cpdef graphLaunch(intptr_t graphExec, intptr_t stream): + with nogil: + status = cudaGraphLaunch((graphExec), + stream) + check_status(status) + + cpdef graphUpload(intptr_t graphExec, intptr_t stream): + if runtimeGetVersion() < 11010: + raise RuntimeError('graphUpload is supported since CUDA 11.1+') + with nogil: + status = cudaGraphUpload((graphExec), + stream) + check_status(status) diff --git a/cupy_backends/hip/cupy_hip_runtime.h b/cupy_backends/hip/cupy_hip_runtime.h index 0d1eb208528..ef43172e23a 100644 --- a/cupy_backends/hip/cupy_hip_runtime.h +++ b/cupy_backends/hip/cupy_hip_runtime.h @@ -8,342 +8,6 @@ extern "C" { bool hip_environment = true; -// Error handling -const char* cudaGetErrorName(cudaError_t hipError) { - return hipGetErrorName(hipError); -} - -const char* cudaGetErrorString(cudaError_t hipError) { - return hipGetErrorString(hipError); -} - -cudaError_t cudaGetLastError() { - return hipGetLastError(); -} - - -// Initialization -cudaError_t cudaDriverGetVersion(int *driverVersion) { - return hipDriverGetVersion(driverVersion); -} - -cudaError_t cudaRuntimeGetVersion(int *runtimeVersion) { - return hipRuntimeGetVersion(runtimeVersion); -} - - -// CUdevice operations -cudaError_t cudaGetDevice(int *deviceId) { - return hipGetDevice(deviceId); -} - -cudaError_t cudaDeviceGetAttribute(int* pi, cudaDeviceAttr attr, - int deviceId) { - return hipDeviceGetAttribute(pi, attr, deviceId); -} - -cudaError_t cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) { - return hipDeviceGetByPCIBusId(device, pciBusId); -} - -cudaError_t cudaDeviceGetPCIBusId(char *pciBusId, int len, int device) { - return hipDeviceGetPCIBusId(pciBusId, len, device); -} - -cudaError_t cudaGetDeviceCount(int *count) { - return hipGetDeviceCount(count); -} - -cudaError_t cudaSetDevice(int deviceId) { - return hipSetDevice(deviceId); -} - -cudaError_t cudaDeviceSynchronize() { - return hipDeviceSynchronize(); -} - -cudaError_t cudaDeviceCanAccessPeer(int* canAccessPeer, int deviceId, - int peerDeviceId) { - return hipDeviceCanAccessPeer(canAccessPeer, deviceId, peerDeviceId); -} - -cudaError_t cudaDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags) { - return hipDeviceEnablePeerAccess(peerDeviceId, flags); -} - -cudaError_t cudaDeviceDisablePeerAccess(int peerDeviceId) { - return hipDeviceDisablePeerAccess(peerDeviceId); -} - -cudaError_t cudaDeviceGetLimit(size_t* pValue, cudaLimit limit) { - return hipDeviceGetLimit(pValue, limit); -} - -cudaError_t cudaDeviceSetLimit(cudaLimit limit, size_t value) { - // see https://github.com/ROCm-Developer-Tools/HIP/issues/1632 - return hipErrorUnknown; -} - -// IPC operations -cudaError_t cudaIpcCloseMemHandle(void* devPtr) { - return hipIpcCloseMemHandle(devPtr); -} - -cudaError_t cudaIpcGetEventHandle(cudaIpcEventHandle_t* handle, cudaEvent_t event) { - return hipErrorUnknown; - - // TODO(leofang): this is supported after ROCm-Developer-Tools/HIP#1996 is released; - // as of ROCm 3.5.0 it is still not supported - //return hipIpcGetEventHandle(handle, event); -} - -cudaError_t cudaIpcGetMemHandle(cudaIpcMemHandle_t* handle, void* devPtr) { - return hipIpcGetMemHandle(handle, devPtr); -} - -cudaError_t cudaIpcOpenEventHandle(cudaEvent_t* event, cudaIpcEventHandle_t handle) { - return hipErrorUnknown; - - // TODO(leofang): this is supported after ROCm-Developer-Tools/HIP#1996 is released; - // as of ROCm 3.5.0 it is 
still not supported - //return hipIpcOpenEventHandle(event, handle); -} - -cudaError_t cudaIpcOpenMemHandle(void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags) { - return hipIpcOpenMemHandle(devPtr, handle, flags); -} - -// Memory management -enum cudaMemAllocationType {}; // stub -enum cudaMemAllocationHandleType {}; // stub -enum cudaMemLocationType {}; // stub -struct cudaMemLocation { // stub - int id; - cudaMemLocationType type; -}; -struct cudaMemPoolProps { // stub - cudaMemAllocationType allocType; - cudaMemAllocationHandleType handleTypes; - struct cudaMemLocation location; - unsigned char reserved[64]; - void* win32SecurityAttributes; -}; - -cudaError_t cudaMalloc(void** ptr, size_t size) { - return hipMalloc(ptr, size); -} - -cudaError_t cudaMalloc3DArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMallocArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMallocAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaHostAlloc(void** ptr, size_t size, unsigned int flags) { - return hipHostMalloc(ptr, size, flags); -} - -cudaError_t cudaHostRegister(...) { - return hipErrorUnknown; -} - -cudaError_t cudaHostUnregister(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMallocManaged(void** ptr, size_t size, unsigned int flags) { -#if HIP_VERSION >= 40300000 - return hipMallocManaged(ptr, size, flags); -#else - return hipErrorUnknown; -#endif -} - -int cudaFree(void* ptr) { - return hipFree(ptr); -} - -cudaError_t cudaFreeArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaFreeHost(void* ptr) { - return hipHostFree(ptr); -} - -cudaError_t cudaFreeAsync(...) { - return hipErrorUnknown; -} - -int cudaMemGetInfo(size_t* free, size_t* total) { - return hipMemGetInfo(free, total); -} - -cudaError_t cudaMemcpy(void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind) { - return hipMemcpy(dst, src, sizeBytes, kind); -} - -cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t sizeBytes, - cudaMemcpyKind kind, cudaStream_t stream) { - return hipMemcpyAsync(dst, src, sizeBytes, kind, stream); -} - -cudaError_t cudaMemcpyPeer(void* dst, int dstDeviceId, const void* src, - int srcDeviceId, size_t sizeBytes) { - return hipMemcpyPeer(dst, dstDeviceId, src, srcDeviceId, sizeBytes); -} - -cudaError_t cudaMemcpyPeerAsync(void* dst, int dstDevice, const void* src, - int srcDevice, size_t sizeBytes, - cudaStream_t stream) { - return hipMemcpyPeerAsync(dst, dstDevice, src, srcDevice, sizeBytes, - stream); -} - -cudaError_t cudaMemcpy2D(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DFromArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DFromArrayAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DToArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DToArrayAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy3D(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy3DAsync(...) 
{ - return hipErrorUnknown; -} - -cudaError_t cudaMemset(void* dst, int value, size_t sizeBytes) { - return hipMemset(dst, value, sizeBytes); -} - -cudaError_t cudaMemsetAsync(void* dst, int value, size_t sizeBytes, - cudaStream_t stream) { - return hipMemsetAsync(dst, value, sizeBytes, stream); -} - -cudaError_t cudaMemAdvise(const void *devPtr, size_t count, - cudaMemoryAdvise advice, int device) { -#if HIP_VERSION >= 40300000 - return hipMemAdvise(devPtr, count, advice, device); -#else - return hipErrorUnknown; -#endif -} - -cudaError_t cudaMemPrefetchAsync(const void *devPtr, size_t count, - int dstDevice, cudaStream_t stream) { -#if HIP_VERSION >= 40300000 - return hipMemPrefetchAsync(devPtr, count, dstDevice, stream); -#else - return hipErrorUnknown; -#endif -} - -cudaError_t cudaPointerGetAttributes(cudaPointerAttributes *attributes, - const void* ptr) { - cudaError_t status = hipPointerGetAttributes(attributes, ptr); -#if HIP_VERSION >= 60000000 - if (status == cudaSuccess) { - switch (attributes->type) { - case 0 /* hipMemoryTypeHost */: - attributes->type = (hipMemoryType)1; /* cudaMemoryTypeHost */ - return status; - case 1 /* hipMemoryTypeDevice */: - attributes->type = (hipMemoryType)2; /* cudaMemoryTypeDevice */ - return status; - default: - /* we don't care the rest of possibilities */ - return status; - } - } -#else - if (status == cudaSuccess) { - switch (attributes->memoryType) { - case 0 /* hipMemoryTypeHost */: - attributes->memoryType = (hipMemoryType)1; /* cudaMemoryTypeHost */ - return status; - case 1 /* hipMemoryTypeDevice */: - attributes->memoryType = (hipMemoryType)2; /* cudaMemoryTypeDevice */ - return status; - default: - /* we don't care the rest of possibilities */ - return status; - } - } -#endif - else { - return status; - } -} - -cudaError_t cudaGetDeviceProperties(cudaDeviceProp *prop, int device) { - return hipGetDeviceProperties(prop, device); -} - -cudaError_t cudaMallocFromPoolAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolCreate(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolDestroy(...) { - return hipErrorUnknown; -} - -cudaError_t cudaDeviceGetDefaultMemPool(...) { - return hipErrorUnknown; -} - -cudaError_t cudaDeviceGetMemPool(...) { - return hipErrorUnknown; -} - -cudaError_t cudaDeviceSetMemPool(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolTrimTo(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolGetAttribute(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolSetAttribute(...) 
{ - return hipErrorUnknown; -} - - // Stream and Event #if HIP_VERSION >= 40300000 typedef hipStreamCaptureMode cudaStreamCaptureMode; @@ -353,185 +17,34 @@ enum cudaStreamCaptureMode {}; enum cudaStreamCaptureStatus {}; #endif -cudaError_t cudaStreamCreate(cudaStream_t *stream) { - return hipStreamCreate(stream); -} - -cudaError_t cudaStreamCreateWithFlags(cudaStream_t *stream, - unsigned int flags) { - return hipStreamCreateWithFlags(stream, flags); -} - -cudaError_t cudaStreamDestroy(cudaStream_t stream) { - return hipStreamDestroy(stream); -} - -cudaError_t cudaStreamSynchronize(cudaStream_t stream) { - return hipStreamSynchronize(stream); -} - -cudaError_t cudaStreamAddCallback(cudaStream_t stream, - cudaStreamCallback_t callback, - void *userData, unsigned int flags) { - return hipStreamAddCallback(stream, callback, userData, flags); -} - -cudaError_t cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void* userData) { - return hipErrorUnknown; -} - -cudaError_t cudaStreamQuery(cudaStream_t stream) { - return hipStreamQuery(stream); -} - -cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, - unsigned int flags) { - return hipStreamWaitEvent(stream, event, flags); -} - -cudaError_t cudaEventCreate(cudaEvent_t* event) { - return hipEventCreate(event); -} - -cudaError_t cudaEventCreateWithFlags(cudaEvent_t* event, unsigned flags) { - return hipEventCreateWithFlags(event, flags); -} - -cudaError_t cudaEventDestroy(cudaEvent_t event) { - return hipEventDestroy(event); -} - -cudaError_t cudaEventElapsedTime(float *ms, cudaEvent_t start, - cudaEvent_t stop){ - return hipEventElapsedTime(ms, start, stop); -} - -cudaError_t cudaEventQuery(cudaEvent_t event) { - return hipEventQuery(event); -} - -cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) { - return hipEventRecord(event, stream); -} - -cudaError_t cudaEventSynchronize(cudaEvent_t event) { - return hipEventSynchronize(event); -} - -cudaError_t cudaStreamBeginCapture(cudaStream_t stream, - cudaStreamCaptureMode mode) { -#if HIP_VERSION >= 40300000 - return hipStreamBeginCapture(stream, mode); -#else - return hipErrorUnknown; -#endif -} - -cudaError_t cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) { -#if HIP_VERSION >= 40300000 - return hipStreamEndCapture(stream, pGraph); -#else - return hipErrorUnknown; -#endif -} - -cudaError_t cudaStreamIsCapturing(cudaStream_t stream, - cudaStreamCaptureStatus* pCaptureStatus) { -#if HIP_VERSION >= 50000000 - return hipStreamIsCapturing(stream, pCaptureStatus); -#else - return hipErrorUnknown; -#endif -} - - -// Texture -cudaError_t cudaCreateTextureObject(...) { - return cudaSuccess; -} - -cudaError_t cudaDestroyTextureObject(...) { - return cudaSuccess; -} - -cudaError_t cudaGetChannelDesc(...) { - return cudaSuccess; -} - -cudaError_t cudaGetTextureObjectResourceDesc(...) { - return cudaSuccess; -} - -cudaError_t cudaGetTextureObjectTextureDesc(...) { - return cudaSuccess; -} - -cudaExtent make_cudaExtent(...) { - cudaExtent ex = {}; - return ex; -} - -cudaPitchedPtr make_cudaPitchedPtr(...) { - cudaPitchedPtr ptr = {}; - return ptr; -} - -cudaPos make_cudaPos(...) 
{ - cudaPos pos = {}; - return pos; -} - -// Surface -cudaError_t cudaCreateSurfaceObject(cudaSurfaceObject_t* pSurfObject, - const cudaResourceDesc* pResDesc) { - return hipCreateSurfaceObject(pSurfObject, pResDesc); -} - -cudaError_t cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) { - return hipDestroySurfaceObject(surfObject); -} - // CUDA Graph +#if 0 < HIP_VERSION < 40300000 cudaError_t cudaGraphInstantiate( cudaGraphExec_t* pGraphExec, cudaGraph_t graph, cudaGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize) { -#if HIP_VERSION >= 40300000 - return hipGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize); -#else return hipErrorUnknown; -#endif } cudaError_t cudaGraphDestroy(cudaGraph_t graph) { -#if HIP_VERSION >= 40300000 - return hipGraphDestroy(graph); -#else return hipErrorUnknown; -#endif } cudaError_t cudaGraphExecDestroy(cudaGraphExec_t graphExec) { -#if HIP_VERSION >= 40300000 - return hipGraphExecDestroy(graphExec); -#else return hipErrorUnknown; -#endif } cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) { -#if HIP_VERSION >= 40300000 - return hipGraphLaunch(graphExec, stream); -#else return hipErrorUnknown; -#endif } +#elif 0 < HIP_VERSION < 50300000 cudaError_t cudaGraphUpload(...) { return hipErrorUnknown; } +#endif } // extern "C" diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index 88f30e59c60..75a79fe320c 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -96,6 +96,7 @@ def _from_dict(d: Dict[str, Any], ctx: Context) -> Feature: 'cupy_backends.cuda.api._driver_enum', 'cupy_backends.cuda.api.runtime', 'cupy_backends.cuda.api._runtime_enum', + 'cupy_backends.cuda.api.runtime_hip', 'cupy_backends.cuda.libs.cublas', 'cupy_backends.cuda.libs.curand', 'cupy_backends.cuda.libs.cusparse', From b799e4ef17aeb533d53f37097a54972923eb29d6 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:36:37 +0000 Subject: [PATCH 02/12] Fix error with MemoryType --- cupy_backends/cuda/api/_runtime_typedef.pxi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cupy_backends/cuda/api/_runtime_typedef.pxi b/cupy_backends/cuda/api/_runtime_typedef.pxi index a4a129e0087..8538eaf189e 100644 --- a/cupy_backends/cuda/api/_runtime_typedef.pxi +++ b/cupy_backends/cuda/api/_runtime_typedef.pxi @@ -116,6 +116,9 @@ cdef extern from *: ctypedef int MemAllocationType 'cudaMemAllocationType' ctypedef int MemAllocationHandleType 'cudaMemAllocationHandleType' ctypedef int MemLocationType 'cudaMemLocationType' + IF CUPY_HIP_VERSION > 0: + ctypedef int MemoryType 'hipMemoryType' + IF CUPY_CUDA_VERSION > 0: # This is for the annoying nested struct, which is not # perfectly supprted in Cython From ab74117a0436a7a3bde8730b9a25787a5cdbb8d7 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:37:00 +0000 Subject: [PATCH 03/12] Update runtime enums --- cupy_backends/cuda/api/_runtime_enum.pxd | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cupy_backends/cuda/api/_runtime_enum.pxd b/cupy_backends/cuda/api/_runtime_enum.pxd index 0e29e87d141..df1870d441a 100644 --- a/cupy_backends/cuda/api/_runtime_enum.pxd +++ b/cupy_backends/cuda/api/_runtime_enum.pxd @@ -33,9 +33,6 @@ cpdef enum: cudaLimitStackSize = 0x00 cudaLimitPrintfFifoSize = 0x01 cudaLimitMallocHeapSize = 0x02 - cudaLimitDevRuntimeSyncDepth = 0x03 - cudaLimitDevRuntimePendingLaunchCount = 0x04 - cudaLimitMaxL2FetchGranularity = 0x05 # 
cudaChannelFormatKind cudaChannelFormatKindSigned = 0 @@ -94,7 +91,6 @@ cpdef enum: # cudaMemLocationType cudaMemLocationTypeDevice = 1 - # This was a legacy mistake: the prefix "cuda" should have been removed # so that we can directly assign their C counterparts here. Now because # of backward compatibility and no flexible Cython macro (IF/ELSE), we @@ -228,6 +224,10 @@ IF CUPY_HIP_VERSION > 0: ELSE: # For CUDA/RTD cpdef enum: + cudaLimitDevRuntimeSyncDepth = 0x03 + cudaLimitDevRuntimePendingLaunchCount = 0x04 + cudaLimitMaxL2FetchGranularity = 0x05 + cudaDevAttrMaxThreadsPerBlock = 1 cudaDevAttrMaxBlockDimX cudaDevAttrMaxBlockDimY @@ -343,3 +343,8 @@ ELSE: cudaDevAttrGPUDirectRDMAFlushWritesOptions cudaDevAttrGPUDirectRDMAWritesOrdering cudaDevAttrMemoryPoolSupportedHandleTypes + +IF CUPY_HIP_VERSION > 0: + cpdef enum: + hipIpcMemLazyEnablePeerAccess = 1 + hipMemAttachGlobal = 1 From 3e6b9b6b260577463c58b187481f3914f66e1325 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:38:17 +0000 Subject: [PATCH 04/12] Update runtime api declaration for HIP --- cupy_backends/cuda/api/runtime.pxd | 357 +++++++++++---------- install/amd_build/rocm_custom_mapping.json | 3 +- 2 files changed, 182 insertions(+), 178 deletions(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 40f2968e446..32b0b7dea00 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -1,26 +1,6 @@ from libc.stdint cimport intptr_t, uintmax_t -############################################################################### -# Classes -############################################################################### - -cdef class PointerAttributes: - cdef: - readonly int device - readonly intptr_t devicePointer - readonly intptr_t hostPointer - readonly int type - -cdef class MemPoolProps: - # flatten the struct & list meaningful members - cdef: - int allocType - int handleType - int locationType - int devId - - ############################################################################### # Types and Enums ############################################################################### @@ -89,11 +69,15 @@ IF CUPY_USE_CUDA_PYTHON: ctypedef cudaMemAllocationHandleType MemAllocationHandleType ctypedef cudaMemLocationType MemLocationType + IF CUPY_HIP_VERSION > 0: + ctypedef hipMemoryType MemoryType ELSE: - include "_runtime_typedef.pxi" + IF CUPY_HIP_VERSION > 0: + include "_runtime_typedef_hip.pxi" + ELSE: + include "_runtime_typedef.pxi" from cupy_backends.cuda.api._runtime_enum cimport * - # For backward compatibility, keep APIs not prefixed with "cuda". cpdef enum: memcpyHostToHost = 0 @@ -158,164 +142,183 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment - -############################################################################### -# Error handling -############################################################################### - -cpdef check_status(int status) - - -############################################################################### -# Initialization -############################################################################### - -cpdef int driverGetVersion() except? -1 -cpdef int runtimeGetVersion() except? -1 - - -############################################################################### -# Device and context operations -############################################################################### - -cpdef int getDevice() except? 
-1 -cpdef int deviceGetAttribute(int attrib, int device) except? -1 -cpdef int deviceGetByPCIBusId(str pci_bus_id) except? -1 -cpdef str deviceGetPCIBusId(int device) -cpdef int getDeviceCount() except? -1 -cpdef setDevice(int device) -cpdef deviceSynchronize() -cpdef getDeviceProperties(int device) - -cpdef int deviceCanAccessPeer(int device, int peerDevice) except? -1 -cpdef deviceEnablePeerAccess(int peerDevice) -cpdef _deviceEnsurePeerAccess(int peerDevice) - -cpdef size_t deviceGetLimit(int limit) except? -1 -cpdef deviceSetLimit(int limit, size_t value) - - -############################################################################### -# Memory management -############################################################################### - -cpdef intptr_t malloc(size_t size) except? 0 -cpdef intptr_t mallocManaged(size_t size, unsigned int flags=*) except? 0 -cpdef intptr_t malloc3DArray(intptr_t desc, size_t width, size_t height, - size_t depth, unsigned int flags=*) except? 0 -cpdef intptr_t mallocArray(intptr_t desc, size_t width, size_t height, - unsigned int flags=*) except? 0 -cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 0 -cpdef intptr_t mallocFromPoolAsync(size_t, intptr_t, intptr_t) except? 0 -cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 0 -cpdef hostRegister(intptr_t ptr, size_t size, unsigned int flags) -cpdef hostUnregister(intptr_t ptr) -cpdef free(intptr_t ptr) -cpdef freeHost(intptr_t ptr) -cpdef freeArray(intptr_t ptr) -cpdef freeAsync(intptr_t ptr, intptr_t stream) -cpdef memGetInfo() -cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind) -cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, - intptr_t stream) -cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, - size_t size) -cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, - intptr_t src, int srcDevice, - size_t size, intptr_t stream) -cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind) -cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind, - intptr_t stream) -cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind) -cpdef memcpy2DFromArrayAsync(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind, intptr_t stream) -cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, size_t height, - int kind) -cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, - size_t height, int kind, intptr_t stream) -cpdef memcpy3D(intptr_t Memcpy3DParmsPtr) -cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream) -cpdef memset(intptr_t ptr, int value, size_t size) -cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream) -cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, - intptr_t stream) -cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device) -cpdef PointerAttributes pointerGetAttributes(intptr_t ptr) -cpdef intptr_t deviceGetDefaultMemPool(int) except? 0 -cpdef intptr_t deviceGetMemPool(int) except? 0 -cpdef deviceSetMemPool(int, intptr_t) -cpdef intptr_t memPoolCreate(MemPoolProps) except? 
0 -cpdef memPoolDestroy(intptr_t) -cpdef memPoolTrimTo(intptr_t, size_t) -cpdef memPoolGetAttribute(intptr_t, int) -cpdef memPoolSetAttribute(intptr_t, int, object) - - -############################################################################### -# Stream and Event -############################################################################### - -cpdef intptr_t streamCreate() except? 0 -cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 0 -cpdef streamDestroy(intptr_t stream) -cpdef streamSynchronize(intptr_t stream) -cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, - unsigned int flags=*) -cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg) -cpdef streamQuery(intptr_t stream) -cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=*) -cpdef streamBeginCapture(intptr_t stream, int mode=*) -cpdef intptr_t streamEndCapture(intptr_t stream) except? 0 -cpdef bint streamIsCapturing(intptr_t stream) except* -cpdef intptr_t eventCreate() except? 0 -cpdef intptr_t eventCreateWithFlags(unsigned int flags) except? 0 -cpdef eventDestroy(intptr_t event) -cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 0 -cpdef eventQuery(intptr_t event) -cpdef eventRecord(intptr_t event, intptr_t stream) -cpdef eventSynchronize(intptr_t event) - - -############################################################################## -# util -############################################################################## - -cdef _ensure_context() - - -############################################################################## -# Texture -############################################################################## - -cpdef uintmax_t createTextureObject( +IF CUPY_DONT_USE_GEN_HIP_CODE: + ############################################################################### + # Classes + ############################################################################### + + cdef class PointerAttributes: + cdef: + readonly int device + readonly intptr_t devicePointer + readonly intptr_t hostPointer + readonly int type + + cdef class MemPoolProps: + # flatten the struct & list meaningful members + cdef: + int allocType + int handleType + int locationType + int devId + + ############################################################################### + # Error handling + ############################################################################### + + cpdef check_status(int status) + + + ############################################################################### + # Initialization + ############################################################################### + + cpdef int driverGetVersion() except? -1 + cpdef int runtimeGetVersion() except? -1 + + + ############################################################################### + # Device and context operations + ############################################################################### + + cpdef int getDevice() except? -1 + cpdef int deviceGetAttribute(int attrib, int device) except? -1 + cpdef int deviceGetByPCIBusId(str pci_bus_id) except? -1 + cpdef str deviceGetPCIBusId(int device) + cpdef int getDeviceCount() except? -1 + cpdef setDevice(int device) + cpdef deviceSynchronize() + cpdef getDeviceProperties(int device) + + cpdef int deviceCanAccessPeer(int device, int peerDevice) except? -1 + cpdef deviceEnablePeerAccess(int peerDevice) + cpdef _deviceEnsurePeerAccess(int peerDevice) + + cpdef size_t deviceGetLimit(int limit) except? 
-1 + cpdef deviceSetLimit(int limit, size_t value) + + + ############################################################################### + # Memory management + ############################################################################### + + cpdef intptr_t malloc(size_t size) except? 0 + cpdef intptr_t mallocManaged(size_t size, unsigned int flags=*) except? 0 + cpdef intptr_t malloc3DArray(intptr_t desc, size_t width, size_t height, + size_t depth, unsigned int flags=*) except? 0 + cpdef intptr_t mallocArray(intptr_t desc, size_t width, size_t height, + unsigned int flags=*) except? 0 + cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 0 + cpdef intptr_t mallocFromPoolAsync(size_t, intptr_t, intptr_t) except? 0 + cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 0 + cpdef hostRegister(intptr_t ptr, size_t size, unsigned int flags) + cpdef hostUnregister(intptr_t ptr) + cpdef free(intptr_t ptr) + cpdef freeHost(intptr_t ptr) + cpdef freeArray(intptr_t ptr) + cpdef freeAsync(intptr_t ptr, intptr_t stream) + cpdef memGetInfo() + cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind) + cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, + intptr_t stream) + cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, + size_t size) + cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, + intptr_t src, int srcDevice, + size_t size, intptr_t stream) + cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, + size_t width, size_t height, MemoryKind kind) + cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, + size_t width, size_t height, MemoryKind kind, + intptr_t stream) + cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind) + cpdef memcpy2DFromArrayAsync(intptr_t dst, size_t dpitch, intptr_t src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind, intptr_t stream) + cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, + intptr_t src, size_t spitch, size_t width, size_t height, + int kind) + cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, + intptr_t src, size_t spitch, size_t width, + size_t height, int kind, intptr_t stream) + cpdef memcpy3D(intptr_t Memcpy3DParmsPtr) + cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream) + cpdef memset(intptr_t ptr, int value, size_t size) + cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream) + cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, + intptr_t stream) + cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device) + cpdef PointerAttributes pointerGetAttributes(intptr_t ptr) + cpdef intptr_t deviceGetDefaultMemPool(int) except? 0 + cpdef intptr_t deviceGetMemPool(int) except? 0 + cpdef deviceSetMemPool(int, intptr_t) + cpdef intptr_t memPoolCreate(MemPoolProps) except? 0 + cpdef memPoolDestroy(intptr_t) + cpdef memPoolTrimTo(intptr_t, size_t) + cpdef memPoolGetAttribute(intptr_t, int) + cpdef memPoolSetAttribute(intptr_t, int, object) + + + ############################################################################### + # Stream and Event + ############################################################################### + + cpdef intptr_t streamCreate() except? 0 + cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 
0 + cpdef streamDestroy(intptr_t stream) + cpdef streamSynchronize(intptr_t stream) + cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, + unsigned int flags=*) + cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg) + cpdef streamQuery(intptr_t stream) + cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=*) + cpdef streamBeginCapture(intptr_t stream, int mode=*) + cpdef intptr_t streamEndCapture(intptr_t stream) except? 0 + cpdef bint streamIsCapturing(intptr_t stream) except* + cpdef intptr_t eventCreate() except? 0 + cpdef intptr_t eventCreateWithFlags(unsigned int flags) except? 0 + cpdef eventDestroy(intptr_t event) + cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 0 + cpdef eventQuery(intptr_t event) + cpdef eventRecord(intptr_t event, intptr_t stream) + cpdef eventSynchronize(intptr_t event) + + + ############################################################################## + # util + ############################################################################## + + cdef _ensure_context() + + + ############################################################################## + # Texture + ############################################################################## + + cpdef uintmax_t createTextureObject( intptr_t ResDesc, intptr_t TexDesc) except? 0 -cpdef destroyTextureObject(uintmax_t texObject) -cdef ChannelFormatDesc getChannelDesc(intptr_t array) except* -cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t texobj) except* -cdef TextureDesc getTextureObjectTextureDesc(uintmax_t texobj) except* -cdef Extent make_Extent(size_t w, size_t h, size_t d) except* -cdef Pos make_Pos(size_t x, size_t y, size_t z) except* -cdef PitchedPtr make_PitchedPtr( + cpdef destroyTextureObject(uintmax_t texObject) + cdef ChannelFormatDesc getChannelDesc(intptr_t array) except* + cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t texobj) except* + cdef TextureDesc getTextureObjectTextureDesc(uintmax_t texobj) except* + cdef Extent make_Extent(size_t w, size_t h, size_t d) except* + cdef Pos make_Pos(size_t x, size_t y, size_t z) except* + cdef PitchedPtr make_PitchedPtr( intptr_t d, size_t p, size_t xsz, size_t ysz) except* -cpdef uintmax_t createSurfaceObject(intptr_t ResDesc) except? 0 -cpdef destroySurfaceObject(uintmax_t surfObject) -# TODO(leofang): add cudaGetSurfaceObjectResourceDesc + cpdef uintmax_t createSurfaceObject(intptr_t ResDesc) except? 0 + cpdef destroySurfaceObject(uintmax_t surfObject) + # TODO(leofang): add cudaGetSurfaceObjectResourceDesc -############################################################################## -# Graph -############################################################################## + ############################################################################## + # Graph + ############################################################################## -cpdef graphDestroy(intptr_t graph) -cpdef graphExecDestroy(intptr_t graphExec) -cpdef intptr_t graphInstantiate(intptr_t graph) except? 0 -cpdef graphLaunch(intptr_t graphExec, intptr_t stream) -cpdef graphUpload(intptr_t graphExec, intptr_t stream) + cpdef graphDestroy(intptr_t graph) + cpdef graphExecDestroy(intptr_t graphExec) + cpdef intptr_t graphInstantiate(intptr_t graph) except? 
0 + cpdef graphLaunch(intptr_t graphExec, intptr_t stream) + cpdef graphUpload(intptr_t graphExec, intptr_t stream) diff --git a/install/amd_build/rocm_custom_mapping.json b/install/amd_build/rocm_custom_mapping.json index 728ed421b85..4ac4e0b978e 100644 --- a/install/amd_build/rocm_custom_mapping.json +++ b/install/amd_build/rocm_custom_mapping.json @@ -1,5 +1,6 @@ { "custom_map": { - "CUPY_USE_GEN_HIP_CODE" : "CUPY_DONT_USE_GEN_HIP_CODE" + "CUPY_USE_GEN_HIP_CODE" : "CUPY_DONT_USE_GEN_HIP_CODE", + "CUPY_DONT_USE_GEN_HIP_CODE" : "CUPY_USE_GEN_HIP_CODE" } } From 0afc224434728ee9b4b3f5761a37f2ddeefa84bf Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:39:12 +0000 Subject: [PATCH 05/12] Fix cimport error with runtime --- cupy/_core/_accelerator.pyx | 5 ++++- cupy/_core/_cub_reduction.pyx | 5 ++++- cupy/_core/_dtype.pyx | 5 ++++- cupy/_core/_fusion_kernel.pyx | 5 ++++- cupy/_core/_kernel.pyx | 5 ++++- cupy/_core/_reduction.pyx | 5 ++++- cupy/_core/_routines_linalg.pyx | 5 ++++- cupy/_core/_routines_math.pyx | 5 ++++- cupy/_core/core.pyx | 5 ++++- cupy/_core/dlpack.pyx | 5 ++++- cupy/_core/raw.pyx | 5 ++++- cupy/cuda/common.pyx | 5 ++++- cupy/cuda/cub.pyx | 5 ++++- cupy/cuda/device.pyx | 5 ++++- cupy/cuda/function.pyx | 5 ++++- cupy/cuda/graph.pyx | 5 ++++- cupy/cuda/memory.pyx | 5 ++++- cupy/cuda/pinned_memory.pyx | 5 ++++- cupy/cuda/stream.pyx | 5 ++++- cupy/cuda/texture.pyx | 5 ++++- cupy/fft/_cache.pyx | 5 ++++- cupy/fft/_callback.pyx | 5 ++++- cupy_backends/cuda/libs/_cnvrtc.pxi | 5 ++++- cupy_backends/cuda/libs/cublas.pyx | 5 ++++- cupy_backends/cuda/libs/cudnn.pyx | 5 ++++- cupy_backends/cuda/libs/curand.pyx | 5 ++++- cupy_backends/cuda/libs/cusolver.pyx | 5 ++++- cupy_backends/cuda/libs/cusparse.pyx | 5 ++++- cupy_backends/cuda/libs/nccl.pyx | 5 ++++- cupy_backends/cuda/libs/nvrtc.pyx | 5 ++++- cupy_backends/cuda/libs/profiler.pyx | 5 ++++- cupy_backends/cuda/stream.pyx | 5 ++++- 32 files changed, 128 insertions(+), 32 deletions(-) diff --git a/cupy/_core/_accelerator.pyx b/cupy/_core/_accelerator.pyx index 6ce2991e786..00e2bfc6ebf 100644 --- a/cupy/_core/_accelerator.pyx +++ b/cupy/_core/_accelerator.pyx @@ -1,6 +1,9 @@ import os -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime cdef list _elementwise_accelerators = [] diff --git a/cupy/_core/_cub_reduction.pyx b/cupy/_core/_cub_reduction.pyx index 95ed9bce145..d3336fb1a4a 100644 --- a/cupy/_core/_cub_reduction.pyx +++ b/cupy/_core/_cub_reduction.pyx @@ -10,7 +10,10 @@ from cupy._core cimport internal from cupy.cuda cimport cub from cupy.cuda cimport function from cupy.cuda cimport memory -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime import math import string diff --git a/cupy/_core/_dtype.pyx b/cupy/_core/_dtype.pyx index e5bc7251c14..57963c40792 100644 --- a/cupy/_core/_dtype.pyx +++ b/cupy/_core/_dtype.pyx @@ -2,7 +2,10 @@ cimport cython # NOQA import numpy import warnings -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime all_type_chars = '?bhilqBHILQefdFD' diff --git a/cupy/_core/_fusion_kernel.pyx b/cupy/_core/_fusion_kernel.pyx index ed8f83738b4..928dbb43c5b 100644 --- a/cupy/_core/_fusion_kernel.pyx +++ 
b/cupy/_core/_fusion_kernel.pyx @@ -10,7 +10,10 @@ from cupy._core.core cimport _ndarray_base from cupy._core cimport internal from cupy._core cimport _routines_manipulation as _manipulation from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime import cupy as _cupy from cupy._core import _dtype diff --git a/cupy/_core/_kernel.pyx b/cupy/_core/_kernel.pyx index 1e719f6f6fc..f17af4120a1 100644 --- a/cupy/_core/_kernel.pyx +++ b/cupy/_core/_kernel.pyx @@ -27,7 +27,10 @@ from cupy._core.core cimport _ndarray_init from cupy._core.core cimport compile_with_cache from cupy._core.core cimport _ndarray_base from cupy._core cimport internal -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime try: import cupy_backends.cuda.libs.cutensor as cuda_cutensor diff --git a/cupy/_core/_reduction.pyx b/cupy/_core/_reduction.pyx index 0046ce801ca..374a9978b48 100644 --- a/cupy/_core/_reduction.pyx +++ b/cupy/_core/_reduction.pyx @@ -25,7 +25,10 @@ from cupy._core.core cimport _ndarray_base from cupy._core cimport internal from cupy.cuda cimport device from cupy.cuda cimport function -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime import math import string diff --git a/cupy/_core/_routines_linalg.pyx b/cupy/_core/_routines_linalg.pyx index 25fde4cf839..7b1373a6e55 100644 --- a/cupy/_core/_routines_linalg.pyx +++ b/cupy/_core/_routines_linalg.pyx @@ -26,7 +26,10 @@ from cupy._core cimport _memory_range from cupy._core cimport _routines_manipulation as _manipulation from cupy._core cimport _routines_math as _math from cupy.cuda cimport device -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime cdef extern from '../../cupy_backends/cupy_complex.h': diff --git a/cupy/_core/_routines_math.pyx b/cupy/_core/_routines_math.pyx index 440c8cbfa70..b040a34b6cc 100644 --- a/cupy/_core/_routines_math.pyx +++ b/cupy/_core/_routines_math.pyx @@ -11,7 +11,10 @@ import cupy._core.core as core from cupy._core cimport internal from cupy import _util -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy._core cimport _accelerator from cupy._core._dtype cimport get_dtype from cupy._core.core cimport _ndarray_init diff --git a/cupy/_core/core.pyx b/cupy/_core/core.pyx index 8f44ceafe5b..7276b8fda47 100644 --- a/cupy/_core/core.pyx +++ b/cupy/_core/core.pyx @@ -47,7 +47,10 @@ from cupy.cuda cimport pinned_memory from cupy.cuda cimport memory from cupy.cuda cimport stream as stream_module from cupy_backends.cuda cimport stream as _stream_module -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime # If rop of cupy.ndarray is called, cupy's op is the last chance. 
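The cimport hunks in this patch all apply one compile-time dispatch: when the build targets ROCm (CUPY_HIP_VERSION > 0), the `cimport` of cupy_backends.cuda.api.runtime is redirected to the generated cupy_backends.cuda.api.runtime_hip module (registered in install/cupy_builder/_features.py by PATCH 01), and the hand-written bindings are used otherwise. The Cython IF/ELSE blocks are resolved when the extensions are cythonized, so there is no run-time branching. A rough pure-Python sketch of the same selection, for illustration only (the module paths and runtimeGetVersion are taken from the patch; the try/except fallback is an assumption, since the real choice is made at compile time):

    # Illustrative sketch, not part of the patch: the actual selection happens
    # at Cython compile time via "IF CUPY_HIP_VERSION > 0", not at import time.
    try:
        # Generated, hipified runtime bindings.
        from cupy_backends.cuda.api import runtime_hip as runtime
    except ImportError:
        # CUDA (or non-hipified) build: fall back to the hand-written bindings.
        from cupy_backends.cuda.api import runtime

    print(runtime.runtimeGetVersion())
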
diff --git a/cupy/_core/dlpack.pyx b/cupy/_core/dlpack.pyx index 1da7155b87a..68eac765c87 100644 --- a/cupy/_core/dlpack.pyx +++ b/cupy/_core/dlpack.pyx @@ -9,7 +9,10 @@ from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t from libcpp.vector cimport vector -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module from cupy._core.core cimport _ndarray_base from cupy.cuda cimport memory diff --git a/cupy/_core/raw.pyx b/cupy/_core/raw.pyx index c9905382d96..2a766037e33 100644 --- a/cupy/_core/raw.pyx +++ b/cupy/_core/raw.pyx @@ -3,7 +3,10 @@ import pickle import cupy from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy.cuda.function cimport Function, Module diff --git a/cupy/cuda/common.pyx b/cupy/cuda/common.pyx index 120c2399da7..caa1f2db0ac 100644 --- a/cupy/cuda/common.pyx +++ b/cupy/cuda/common.pyx @@ -1,4 +1,7 @@ -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy.cuda cimport device import numpy diff --git a/cupy/cuda/cub.pyx b/cupy/cuda/cub.pyx index 8b62e7bc5b6..ebc7b84c4d3 100644 --- a/cupy/cuda/cub.pyx +++ b/cupy/cuda/cub.pyx @@ -5,7 +5,10 @@ from cpython cimport sequence from libc.stdint cimport intptr_t -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy._core.core cimport _internal_ascontiguousarray from cupy._core.internal cimport _contig_axes, is_in from cupy.cuda cimport common diff --git a/cupy/cuda/device.pyx b/cupy/cuda/device.pyx index 034d3f6aefd..cdde85dd397 100644 --- a/cupy/cuda/device.pyx +++ b/cupy/cuda/device.pyx @@ -3,7 +3,10 @@ import threading from cupy._core import syncdetect -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda.api import runtime as runtime_module from cupy import _util diff --git a/cupy/cuda/function.pyx b/cupy/cuda/function.pyx index 9d7c4db7fbe..b1389d86e01 100644 --- a/cupy/cuda/function.pyx +++ b/cupy/cuda/function.pyx @@ -14,7 +14,10 @@ from libcpp cimport vector from cupy._core cimport _carray from cupy._core.core cimport _ndarray_base from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy.cuda cimport stream as stream_module from cupy.cuda.memory cimport MemoryPointer from cupy.cuda.texture cimport TextureObject, SurfaceObject diff --git a/cupy/cuda/graph.pyx b/cupy/cuda/graph.pyx index 005cf0be11b..a5a2a9a5597 100644 --- a/cupy/cuda/graph.pyx +++ b/cupy/cuda/graph.pyx @@ -1,4 +1,7 @@ -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module diff --git 
a/cupy/cuda/memory.pyx b/cupy/cuda/memory.pyx index e17cbcdd2e7..1c0a7f2251d 100644 --- a/cupy/cuda/memory.pyx +++ b/cupy/cuda/memory.pyx @@ -20,7 +20,10 @@ from cupy.cuda cimport device from cupy.cuda cimport memory_hook from cupy.cuda cimport stream as stream_module from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda.api.runtime import CUDARuntimeError from cupy import _util diff --git a/cupy/cuda/pinned_memory.pyx b/cupy/cuda/pinned_memory.pyx index 3e88580364c..4e3e0220510 100644 --- a/cupy/cuda/pinned_memory.pyx +++ b/cupy/cuda/pinned_memory.pyx @@ -8,7 +8,10 @@ from fastrlock cimport rlock from cupy_backends.cuda.api import runtime from cupy._core cimport internal -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy import _util diff --git a/cupy/cuda/stream.pyx b/cupy/cuda/stream.pyx index 96e34e6618f..6809d34e399 100644 --- a/cupy/cuda/stream.pyx +++ b/cupy/cuda/stream.pyx @@ -1,6 +1,9 @@ import threading -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as backends_stream from cupy.cuda cimport graph diff --git a/cupy/cuda/texture.pyx b/cupy/cuda/texture.pyx index d66cbb1ad82..33a2d3f7ec9 100644 --- a/cupy/cuda/texture.pyx +++ b/cupy/cuda/texture.pyx @@ -5,7 +5,10 @@ import numpy from cupy._core.core cimport _ndarray_base from cupy._core.core cimport _internal_ascontiguousarray -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda.api.runtime cimport Array, \ ChannelFormatDesc, ChannelFormatKind, \ Memcpy3DParms, MemoryKind, PitchedPtr, ResourceDesc, ResourceType, \ diff --git a/cupy/fft/_cache.pyx b/cupy/fft/_cache.pyx index 9eae6e273d0..8160877c829 100644 --- a/cupy/fft/_cache.pyx +++ b/cupy/fft/_cache.pyx @@ -4,7 +4,10 @@ import gc import warnings import weakref -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime import threading diff --git a/cupy/fft/_callback.pyx b/cupy/fft/_callback.pyx index 443835680d4..80a274cce32 100644 --- a/cupy/fft/_callback.pyx +++ b/cupy/fft/_callback.pyx @@ -1,6 +1,9 @@ from libc.stdint cimport intptr_t -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy._core.core cimport _ndarray_base from cupy.cuda.device cimport get_compute_capability diff --git a/cupy_backends/cuda/libs/_cnvrtc.pxi b/cupy_backends/cuda/libs/_cnvrtc.pxi index 0d22cd26486..4184189c53c 100644 --- a/cupy_backends/cuda/libs/_cnvrtc.pxi +++ b/cupy_backends/cuda/libs/_cnvrtc.pxi @@ -1,6 +1,9 @@ import sys as _sys -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda._softlink cimport SoftLink diff --git 
a/cupy_backends/cuda/libs/cublas.pyx b/cupy_backends/cuda/libs/cublas.pyx index 05690ad3bf5..5aea68b0412 100644 --- a/cupy_backends/cuda/libs/cublas.pyx +++ b/cupy_backends/cuda/libs/cublas.pyx @@ -4,7 +4,10 @@ cimport cython # NOQA -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module IF CUPY_USE_GEN_HIP_CODE: diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 464c59d8a00..479c19cc772 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -6,7 +6,10 @@ cimport cython # NOQA from libcpp cimport vector from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module ############################################################################### diff --git a/cupy_backends/cuda/libs/curand.pyx b/cupy_backends/cuda/libs/curand.pyx index 8f254382392..05e099032ba 100644 --- a/cupy_backends/cuda/libs/curand.pyx +++ b/cupy_backends/cuda/libs/curand.pyx @@ -3,7 +3,10 @@ """Thin wrapper of cuRAND.""" cimport cython # NOQA -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module ############################################################################### diff --git a/cupy_backends/cuda/libs/cusolver.pyx b/cupy_backends/cuda/libs/cusolver.pyx index a421fe46f46..c4f6c58ec1d 100644 --- a/cupy_backends/cuda/libs/cusolver.pyx +++ b/cupy_backends/cuda/libs/cusolver.pyx @@ -4,7 +4,10 @@ cimport cython # NOQA -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module IF CUPY_USE_GEN_HIP_CODE: diff --git a/cupy_backends/cuda/libs/cusparse.pyx b/cupy_backends/cuda/libs/cusparse.pyx index 0b7a99c7018..ad499ffe4ca 100644 --- a/cupy_backends/cuda/libs/cusparse.pyx +++ b/cupy_backends/cuda/libs/cusparse.pyx @@ -1,7 +1,10 @@ import sys as _sys # no-cython-lint cimport cython # NOQA -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda.api.runtime cimport _is_hip_environment from cupy_backends.cuda cimport stream as stream_module from cupy_backends.cuda._softlink cimport SoftLink diff --git a/cupy_backends/cuda/libs/nccl.pyx b/cupy_backends/cuda/libs/nccl.pyx index 5dd2b9c40ec..daa07aace8d 100644 --- a/cupy_backends/cuda/libs/nccl.pyx +++ b/cupy_backends/cuda/libs/nccl.pyx @@ -9,7 +9,10 @@ from libc.stdint cimport intptr_t from libcpp cimport vector from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime cdef extern from '../../cupy_nccl.h': ctypedef struct ncclComm: diff --git a/cupy_backends/cuda/libs/nvrtc.pyx b/cupy_backends/cuda/libs/nvrtc.pyx index 
2199837be36..788ee86598a 100644 --- a/cupy_backends/cuda/libs/nvrtc.pyx +++ b/cupy_backends/cuda/libs/nvrtc.pyx @@ -14,7 +14,10 @@ There are four differences compared to the original C API. cimport cython # NOQA from libcpp cimport vector -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime ############################################################################### diff --git a/cupy_backends/cuda/libs/profiler.pyx b/cupy_backends/cuda/libs/profiler.pyx index db27cedc2bb..0188a729947 100644 --- a/cupy_backends/cuda/libs/profiler.pyx +++ b/cupy_backends/cuda/libs/profiler.pyx @@ -1,7 +1,10 @@ # distutils: language = c++ """Thin wrapper of cuda profiler.""" -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime # TODO(kmaehashi): cudaProfilerInitialize is deprecated thus unsupported in diff --git a/cupy_backends/cuda/stream.pyx b/cupy_backends/cuda/stream.pyx index ce8340089a0..9bd7448ad3a 100755 --- a/cupy_backends/cuda/stream.pyx +++ b/cupy_backends/cuda/stream.pyx @@ -1,7 +1,10 @@ import os as _os import threading as _threading -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime cdef object _thread_local = _threading.local() From 792e31219adc744135573f07a2423190ec937e08 Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:15:06 -0500 Subject: [PATCH 06/12] Fix api declaration for cuda --- cupy_backends/cuda/api/runtime.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 32b0b7dea00..45623ba2d5f 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -142,7 +142,7 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment -IF CUPY_DONT_USE_GEN_HIP_CODE: +IF CUPY_DONT_USE_GEN_HIP_CODE or CUPY_CUDA_VERSION != 0: ############################################################################### # Classes ############################################################################### From beabf96464f1fdee219e77c869405f777cee51cf Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:28:35 -0500 Subject: [PATCH 07/12] Update runtime.pxd --- cupy_backends/cuda/api/runtime.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 45623ba2d5f..20b1889969e 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -142,7 +142,7 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment -IF CUPY_DONT_USE_GEN_HIP_CODE or CUPY_CUDA_VERSION != 0: +IF CUPY_CUDA_VERSION != 0 or CUPY_DONT_USE_GEN_HIP_CODE: ############################################################################### # Classes ############################################################################### From a3cefaf0fa1f77e57d148220874b6fae46218f62 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 28 Mar 2024 14:02:42 +0000 Subject: [PATCH 08/12] Define DONT_USE env var for cuda and stub --- cupy_backends/cuda/api/runtime.pxd | 2 +- 
install/cupy_builder/_command.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 20b1889969e..32b0b7dea00 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -142,7 +142,7 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment -IF CUPY_CUDA_VERSION != 0 or CUPY_DONT_USE_GEN_HIP_CODE: +IF CUPY_DONT_USE_GEN_HIP_CODE: ############################################################################### # Classes ############################################################################### diff --git a/install/cupy_builder/_command.py b/install/cupy_builder/_command.py index 5526a926d63..b31c17488d6 100644 --- a/install/cupy_builder/_command.py +++ b/install/cupy_builder/_command.py @@ -123,6 +123,7 @@ def _cythonize(self, nthreads: int) -> None: compile_time_env['CUPY_CUDA_VERSION'] = 0 compile_time_env['CUPY_HIP_VERSION'] = 0 compile_time_env['CUPY_USE_GEN_HIP_CODE'] = 0 + compile_time_env['CUPY_DONT_USE_GEN_HIP_CODE'] = 1 elif ctx.use_hip: # on ROCm/HIP compile_time_env['CUPY_CUDA_VERSION'] = 0 compile_time_env['CUPY_HIP_VERSION'] = build.get_hip_version() @@ -133,6 +134,7 @@ def _cythonize(self, nthreads: int) -> None: ctx.features['cuda'].get_version()) compile_time_env['CUPY_HIP_VERSION'] = 0 compile_time_env['CUPY_USE_GEN_HIP_CODE'] = 0 + compile_time_env['CUPY_DONT_USE_GEN_HIP_CODE'] = 1 print('Compile-time constants: ' + json.dumps(compile_time_env, indent=4)) From b91eed488bfaa0b387b4027d99e93b279791d3f4 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 28 Mar 2024 15:57:13 +0000 Subject: [PATCH 09/12] Add runtime error for missing functions --- cupy_backends/cuda/api/runtime.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cupy_backends/cuda/api/runtime.pyx b/cupy_backends/cuda/api/runtime.pyx index 34f12afecb0..06ea5bff1d1 100644 --- a/cupy_backends/cuda/api/runtime.pyx +++ b/cupy_backends/cuda/api/runtime.pyx @@ -415,6 +415,8 @@ ELSE: return value cpdef deviceSetLimit(int limit, size_t value): + if 0 < CUPY_HIP_VERSION < 50300000: + raise RuntimeError('deviceSetLimit requires ROCm 5.3+') status = cudaDeviceSetLimit(limit, value) check_status(status) @@ -1120,6 +1122,8 @@ ELSE: check_status(status) cpdef graphUpload(intptr_t graphExec, intptr_t stream): + if 0 < CUPY_HIP_VERSION < 50300000: + raise RuntimeError('graphUpload requires ROCm 5.3+') if runtimeGetVersion() < 11010: raise RuntimeError('graphUpload is supported since CUDA 11.1+') with nogil: From 126dbae8e44e15bc1d7d3618a97e073dd633210d Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 28 Mar 2024 17:58:23 +0000 Subject: [PATCH 10/12] Fix pre-commit errors --- cupy_backends/cuda/api/runtime.pxd | 90 ++++++++++++++---------------- cupy_backends/cuda/api/runtime.pyx | 87 +++++++++++------------------ 2 files changed, 74 insertions(+), 103 deletions(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 32b0b7dea00..88ff564bfd7 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -143,9 +143,9 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment IF CUPY_DONT_USE_GEN_HIP_CODE: - ############################################################################### + ########################################################################### # Classes - ############################################################################### + 
########################################################################### cdef class PointerAttributes: cdef: @@ -162,24 +162,22 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: int locationType int devId - ############################################################################### + ########################################################################### # Error handling - ############################################################################### + ########################################################################### cpdef check_status(int status) - - ############################################################################### + ########################################################################### # Initialization - ############################################################################### + ########################################################################### cpdef int driverGetVersion() except? -1 cpdef int runtimeGetVersion() except? -1 - - ############################################################################### + ########################################################################### # Device and context operations - ############################################################################### + ########################################################################### cpdef int getDevice() except? -1 cpdef int deviceGetAttribute(int attrib, int device) except? -1 @@ -197,17 +195,16 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cpdef size_t deviceGetLimit(int limit) except? -1 cpdef deviceSetLimit(int limit, size_t value) - - ############################################################################### + ########################################################################### # Memory management - ############################################################################### + ########################################################################### cpdef intptr_t malloc(size_t size) except? 0 cpdef intptr_t mallocManaged(size_t size, unsigned int flags=*) except? 0 cpdef intptr_t malloc3DArray(intptr_t desc, size_t width, size_t height, - size_t depth, unsigned int flags=*) except? 0 + size_t depth, unsigned int flags=*) except? 0 cpdef intptr_t mallocArray(intptr_t desc, size_t width, size_t height, - unsigned int flags=*) except? 0 + unsigned int flags=*) except? 0 cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 0 cpdef intptr_t mallocFromPoolAsync(size_t, intptr_t, intptr_t) except? 0 cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 
0 @@ -220,35 +217,35 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cpdef memGetInfo() cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind) cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, - intptr_t stream) + intptr_t stream) cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, - size_t size) + size_t size) cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, - intptr_t src, int srcDevice, - size_t size, intptr_t stream) + intptr_t src, int srcDevice, + size_t size, intptr_t stream) cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind) - cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind, - intptr_t stream) + size_t width, size_t height, MemoryKind kind) + cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, + size_t spitch, size_t width, size_t height, + MemoryKind kind, intptr_t stream) cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind) + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind) cpdef memcpy2DFromArrayAsync(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind, intptr_t stream) + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind, intptr_t stream) cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, size_t height, - int kind) + intptr_t src, size_t spitch, size_t width, + size_t height, int kind) cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, - size_t height, int kind, intptr_t stream) + intptr_t src, size_t spitch, size_t width, + size_t height, int kind, intptr_t stream) cpdef memcpy3D(intptr_t Memcpy3DParmsPtr) cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream) cpdef memset(intptr_t ptr, int value, size_t size) cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream) cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, - intptr_t stream) + intptr_t stream) cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device) cpdef PointerAttributes pointerGetAttributes(intptr_t ptr) cpdef intptr_t deviceGetDefaultMemPool(int) except? 0 @@ -260,20 +257,20 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cpdef memPoolGetAttribute(intptr_t, int) cpdef memPoolSetAttribute(intptr_t, int, object) - - ############################################################################### + ########################################################################### # Stream and Event - ############################################################################### + ########################################################################### cpdef intptr_t streamCreate() except? 0 cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 
0 cpdef streamDestroy(intptr_t stream) cpdef streamSynchronize(intptr_t stream) cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, - unsigned int flags=*) + unsigned int flags=*) cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg) cpdef streamQuery(intptr_t stream) - cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=*) + cpdef streamWaitEvent(intptr_t stream, intptr_t event, + unsigned int flags=*) cpdef streamBeginCapture(intptr_t stream, int mode=*) cpdef intptr_t streamEndCapture(intptr_t stream) except? 0 cpdef bint streamIsCapturing(intptr_t stream) except* @@ -285,20 +282,18 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cpdef eventRecord(intptr_t event, intptr_t stream) cpdef eventSynchronize(intptr_t event) - - ############################################################################## + ########################################################################### # util - ############################################################################## + ########################################################################### cdef _ensure_context() - - ############################################################################## + ########################################################################### # Texture - ############################################################################## + ########################################################################### cpdef uintmax_t createTextureObject( - intptr_t ResDesc, intptr_t TexDesc) except? 0 + intptr_t ResDesc, intptr_t TexDesc) except? 0 cpdef destroyTextureObject(uintmax_t texObject) cdef ChannelFormatDesc getChannelDesc(intptr_t array) except* cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t texobj) except* @@ -306,16 +301,15 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cdef Extent make_Extent(size_t w, size_t h, size_t d) except* cdef Pos make_Pos(size_t x, size_t y, size_t z) except* cdef PitchedPtr make_PitchedPtr( - intptr_t d, size_t p, size_t xsz, size_t ysz) except* + intptr_t d, size_t p, size_t xsz, size_t ysz) except* cpdef uintmax_t createSurfaceObject(intptr_t ResDesc) except? 
0 cpdef destroySurfaceObject(uintmax_t surfObject) # TODO(leofang): add cudaGetSurfaceObjectResourceDesc - - ############################################################################## + ########################################################################### # Graph - ############################################################################## + ########################################################################### cpdef graphDestroy(intptr_t graph) cpdef graphExecDestroy(intptr_t graphExec) diff --git a/cupy_backends/cuda/api/runtime.pyx b/cupy_backends/cuda/api/runtime.pyx index 06ea5bff1d1..c2988853aef 100644 --- a/cupy_backends/cuda/api/runtime.pyx +++ b/cupy_backends/cuda/api/runtime.pyx @@ -45,14 +45,12 @@ ELSE: self.locationType = locationType self.devId = devId - ########################################################################### # Thread-local storage ########################################################################### cdef object _thread_local = _threading.local() - cdef class _ThreadLocal: cdef list context_initialized @@ -69,7 +67,6 @@ ELSE: tls = _thread_local.tls = _ThreadLocal() return <_ThreadLocal>tls - ########################################################################### # Extern ########################################################################### @@ -86,7 +83,6 @@ ELSE: cdef extern from '../../cupy_backend_runtime.h' nogil: bint hip_environment - ########################################################################### # Constants ########################################################################### @@ -100,7 +96,6 @@ ELSE: deviceAttributeComputeCapabilityMajor = cudaDevAttrComputeCapabilityMajor deviceAttributeComputeCapabilityMinor = cudaDevAttrComputeCapabilityMinor - # Provide access to constants from Python. 
# TODO(kmaehashi): Deprecate aliases above so that we can just do: # from cupy_backends.cuda.api._runtime_enum import * @@ -112,10 +107,8 @@ ELSE: if not key.startswith('_'): setattr(this, key, getattr(_runtime_enum, key)) - _export_enum() - ########################################################################### # Constants (CuPy) ########################################################################### @@ -123,7 +116,6 @@ ELSE: _is_hip_environment = hip_environment # for runtime being cimport'd is_hip = hip_environment # for runtime being import'd - ########################################################################### # Error handling ########################################################################### @@ -140,7 +132,6 @@ ELSE: def __reduce__(self): return (type(self), (self.status,)) - @cython.profile(False) cpdef inline check_status(int status): if status != 0: @@ -148,7 +139,6 @@ ELSE: cudaGetLastError() raise CUDARuntimeError(status) - ########################################################################### # Initialization ########################################################################### @@ -173,7 +163,6 @@ ELSE: check_status(status) return version - ########################################################################### # Device and context operations ########################################################################### @@ -231,28 +220,34 @@ ELSE: 'memoryClockRate': props.memoryClockRate, 'memoryBusWidth': props.memoryBusWidth, 'l2CacheSize': props.l2CacheSize, - 'maxThreadsPerMultiProcessor': props.maxThreadsPerMultiProcessor, + 'maxThreadsPerMultiProcessor': ( + props.maxThreadsPerMultiProcessor), 'isMultiGpuBoard': props.isMultiGpuBoard, 'cooperativeLaunch': props.cooperativeLaunch, - 'cooperativeMultiDeviceLaunch': props.cooperativeMultiDeviceLaunch, + 'cooperativeMultiDeviceLaunch': ( + props.cooperativeMultiDeviceLaunch), } IF CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 9020: properties['deviceOverlap'] = props.deviceOverlap properties['maxTexture1DMipmap'] = props.maxTexture1DMipmap properties['maxTexture1DLinear'] = props.maxTexture1DLinear - properties['maxTexture1DLayered'] = tuple(props.maxTexture1DLayered) + properties['maxTexture1DLayered'] = tuple( + props.maxTexture1DLayered) properties['maxTexture2DMipmap'] = tuple(props.maxTexture2DMipmap) properties['maxTexture2DLinear'] = tuple(props.maxTexture2DLinear) - properties['maxTexture2DLayered'] = tuple(props.maxTexture2DLayered) + properties['maxTexture2DLayered'] = tuple( + props.maxTexture2DLayered) properties['maxTexture2DGather'] = tuple(props.maxTexture2DGather) properties['maxTexture3DAlt'] = tuple(props.maxTexture3DAlt) properties['maxTextureCubemap'] = props.maxTextureCubemap properties['maxTextureCubemapLayered'] = tuple( props.maxTextureCubemapLayered) properties['maxSurface1D'] = props.maxSurface1D - properties['maxSurface1DLayered'] = tuple(props.maxSurface1DLayered) + properties['maxSurface1DLayered'] = tuple( + props.maxSurface1DLayered) properties['maxSurface2D'] = tuple(props.maxSurface2D) - properties['maxSurface2DLayered'] = tuple(props.maxSurface2DLayered) + properties['maxSurface2DLayered'] = tuple( + props.maxSurface2DLayered) properties['maxSurface3D'] = tuple(props.maxSurface3D) properties['maxSurfaceCubemap'] = props.maxSurfaceCubemap properties['maxSurfaceCubemapLayered'] = tuple( @@ -274,7 +269,8 @@ ELSE: properties['singleToDoublePrecisionPerfRatio'] = ( props.singleToDoublePrecisionPerfRatio) properties['pageableMemoryAccess'] = 
props.pageableMemoryAccess - properties['concurrentManagedAccess'] = props.concurrentManagedAccess + properties['concurrentManagedAccess'] = ( + props.concurrentManagedAccess) properties['computePreemptionSupported'] = ( props.computePreemptionSupported) properties['canUseHostPointerForRegisteredMem'] = ( @@ -289,7 +285,8 @@ ELSE: properties['luid'] = props.luid properties['luidDeviceNodeMask'] = props.luidDeviceNodeMask if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 11000: - properties['persistingL2CacheMaxSize'] = props.persistingL2CacheMaxSize + properties['persistingL2CacheMaxSize'] = ( + props.persistingL2CacheMaxSize) properties['maxBlocksPerMultiProcessor'] = ( props.maxBlocksPerMultiProcessor) properties['accessPolicyMaxWindowSize'] = ( @@ -315,9 +312,11 @@ ELSE: cdef dict arch = {} # for hipDeviceArch_t arch['hasGlobalInt32Atomics'] = props.arch.hasGlobalInt32Atomics - arch['hasGlobalFloatAtomicExch'] = props.arch.hasGlobalFloatAtomicExch + arch['hasGlobalFloatAtomicExch'] = ( + props.arch.hasGlobalFloatAtomicExch) arch['hasSharedInt32Atomics'] = props.arch.hasSharedInt32Atomics - arch['hasSharedFloatAtomicExch'] = props.arch.hasSharedFloatAtomicExch + arch['hasSharedFloatAtomicExch'] = ( + props.arch.hasSharedFloatAtomicExch) arch['hasFloatAtomicAdd'] = props.arch.hasFloatAtomicAdd arch['hasGlobalInt64Atomics'] = props.arch.hasGlobalInt64Atomics arch['hasSharedInt64Atomics'] = props.arch.hasSharedInt64Atomics @@ -340,7 +339,8 @@ ELSE: properties['managedMemory'] = props.managedMemory properties['directManagedMemAccessFromHost'] = ( props.directManagedMemAccessFromHost) - properties['concurrentManagedAccess'] = props.concurrentManagedAccess + properties['concurrentManagedAccess'] = ( + props.concurrentManagedAccess) properties['pageableMemoryAccess'] = props.pageableMemoryAccess properties['pageableMemoryAccessUsesHostPageTables'] = ( props.pageableMemoryAccessUsesHostPageTables) @@ -420,7 +420,6 @@ ELSE: status = cudaDeviceSetLimit(limit, value) check_status(status) - ########################################################################### # IPC operations ########################################################################### @@ -471,7 +470,6 @@ ELSE: check_status(status) return devPtr - ########################################################################### # Memory management ########################################################################### @@ -624,7 +622,7 @@ ELSE: MemoryKind kind, intptr_t stream): with nogil: status = cudaMemcpy2DAsync(dst, dpitch, src, spitch, - width, height, kind, + width, height, kind, stream) check_status(status) @@ -711,14 +709,14 @@ ELSE: check_status(status) IF CUPY_HIP_VERSION >= 60000000: if attrs.type == 0: # hipMemoryTypeHost - attrs.type = 1 # cudaMemoryTypeHost + attrs.type = 1 # cudaMemoryTypeHost elif attrs.type == 1: # hipMemoryTypeDevice - attrs.type = 2 # cudaMemoryTypeDevice + attrs.type = 2 # cudaMemoryTypeDevice ELIF CUPY_HIP_VERSION > 0: if attrs.memoryType == 0: # hipMemoryTypeHost - attrs.memoryType = 1 # cudaMemoryTypeHost - elif attrs.memoryType == 1: # hipMemoryTypeDevice - attrs.memoryType = 2 # cudaMemoryTypeDevice + attrs.memoryType = 1 # cudaMemoryTypeHost + elif attrs.memoryType == 1: # hipMemoryTypeDevice + attrs.memoryType = 2 # cudaMemoryTypeDevice IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 60000000: return PointerAttributes( attrs.device, @@ -848,7 +846,6 @@ ELSE: out) check_status(status) - ########################################################################### # Stream and Event 
########################################################################### @@ -859,25 +856,21 @@ ELSE: check_status(status) return stream - cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 0: cdef driver.Stream stream status = cudaStreamCreateWithFlags(&stream, flags) check_status(status) return stream - cpdef streamDestroy(intptr_t stream): status = cudaStreamDestroy(stream) check_status(status) - cpdef streamSynchronize(intptr_t stream): with nogil: status = cudaStreamSynchronize(stream) check_status(status) - cdef _streamCallbackFunc(driver.Stream hStream, int status, void* func_arg) with gil: obj = func_arg @@ -885,14 +878,12 @@ ELSE: func(hStream, status, arg) cpython.Py_DECREF(obj) - cdef _HostFnFunc(void* func_arg) with gil: obj = func_arg func, arg = obj func(arg) cpython.Py_DECREF(obj) - cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, unsigned int flags=0): if _is_hip_environment and stream == 0: @@ -906,10 +897,10 @@ ELSE: func_arg, flags) check_status(status) - cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg): if 0 < CUPY_HIP_VERSION < 50200000: - raise RuntimeError('This feature is supported on HIP since ROCm 5.2') + raise RuntimeError('This feature is supported on ' + 'HIP since ROCm 5.2') func_arg = (callback, arg) cpython.Py_INCREF(func_arg) @@ -919,11 +910,9 @@ ELSE: func_arg) check_status(status) - cpdef streamQuery(intptr_t stream): return cudaStreamQuery(stream) - cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=0): with nogil: @@ -931,7 +920,6 @@ ELSE: event, flags) check_status(status) - cpdef streamBeginCapture(intptr_t stream, int mode=streamCaptureModeRelaxed): if 0 < CUPY_HIP_VERSION < 40300000: @@ -942,7 +930,6 @@ ELSE: mode) check_status(status) - cpdef intptr_t streamEndCapture(intptr_t stream) except? 0: # TODO(leofang): check and raise if stream == 0? cdef Graph g @@ -953,21 +940,19 @@ ELSE: check_status(status) return g - cpdef bint streamIsCapturing(intptr_t stream) except*: cdef StreamCaptureStatus s if 0 < CUPY_HIP_VERSION < 50000000: raise RuntimeError('streamIsCapturing is not supported in ROCm') with nogil: status = cudaStreamIsCapturing(stream, &s) - check_status(status) # cudaErrorStreamCaptureImplicit could be - # raised here + # cudaErrorStreamCaptureImplicit could be raised here + check_status(status) if s == streamCaptureStatusInvalidated: raise RuntimeError('the stream was capturing, but an error has ' 'invalidated the capture sequence') return s - cpdef intptr_t eventCreate() except? 0: cdef driver.Event event status = cudaEventCreate(&event) @@ -980,12 +965,10 @@ ELSE: check_status(status) return event - cpdef eventDestroy(intptr_t event): status = cudaEventDestroy(event) check_status(status) - cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 
0: cdef float ms status = cudaEventElapsedTime(&ms, start, @@ -993,22 +976,18 @@ ELSE: check_status(status) return ms - cpdef eventQuery(intptr_t event): return cudaEventQuery(event) - cpdef eventRecord(intptr_t event, intptr_t stream): status = cudaEventRecord(event, stream) check_status(status) - cpdef eventSynchronize(intptr_t event): with nogil: status = cudaEventSynchronize(event) check_status(status) - ########################################################################### # util ########################################################################### @@ -1025,7 +1004,6 @@ ELSE: memGetInfo() tls.context_initialized[dev] = True - ########################################################################### # Texture ########################################################################### @@ -1091,7 +1069,6 @@ ELSE: intptr_t d, size_t p, size_t xsz, size_t ysz) except*: return make_cudaPitchedPtr(d, p, xsz, ysz) - ########################################################################### # Graph ########################################################################### From 4224f3291d6daa1a904d15249ecee9cd2e9538bd Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 10 Apr 2024 18:00:31 +0000 Subject: [PATCH 11/12] Remove dont_use to use mapping in custom json --- cupy_backends/cuda/api/runtime.pxd | 2 +- install/amd_build/rocm_custom_mapping.json | 3 +-- install/cupy_builder/_command.py | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 88ff564bfd7..3dd3d45356a 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -142,7 +142,7 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment -IF CUPY_DONT_USE_GEN_HIP_CODE: +IF not CUPY_USE_GEN_HIP_CODE: ########################################################################### # Classes ########################################################################### diff --git a/install/amd_build/rocm_custom_mapping.json b/install/amd_build/rocm_custom_mapping.json index 4ac4e0b978e..728ed421b85 100644 --- a/install/amd_build/rocm_custom_mapping.json +++ b/install/amd_build/rocm_custom_mapping.json @@ -1,6 +1,5 @@ { "custom_map": { - "CUPY_USE_GEN_HIP_CODE" : "CUPY_DONT_USE_GEN_HIP_CODE", - "CUPY_DONT_USE_GEN_HIP_CODE" : "CUPY_USE_GEN_HIP_CODE" + "CUPY_USE_GEN_HIP_CODE" : "CUPY_DONT_USE_GEN_HIP_CODE" } } diff --git a/install/cupy_builder/_command.py b/install/cupy_builder/_command.py index b31c17488d6..5526a926d63 100644 --- a/install/cupy_builder/_command.py +++ b/install/cupy_builder/_command.py @@ -123,7 +123,6 @@ def _cythonize(self, nthreads: int) -> None: compile_time_env['CUPY_CUDA_VERSION'] = 0 compile_time_env['CUPY_HIP_VERSION'] = 0 compile_time_env['CUPY_USE_GEN_HIP_CODE'] = 0 - compile_time_env['CUPY_DONT_USE_GEN_HIP_CODE'] = 1 elif ctx.use_hip: # on ROCm/HIP compile_time_env['CUPY_CUDA_VERSION'] = 0 compile_time_env['CUPY_HIP_VERSION'] = build.get_hip_version() @@ -134,7 +133,6 @@ def _cythonize(self, nthreads: int) -> None: ctx.features['cuda'].get_version()) compile_time_env['CUPY_HIP_VERSION'] = 0 compile_time_env['CUPY_USE_GEN_HIP_CODE'] = 0 - compile_time_env['CUPY_DONT_USE_GEN_HIP_CODE'] = 1 print('Compile-time constants: ' + json.dumps(compile_time_env, indent=4)) From 8d7cc141af520e89bf42e282e1d7ecf19ee3bfd5 Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Wed, 10 Apr 2024 13:17:57 -0500 Subject: [PATCH 12/12] Move 
runtime_hip to HIP list

---
 install/cupy_builder/_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py
index 75a79fe320c..f2f91cdf258 100644
--- a/install/cupy_builder/_features.py
+++ b/install/cupy_builder/_features.py
@@ -96,7 +96,6 @@ def _from_dict(d: Dict[str, Any], ctx: Context) -> Feature:
     'cupy_backends.cuda.api._driver_enum',
     'cupy_backends.cuda.api.runtime',
     'cupy_backends.cuda.api._runtime_enum',
-    'cupy_backends.cuda.api.runtime_hip',
     'cupy_backends.cuda.libs.cublas',
     'cupy_backends.cuda.libs.curand',
     'cupy_backends.cuda.libs.cusparse',
@@ -159,6 +158,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]:
         'name': 'cuda',
         'required': True,
         'file': _cuda_files + [
+            'cupy_backends.cuda.api.runtime_hip',
             'cupy_backends.cuda.libs.nvtx',
             'cupy_backends.cuda.libs.cusolver',
             'cupy_backends.cuda.libs.cusolver_hip',
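The `IF` blocks introduced throughout this series depend on compile-time constants that `install/cupy_builder/_command.py` assembles (see PATCH 08 and PATCH 11). A simplified, hypothetical sketch of that plumbing is shown below; the literal values and the direct `cythonize()` call are illustrative only, since the real builder derives them from the detected CUDA or ROCm toolchain.

```python
# Hypothetical sketch: how the compile-time constants reach the IF branches.
from Cython.Build import cythonize

compile_time_env = {
    'CUPY_CUDA_VERSION': 0,        # 0 on ROCm/HIP builds
    'CUPY_HIP_VERSION': 60000000,  # e.g. ROCm 6.0; 0 on CUDA and stub builds
    'CUPY_USE_GEN_HIP_CODE': 1,    # select the generated runtime_hip bindings
}

# Cython evaluates statements such as `IF CUPY_HIP_VERSION > 0:` and
# `IF not CUPY_USE_GEN_HIP_CODE:` against this environment while generating
# C++, so the unused backend never appears in the compiled extensions.
extensions = cythonize(
    ['cupy/_core/core.pyx'],
    compile_time_env=compile_time_env,
)
```

After PATCH 11 the CUDA and stub builds no longer define `CUPY_DONT_USE_GEN_HIP_CODE`; `runtime.pxd` tests `IF not CUPY_USE_GEN_HIP_CODE:` instead, so a single flag is sufficient, and `rocm_custom_mapping.json` retains only the one-way `CUPY_USE_GEN_HIP_CODE` to `CUPY_DONT_USE_GEN_HIP_CODE` entry.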