From 60e9cb89cabb36d507bdeea199c97a11f41787dd Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:35:54 +0000 Subject: [PATCH 01/12] Enable runtime hipification --- cupy_backends/cuda/api/runtime.pyx | 2106 +++++++++++++------------- cupy_backends/hip/cupy_hip_runtime.h | 493 +----- install/cupy_builder/_features.py | 1 + 3 files changed, 1072 insertions(+), 1528 deletions(-) diff --git a/cupy_backends/cuda/api/runtime.pyx b/cupy_backends/cuda/api/runtime.pyx index 63b11edc8c5..34f12afecb0 100644 --- a/cupy_backends/cuda/api/runtime.pyx +++ b/cupy_backends/cuda/api/runtime.pyx @@ -20,1079 +20,1109 @@ cimport cython # NOQA from cupy_backends.cuda.api cimport driver # NOQA from cupy_backends.cuda.libs cimport nvrtc # no-cython-lint +IF CUPY_USE_GEN_HIP_CODE: + from cupy_backends.cuda.api.runtime_hip import * +ELSE: + ########################################################################### + # Classes + ########################################################################### -############################################################################### -# Classes -############################################################################### + cdef class PointerAttributes: -cdef class PointerAttributes: + def __init__(self, int device, intptr_t devicePointer, + intptr_t hostPointer, int type=-1): + self.type = type + self.device = device + self.devicePointer = devicePointer + self.hostPointer = hostPointer - def __init__(self, int device, intptr_t devicePointer, - intptr_t hostPointer, int type=-1): - self.type = type - self.device = device - self.devicePointer = devicePointer - self.hostPointer = hostPointer + cdef class MemPoolProps: -cdef class MemPoolProps: + def __init__(self, int allocType, int handleType, + int locationType, int devId): + self.allocType = allocType + self.handleType = handleType + self.locationType = locationType + self.devId = devId - def __init__( - self, int allocType, int handleType, int locationType, int devId): - self.allocType = allocType - self.handleType = handleType - self.locationType = locationType - self.devId = devId + ########################################################################### + # Thread-local storage + ########################################################################### -############################################################################### -# Thread-local storage -############################################################################### + cdef object _thread_local = _threading.local() -cdef object _thread_local = _threading.local() + cdef class _ThreadLocal: -cdef class _ThreadLocal: + cdef list context_initialized - cdef list context_initialized + def __init__(self): + cdef int i + self.context_initialized = [False for i in range(getDeviceCount())] - def __init__(self): - cdef int i - self.context_initialized = [False for i in range(getDeviceCount())] + @staticmethod + cdef _ThreadLocal get(): + try: + tls = _thread_local.tls + except AttributeError: + tls = _thread_local.tls = _ThreadLocal() + return <_ThreadLocal>tls - @staticmethod - cdef _ThreadLocal get(): - try: - tls = _thread_local.tls - except AttributeError: - tls = _thread_local.tls = _ThreadLocal() - return <_ThreadLocal>tls + ########################################################################### + # Extern + ########################################################################### -############################################################################### -# Extern 
-############################################################################### + IF CUPY_USE_CUDA_PYTHON: + from cuda.ccudart cimport * + ELSE: + IF CUPY_HIP_VERSION > 0: + include '_runtime_extern_hip.pxi' + ELSE: + include '_runtime_extern.pxi' + pass # for cython-lint -IF CUPY_USE_CUDA_PYTHON: - from cuda.ccudart cimport * -ELSE: - include '_runtime_extern.pxi' - pass # for cython-lint + cdef extern from '../../cupy_backend_runtime.h' nogil: + bint hip_environment -cdef extern from '../../cupy_backend_runtime.h' nogil: - bint hip_environment + ########################################################################### + # Constants + ########################################################################### -############################################################################### -# Constants -############################################################################### + # TODO(kmaehashi): Deprecate these aliases and use `cuda*`. + errorInvalidValue = cudaErrorInvalidValue + errorMemoryAllocation = cudaErrorMemoryAllocation + errorPeerAccessAlreadyEnabled = cudaErrorPeerAccessAlreadyEnabled + errorContextIsDestroyed = cudaErrorContextIsDestroyed + errorInvalidResourceHandle = cudaErrorInvalidResourceHandle + deviceAttributeComputeCapabilityMajor = cudaDevAttrComputeCapabilityMajor + deviceAttributeComputeCapabilityMinor = cudaDevAttrComputeCapabilityMinor -# TODO(kmaehashi): Deprecate these aliases and use `cuda*`. -errorInvalidValue = cudaErrorInvalidValue -errorMemoryAllocation = cudaErrorMemoryAllocation -errorPeerAccessAlreadyEnabled = cudaErrorPeerAccessAlreadyEnabled -errorContextIsDestroyed = cudaErrorContextIsDestroyed -errorInvalidResourceHandle = cudaErrorInvalidResourceHandle -deviceAttributeComputeCapabilityMajor = cudaDevAttrComputeCapabilityMajor -deviceAttributeComputeCapabilityMinor = cudaDevAttrComputeCapabilityMinor + # Provide access to constants from Python. + # TODO(kmaehashi): Deprecate aliases above so that we can just do: + # from cupy_backends.cuda.api._runtime_enum import * + def _export_enum(): + import sys + import cupy_backends.cuda.api._runtime_enum as _runtime_enum + this = sys.modules[__name__] + for key in dir(_runtime_enum): + if not key.startswith('_'): + setattr(this, key, getattr(_runtime_enum, key)) -# Provide access to constants from Python. 
-# TODO(kmaehashi): Deprecate aliases above so that we can just do: -# from cupy_backends.cuda.api._runtime_enum import * -def _export_enum(): - import sys - import cupy_backends.cuda.api._runtime_enum as _runtime_enum - this = sys.modules[__name__] - for key in dir(_runtime_enum): - if not key.startswith('_'): - setattr(this, key, getattr(_runtime_enum, key)) + _export_enum() -_export_enum() + ########################################################################### + # Constants (CuPy) + ########################################################################### -############################################################################### -# Constants (CuPy) -############################################################################### + _is_hip_environment = hip_environment # for runtime being cimport'd + is_hip = hip_environment # for runtime being import'd -_is_hip_environment = hip_environment # for runtime being cimport'd -is_hip = hip_environment # for runtime being import'd + ########################################################################### + # Error handling + ########################################################################### -############################################################################### -# Error handling -############################################################################### + class CUDARuntimeError(RuntimeError): -class CUDARuntimeError(RuntimeError): + def __init__(self, status): + self.status = status + cdef bytes name = cudaGetErrorName(status) + cdef bytes msg = cudaGetErrorString(status) + super(CUDARuntimeError, self).__init__( + '%s: %s' % (name.decode(), msg.decode())) - def __init__(self, status): - self.status = status - cdef bytes name = cudaGetErrorName(status) - cdef bytes msg = cudaGetErrorString(status) - super(CUDARuntimeError, self).__init__( - '%s: %s' % (name.decode(), msg.decode())) + def __reduce__(self): + return (type(self), (self.status,)) - def __reduce__(self): - return (type(self), (self.status,)) + @cython.profile(False) + cpdef inline check_status(int status): + if status != 0: + # to reset error status + cudaGetLastError() + raise CUDARuntimeError(status) -@cython.profile(False) -cpdef inline check_status(int status): - if status != 0: - # to reset error status - cudaGetLastError() - raise CUDARuntimeError(status) + ########################################################################### + # Initialization + ########################################################################### -############################################################################### -# Initialization -############################################################################### + cpdef int driverGetVersion() except? -1: + cdef int version + status = cudaDriverGetVersion(&version) + check_status(status) + return version + + cpdef int runtimeGetVersion() except? -1: + cdef int version + IF CUPY_USE_CUDA_PYTHON: + # Workarounds an issue that cuda-python returns its version instead + # of the real runtime version. + # https://github.com/NVIDIA/cuda-python/issues/16 + cdef int major, minor + (major, minor) = nvrtc.getVersion() + version = major * 1000 + minor * 10 + ELSE: + status = cudaRuntimeGetVersion(&version) + check_status(status) + return version + + + ########################################################################### + # Device and context operations + ########################################################################### + + cpdef int getDevice() except? 
-1: + cdef int device + status = cudaGetDevice(&device) + check_status(status) + return device -cpdef int driverGetVersion() except? -1: - cdef int version - status = cudaDriverGetVersion(&version) - check_status(status) - return version + cpdef int deviceGetAttribute(int attrib, int device) except? -1: + cdef int ret + status = cudaDeviceGetAttribute(&ret, attrib, device) + check_status(status) + return ret -cpdef int runtimeGetVersion() except? -1: - cdef int version - IF CUPY_USE_CUDA_PYTHON: - # Workarounds an issue that cuda-python returns its version instead of - # the real runtime version. - # https://github.com/NVIDIA/cuda-python/issues/16 - cdef int major, minor - (major, minor) = nvrtc.getVersion() - version = major * 1000 + minor * 10 - ELSE: - status = cudaRuntimeGetVersion(&version) - check_status(status) - return version - - -############################################################################### -# Device and context operations -############################################################################### - -cpdef int getDevice() except? -1: - cdef int device - status = cudaGetDevice(&device) - check_status(status) - return device - -cpdef int deviceGetAttribute(int attrib, int device) except? -1: - cdef int ret - status = cudaDeviceGetAttribute(&ret, attrib, device) - check_status(status) - return ret - -cpdef getDeviceProperties(int device): - cdef DeviceProp props - cdef int status = cudaGetDeviceProperties(&props, device) - check_status(status) - - cdef dict properties = {'name': b'UNAVAILABLE'} # for RTD - - # Common properties to CUDA 9.0, 9.2, 10.x, 11.x, and HIP - IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 0: - properties = { - 'name': props.name, - 'totalGlobalMem': props.totalGlobalMem, - 'sharedMemPerBlock': props.sharedMemPerBlock, - 'regsPerBlock': props.regsPerBlock, - 'warpSize': props.warpSize, - 'maxThreadsPerBlock': props.maxThreadsPerBlock, - 'maxThreadsDim': tuple(props.maxThreadsDim), - 'maxGridSize': tuple(props.maxGridSize), - 'clockRate': props.clockRate, - 'totalConstMem': props.totalConstMem, - 'major': props.major, - 'minor': props.minor, - 'textureAlignment': props.textureAlignment, - 'texturePitchAlignment': props.texturePitchAlignment, - 'multiProcessorCount': props.multiProcessorCount, - 'kernelExecTimeoutEnabled': props.kernelExecTimeoutEnabled, - 'integrated': props.integrated, - 'canMapHostMemory': props.canMapHostMemory, - 'computeMode': props.computeMode, - 'maxTexture1D': props.maxTexture1D, - 'maxTexture2D': tuple(props.maxTexture2D), - 'maxTexture3D': tuple(props.maxTexture3D), - 'concurrentKernels': props.concurrentKernels, - 'ECCEnabled': props.ECCEnabled, - 'pciBusID': props.pciBusID, - 'pciDeviceID': props.pciDeviceID, - 'pciDomainID': props.pciDomainID, - 'tccDriver': props.tccDriver, - 'memoryClockRate': props.memoryClockRate, - 'memoryBusWidth': props.memoryBusWidth, - 'l2CacheSize': props.l2CacheSize, - 'maxThreadsPerMultiProcessor': props.maxThreadsPerMultiProcessor, - 'isMultiGpuBoard': props.isMultiGpuBoard, - 'cooperativeLaunch': props.cooperativeLaunch, - 'cooperativeMultiDeviceLaunch': props.cooperativeMultiDeviceLaunch, - } - IF CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 9020: - properties['deviceOverlap'] = props.deviceOverlap - properties['maxTexture1DMipmap'] = props.maxTexture1DMipmap - properties['maxTexture1DLinear'] = props.maxTexture1DLinear - properties['maxTexture1DLayered'] = tuple(props.maxTexture1DLayered) - properties['maxTexture2DMipmap'] = tuple(props.maxTexture2DMipmap) - 
properties['maxTexture2DLinear'] = tuple(props.maxTexture2DLinear) - properties['maxTexture2DLayered'] = tuple(props.maxTexture2DLayered) - properties['maxTexture2DGather'] = tuple(props.maxTexture2DGather) - properties['maxTexture3DAlt'] = tuple(props.maxTexture3DAlt) - properties['maxTextureCubemap'] = props.maxTextureCubemap - properties['maxTextureCubemapLayered'] = tuple( - props.maxTextureCubemapLayered) - properties['maxSurface1D'] = props.maxSurface1D - properties['maxSurface1DLayered'] = tuple(props.maxSurface1DLayered) - properties['maxSurface2D'] = tuple(props.maxSurface2D) - properties['maxSurface2DLayered'] = tuple(props.maxSurface2DLayered) - properties['maxSurface3D'] = tuple(props.maxSurface3D) - properties['maxSurfaceCubemap'] = props.maxSurfaceCubemap - properties['maxSurfaceCubemapLayered'] = tuple( - props.maxSurfaceCubemapLayered) - properties['surfaceAlignment'] = props.surfaceAlignment - properties['asyncEngineCount'] = props.asyncEngineCount - properties['unifiedAddressing'] = props.unifiedAddressing - properties['streamPrioritiesSupported'] = ( - props.streamPrioritiesSupported) - properties['globalL1CacheSupported'] = props.globalL1CacheSupported - properties['localL1CacheSupported'] = props.localL1CacheSupported - properties['sharedMemPerMultiprocessor'] = ( - props.sharedMemPerMultiprocessor) - properties['regsPerMultiprocessor'] = props.regsPerMultiprocessor - properties['managedMemory'] = props.managedMemory - properties['multiGpuBoardGroupID'] = props.multiGpuBoardGroupID - properties['hostNativeAtomicSupported'] = ( - props.hostNativeAtomicSupported) - properties['singleToDoublePrecisionPerfRatio'] = ( - props.singleToDoublePrecisionPerfRatio) - properties['pageableMemoryAccess'] = props.pageableMemoryAccess - properties['concurrentManagedAccess'] = props.concurrentManagedAccess - properties['computePreemptionSupported'] = ( - props.computePreemptionSupported) - properties['canUseHostPointerForRegisteredMem'] = ( - props.canUseHostPointerForRegisteredMem) - properties['sharedMemPerBlockOptin'] = props.sharedMemPerBlockOptin - properties['pageableMemoryAccessUsesHostPageTables'] = ( - props.pageableMemoryAccessUsesHostPageTables) - properties['directManagedMemAccessFromHost'] = ( - props.directManagedMemAccessFromHost) - if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >=10000: - properties['uuid'] = props.uuid.bytes - properties['luid'] = props.luid - properties['luidDeviceNodeMask'] = props.luidDeviceNodeMask - if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 11000: - properties['persistingL2CacheMaxSize'] = props.persistingL2CacheMaxSize - properties['maxBlocksPerMultiProcessor'] = ( - props.maxBlocksPerMultiProcessor) - properties['accessPolicyMaxWindowSize'] = ( - props.accessPolicyMaxWindowSize) - properties['reservedSharedMemPerBlock'] = ( - props.reservedSharedMemPerBlock) - IF CUPY_HIP_VERSION > 0: # HIP-only props - properties['clockInstructionRate'] = props.clockInstructionRate - properties['maxSharedMemoryPerMultiProcessor'] = ( - props.maxSharedMemoryPerMultiProcessor) - properties['hdpMemFlushCntl'] = (props.hdpMemFlushCntl) - properties['hdpRegFlushCntl'] = (props.hdpRegFlushCntl) - properties['memPitch'] = props.memPitch - properties['cooperativeMultiDeviceUnmatchedFunc'] = ( - props.cooperativeMultiDeviceUnmatchedFunc) - properties['cooperativeMultiDeviceUnmatchedGridDim'] = ( - props.cooperativeMultiDeviceUnmatchedGridDim) - properties['cooperativeMultiDeviceUnmatchedBlockDim'] = ( - props.cooperativeMultiDeviceUnmatchedBlockDim) - 
properties['cooperativeMultiDeviceUnmatchedSharedMem'] = ( - props.cooperativeMultiDeviceUnmatchedSharedMem) - properties['isLargeBar'] = props.isLargeBar - - cdef dict arch = {} # for hipDeviceArch_t - arch['hasGlobalInt32Atomics'] = props.arch.hasGlobalInt32Atomics - arch['hasGlobalFloatAtomicExch'] = props.arch.hasGlobalFloatAtomicExch - arch['hasSharedInt32Atomics'] = props.arch.hasSharedInt32Atomics - arch['hasSharedFloatAtomicExch'] = props.arch.hasSharedFloatAtomicExch - arch['hasFloatAtomicAdd'] = props.arch.hasFloatAtomicAdd - arch['hasGlobalInt64Atomics'] = props.arch.hasGlobalInt64Atomics - arch['hasSharedInt64Atomics'] = props.arch.hasSharedInt64Atomics - arch['hasDoubles'] = props.arch.hasDoubles - arch['hasWarpVote'] = props.arch.hasWarpVote - arch['hasWarpBallot'] = props.arch.hasWarpBallot - arch['hasWarpShuffle'] = props.arch.hasWarpShuffle - arch['hasFunnelShift'] = props.arch.hasFunnelShift - arch['hasThreadFenceSystem'] = props.arch.hasThreadFenceSystem - arch['hasSyncThreadsExt'] = props.arch.hasSyncThreadsExt - arch['hasSurfaceFuncs'] = props.arch.hasSurfaceFuncs - arch['has3dGrid'] = props.arch.has3dGrid - arch['hasDynamicParallelism'] = props.arch.hasDynamicParallelism - properties['arch'] = arch - IF 0 < CUPY_HIP_VERSION < 310: # gcnArchName used after ROCm 3.1+ - properties['gcnArch'] = props.gcnArch - IF CUPY_HIP_VERSION >= 310: - properties['gcnArchName'] = props.gcnArchName - properties['asicRevision'] = props.asicRevision - properties['managedMemory'] = props.managedMemory - properties['directManagedMemAccessFromHost'] = ( - props.directManagedMemAccessFromHost) - properties['concurrentManagedAccess'] = props.concurrentManagedAccess - properties['pageableMemoryAccess'] = props.pageableMemoryAccess - properties['pageableMemoryAccessUsesHostPageTables'] = ( - props.pageableMemoryAccessUsesHostPageTables) - return properties - -cpdef int deviceGetByPCIBusId(str pci_bus_id) except? -1: - # Encode the python string before passing to native code - byte_pci_bus_id = pci_bus_id.encode('ascii') - cdef const char* c_pci_bus_id = byte_pci_bus_id - - cdef int device = -1 - cdef int status - status = cudaDeviceGetByPCIBusId(&device, c_pci_bus_id) - check_status(status) - # on ROCm, it might fail silently, so we also need to check if the - # device is meaningful or not - if hip_environment and device == -1: - check_status(cudaErrorInvalidValue) - return device - -cpdef str deviceGetPCIBusId(int device): - # The PCI Bus ID string must be able to store 13 characters including the - # NULL-terminator according to the CUDA documentation. - # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html - cdef char pci_bus_id[13] - status = cudaDeviceGetPCIBusId(pci_bus_id, 13, device) - check_status(status) - return pci_bus_id.decode('ascii') - -cpdef int getDeviceCount() except? -1: - cdef int count - status = cudaGetDeviceCount(&count) - check_status(status) - return count - -cpdef setDevice(int device): - status = cudaSetDevice(device) - check_status(status) - -cpdef deviceSynchronize(): - with nogil: - status = cudaDeviceSynchronize() - check_status(status) - -cpdef int deviceCanAccessPeer(int device, int peerDevice) except? 
-1: - cdef int ret - status = cudaDeviceCanAccessPeer(&ret, device, peerDevice) - check_status(status) - return ret - -cpdef deviceEnablePeerAccess(int peerDevice): - status = cudaDeviceEnablePeerAccess(peerDevice, 0) - check_status(status) - -cpdef deviceDisablePeerAccess(int peerDevice): - status = cudaDeviceDisablePeerAccess(peerDevice) - check_status(status) - -cpdef _deviceEnsurePeerAccess(int peerDevice): - status = cudaDeviceEnablePeerAccess(peerDevice, 0) - if status == 0: - return - elif status == errorPeerAccessAlreadyEnabled: - cudaGetLastError() # clear error status - return - check_status(status) - -cpdef size_t deviceGetLimit(int limit) except? -1: - cdef size_t value - status = cudaDeviceGetLimit(&value, limit) - check_status(status) - return value - -cpdef deviceSetLimit(int limit, size_t value): - status = cudaDeviceSetLimit(limit, value) - check_status(status) - - -############################################################################### -# IPC operations -############################################################################### - -cpdef ipcCloseMemHandle(intptr_t devPtr): - status = cudaIpcCloseMemHandle(devPtr) - check_status(status) - -cpdef ipcGetEventHandle(intptr_t event): - cdef IpcEventHandle handle - status = cudaIpcGetEventHandle(&handle, event) - check_status(status) - # We need to do this due to a bug in Cython that - # cuts out the 0 bytes in an array of chars when - # constructing the python object - # resulting in different sizes assignment errors - # when recreating the struct from the python - # array of bytes - reserved = [handle.reserved[i] for i in range(64)] - return bytes(reserved) - -cpdef ipcGetMemHandle(intptr_t devPtr): - cdef IpcMemHandle handle - status = cudaIpcGetMemHandle(&handle, devPtr) - check_status(status) - # We need to do this due to a bug in Cython that - # when converting an array of chars in C to a python object - # it discards the data after the first 0 value - # resulting in a loss of data, as this is not a string - # but a buffer of bytes - reserved = [handle.reserved[i] for i in range(64)] - return bytes(reserved) - -cpdef ipcOpenEventHandle(bytes handle): - cdef driver.Event event - cdef IpcEventHandle handle_ - handle_.reserved = handle - status = cudaIpcOpenEventHandle(&event, handle_) - check_status(status) - return event - -cpdef ipcOpenMemHandle(bytes handle, - unsigned int flags=cudaIpcMemLazyEnablePeerAccess): - cdef void* devPtr - cdef IpcMemHandle handle_ - handle_.reserved = handle - status = cudaIpcOpenMemHandle(&devPtr, handle_, flags) - check_status(status) - return devPtr - - -############################################################################### -# Memory management -############################################################################### - -cpdef intptr_t malloc(size_t size) except? 0: - cdef void* ptr - with nogil: - status = cudaMalloc(&ptr, size) - check_status(status) - return ptr - -cpdef intptr_t mallocManaged( - size_t size, unsigned int flags=cudaMemAttachGlobal) except? 0: - if 0 < CUPY_HIP_VERSION < 40300000: - raise RuntimeError('Managed memory requires ROCm 4.3+') - cdef void* ptr - with nogil: - status = cudaMallocManaged(&ptr, size, flags) - check_status(status) - return ptr - -cpdef intptr_t malloc3DArray(intptr_t descPtr, size_t width, size_t height, - size_t depth, unsigned int flags=0) except? 
0: - cdef Array ptr - cdef Extent extent = make_cudaExtent(width, height, depth) - with nogil: - status = cudaMalloc3DArray(&ptr, descPtr, extent, - flags) - check_status(status) - return ptr - -cpdef intptr_t mallocArray(intptr_t descPtr, size_t width, size_t height, - unsigned int flags=0) except? 0: - cdef Array ptr - with nogil: - status = cudaMallocArray(&ptr, descPtr, width, - height, flags) - check_status(status) - return ptr - -cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 0: - cdef void* ptr - if _is_hip_environment: - raise RuntimeError('HIP does not support mallocAsync') - if runtimeGetVersion() < 11020: - raise RuntimeError('mallocAsync is supported since CUDA 11.2') - with nogil: - status = cudaMallocAsync(&ptr, size, stream) - check_status(status) - return ptr - -cpdef intptr_t mallocFromPoolAsync( - size_t size, intptr_t pool, intptr_t stream) except? 0: - cdef void* ptr - if _is_hip_environment: - raise RuntimeError('HIP does not support mallocFromPoolAsync') - if runtimeGetVersion() < 11020: - raise RuntimeError('mallocFromPoolAsync is supported since CUDA 11.2') - with nogil: - status = cudaMallocFromPoolAsync( - &ptr, size, pool, stream) - check_status(status) - return ptr - -cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 0: - cdef void* ptr - with nogil: - status = cudaHostAlloc(&ptr, size, flags) - check_status(status) - return ptr - -cpdef hostRegister(intptr_t ptr, size_t size, unsigned int flags): - with nogil: - status = cudaHostRegister(ptr, size, flags) - check_status(status) - -cpdef hostUnregister(intptr_t ptr): - with nogil: - status = cudaHostUnregister(ptr) - check_status(status) - -cpdef free(intptr_t ptr): - with nogil: - status = cudaFree(ptr) - check_status(status) - -cpdef freeHost(intptr_t ptr): - with nogil: - status = cudaFreeHost(ptr) - check_status(status) - -cpdef freeArray(intptr_t ptr): - with nogil: - status = cudaFreeArray(ptr) - check_status(status) - -cpdef freeAsync(intptr_t ptr, intptr_t stream): - if _is_hip_environment: - raise RuntimeError('HIP does not support freeAsync') - if runtimeGetVersion() < 11020: - raise RuntimeError('freeAsync is supported since CUDA 11.2') - with nogil: - status = cudaFreeAsync(ptr, stream) - check_status(status) - -cpdef memGetInfo(): - cdef size_t free, total - status = cudaMemGetInfo(&free, &total) - check_status(status) - return free, total - -cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind): - with nogil: - status = cudaMemcpy(dst, src, size, kind) - check_status(status) - -cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, - intptr_t stream): - with nogil: - status = cudaMemcpyAsync( - dst, src, size, kind, - stream) - check_status(status) - -cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, - size_t size): - with nogil: - status = cudaMemcpyPeer(dst, dstDevice, src, srcDevice, - size) - check_status(status) - -cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, - size_t size, intptr_t stream): - with nogil: - status = cudaMemcpyPeerAsync(dst, dstDevice, src, - srcDevice, size, stream) - check_status(status) - -cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind): - with nogil: - status = cudaMemcpy2D(dst, dpitch, src, spitch, width, - height, kind) - check_status(status) - -cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind, - intptr_t 
stream): - with nogil: - status = cudaMemcpy2DAsync(dst, dpitch, src, spitch, - width, height, kind, stream) - check_status(status) - -cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind): - with nogil: - status = cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, - hOffset, width, height, - kind) - check_status(status) - -cpdef memcpy2DFromArrayAsync(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind, intptr_t stream): - with nogil: - status = cudaMemcpy2DFromArrayAsync(dst, dpitch, src, - wOffset, hOffset, width, height, - kind, - stream) - check_status(status) - -cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, size_t height, - int kind): - with nogil: - status = cudaMemcpy2DToArray(dst, wOffset, hOffset, src, - spitch, width, height, kind) - check_status(status) - -cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, - size_t height, int kind, intptr_t stream): - with nogil: - status = cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, - src, spitch, width, height, - kind, - stream) - check_status(status) - -cpdef memcpy3D(intptr_t Memcpy3DParmsPtr): - with nogil: - status = cudaMemcpy3D(Memcpy3DParmsPtr) - check_status(status) - -cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream): - with nogil: - status = cudaMemcpy3DAsync(Memcpy3DParmsPtr, - stream) - check_status(status) - -cpdef memset(intptr_t ptr, int value, size_t size): - with nogil: - status = cudaMemset(ptr, value, size) - check_status(status) - -cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream): - with nogil: - status = cudaMemsetAsync(ptr, value, size, - stream) - check_status(status) - -cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, - intptr_t stream): - if 0 < CUPY_HIP_VERSION < 40300000: - raise RuntimeError('Managed memory requires ROCm 4.3+') - with nogil: - status = cudaMemPrefetchAsync(devPtr, count, dstDevice, - stream) - check_status(status) - -cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device): - if 0 < CUPY_HIP_VERSION < 40300000: - raise RuntimeError('Managed memory requires ROCm 4.3+') - with nogil: - status = cudaMemAdvise(devPtr, count, - advice, device) - check_status(status) - -cpdef PointerAttributes pointerGetAttributes(intptr_t ptr): - cdef _PointerAttributes attrs - status = cudaPointerGetAttributes(&attrs, ptr) - check_status(status) - IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 60000000: - return PointerAttributes( - attrs.device, - attrs.devicePointer, - attrs.hostPointer, - attrs.type) - ELIF 0 < CUPY_HIP_VERSION < 60000000: - return PointerAttributes( - attrs.device, - attrs.devicePointer, - attrs.hostPointer, - attrs.memoryType) - ELSE: # for RTD - return None - -cpdef intptr_t deviceGetDefaultMemPool(int device) except? 0: - '''Get the default mempool on the current device.''' - if _is_hip_environment: - raise RuntimeError('HIP does not support deviceGetDefaultMemPool') - if runtimeGetVersion() < 11020: - raise RuntimeError('deviceGetDefaultMemPool is supported since ' - 'CUDA 11.2') - cdef MemPool pool - with nogil: - status = cudaDeviceGetDefaultMemPool(&pool, device) - check_status(status) - return (pool) - -cpdef intptr_t deviceGetMemPool(int device) except? 
0: - '''Get the current mempool on the current device.''' - if _is_hip_environment: - raise RuntimeError('HIP does not support deviceGetMemPool') - if runtimeGetVersion() < 11020: - raise RuntimeError('deviceGetMemPool is supported since ' - 'CUDA 11.2') - cdef MemPool pool - with nogil: - status = cudaDeviceGetMemPool(&pool, device) - check_status(status) - return (pool) - -cpdef deviceSetMemPool(int device, intptr_t pool): - '''Set the current mempool on the current device to pool.''' - if _is_hip_environment: - raise RuntimeError('HIP does not support deviceSetMemPool') - if runtimeGetVersion() < 11020: - raise RuntimeError('deviceSetMemPool is supported since ' - 'CUDA 11.2') - with nogil: - status = cudaDeviceSetMemPool(device, pool) - check_status(status) - -cpdef intptr_t memPoolCreate(MemPoolProps props) except? 0: - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolCreate') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolCreate is supported since CUDA 11.2') - - cdef MemPool pool - cdef _MemPoolProps props_c - c_memset(&props_c, 0, sizeof(_MemPoolProps)) - props_c.allocType = props.allocType - props_c.handleTypes = props.handleType - props_c.location.type = props.locationType - props_c.location.id = props.devId - - with nogil: - status = cudaMemPoolCreate(&pool, &props_c) - check_status(status) - return pool - -cpdef memPoolDestroy(intptr_t pool): - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolDestroy') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolDestroy is supported since CUDA 11.2') - with nogil: - status = cudaMemPoolDestroy(pool) - check_status(status) - -cpdef memPoolTrimTo(intptr_t pool, size_t size): - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolTrimTo') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolTrimTo is supported since CUDA 11.2') - with nogil: - status = cudaMemPoolTrimTo(pool, size) - check_status(status) - -cpdef memPoolGetAttribute(intptr_t pool, int attr): - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolGetAttribute') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolGetAttribute is supported since CUDA 11.2') - cdef int val1 - cdef uint64_t val2 - cdef void* out - # TODO(leofang): check this hack when more cudaMemPoolAttr are added! - out = (&val1) if attr <= 0x3 else (&val2) - with nogil: - status = cudaMemPoolGetAttribute(pool, attr, out) - check_status(status) - # TODO(leofang): check this hack when more cudaMemPoolAttr are added! - # cast to Python int regardless of C types - return val1 if attr <= 0x3 else val2 - -cpdef memPoolSetAttribute(intptr_t pool, int attr, object value): - if _is_hip_environment: - raise RuntimeError('HIP does not support memPoolSetAttribute') - if runtimeGetVersion() < 11020: - raise RuntimeError('memPoolSetAttribute is supported since CUDA 11.2') - cdef int val1 - cdef uint64_t val2 - cdef void* out - # TODO(leofang): check this hack when more cudaMemPoolAttr are added! - if attr <= 0x3: - val1 = value - out = (&val1) - else: - val2 = value - out = (&val2) - with nogil: - status = cudaMemPoolSetAttribute(pool, attr, out) - check_status(status) - - -############################################################################### -# Stream and Event -############################################################################### - -cpdef intptr_t streamCreate() except? 
0: - cdef driver.Stream stream - status = cudaStreamCreate(&stream) - check_status(status) - return stream - - -cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 0: - cdef driver.Stream stream - status = cudaStreamCreateWithFlags(&stream, flags) - check_status(status) - return stream - - -cpdef streamDestroy(intptr_t stream): - status = cudaStreamDestroy(stream) - check_status(status) - - -cpdef streamSynchronize(intptr_t stream): - with nogil: - status = cudaStreamSynchronize(stream) - check_status(status) - - -cdef _streamCallbackFunc(driver.Stream hStream, int status, - void* func_arg) with gil: - obj = func_arg - func, arg = obj - func(hStream, status, arg) - cpython.Py_DECREF(obj) - - -cdef _HostFnFunc(void* func_arg) with gil: - obj = func_arg - func, arg = obj - func(arg) - cpython.Py_DECREF(obj) - - -cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, - unsigned int flags=0): - if _is_hip_environment and stream == 0: - raise RuntimeError('HIP does not allow adding callbacks to the ' - 'default (null) stream') - func_arg = (callback, arg) - cpython.Py_INCREF(func_arg) - with nogil: - status = cudaStreamAddCallback( - stream, _streamCallbackFunc, - func_arg, flags) - check_status(status) - - -cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg): - if _is_hip_environment: - raise RuntimeError('This feature is not supported on HIP') - - func_arg = (callback, arg) - cpython.Py_INCREF(func_arg) - with nogil: - status = cudaLaunchHostFunc( - stream, _HostFnFunc, - func_arg) - check_status(status) - - -cpdef streamQuery(intptr_t stream): - return cudaStreamQuery(stream) - - -cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=0): - with nogil: - status = cudaStreamWaitEvent(stream, - event, flags) - check_status(status) - - -cpdef streamBeginCapture(intptr_t stream, int mode=streamCaptureModeRelaxed): - if _is_hip_environment: - raise RuntimeError('streamBeginCapture is not supported in ROCm') - # TODO(leofang): check and raise if stream == 0? - with nogil: - status = cudaStreamBeginCapture(stream, - mode) - check_status(status) - - -cpdef intptr_t streamEndCapture(intptr_t stream) except? 0: - # TODO(leofang): check and raise if stream == 0? - cdef Graph g - if _is_hip_environment: - raise RuntimeError('streamEndCapture is not supported in ROCm') - with nogil: - status = cudaStreamEndCapture(stream, &g) - check_status(status) - return g - - -cpdef bint streamIsCapturing(intptr_t stream) except*: - cdef StreamCaptureStatus s - if _is_hip_environment: - raise RuntimeError('streamIsCapturing is not supported in ROCm') - with nogil: - status = cudaStreamIsCapturing(stream, &s) - check_status(status) # cudaErrorStreamCaptureImplicit could be raised here - if s == streamCaptureStatusInvalidated: - raise RuntimeError('the stream was capturing, but an error has ' - 'invalidated the capture sequence') - return s - - -cpdef intptr_t eventCreate() except? 0: - cdef driver.Event event - status = cudaEventCreate(&event) - check_status(status) - return event - -cpdef intptr_t eventCreateWithFlags(unsigned int flags) except? 0: - cdef driver.Event event - status = cudaEventCreateWithFlags(&event, flags) - check_status(status) - return event - - -cpdef eventDestroy(intptr_t event): - status = cudaEventDestroy(event) - check_status(status) - - -cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 
0: - cdef float ms - status = cudaEventElapsedTime(&ms, start, end) - check_status(status) - return ms - - -cpdef eventQuery(intptr_t event): - return cudaEventQuery(event) - - -cpdef eventRecord(intptr_t event, intptr_t stream): - status = cudaEventRecord(event, stream) - check_status(status) - - -cpdef eventSynchronize(intptr_t event): - with nogil: - status = cudaEventSynchronize(event) - check_status(status) - - -############################################################################## -# util -############################################################################## - -cdef _ensure_context(): - """Ensure that CUcontext bound to the calling host thread exists. - - See discussion on https://github.com/cupy/cupy/issues/72 for details. - """ - tls = _ThreadLocal.get() - cdef int dev = getDevice() - if not tls.context_initialized[dev]: - # Call Runtime API to establish context on this host thread. - memGetInfo() - tls.context_initialized[dev] = True - - -############################################################################## -# Texture -############################################################################## - -cpdef uintmax_t createTextureObject( - intptr_t ResDescPtr, intptr_t TexDescPtr) except? 0: - cdef uintmax_t texobj = 0 - with nogil: - status = cudaCreateTextureObject((&texobj), - ResDescPtr, - TexDescPtr, - NULL) - check_status(status) - return texobj - -cpdef destroyTextureObject(uintmax_t texObject): - with nogil: - status = cudaDestroyTextureObject(texObject) - check_status(status) - -cpdef uintmax_t createSurfaceObject(intptr_t ResDescPtr) except? 0: - cdef uintmax_t surfobj = 0 - with nogil: - status = cudaCreateSurfaceObject((&surfobj), - ResDescPtr) - check_status(status) - return surfobj - -cpdef destroySurfaceObject(uintmax_t surfObject): - with nogil: - status = cudaDestroySurfaceObject(surfObject) - check_status(status) - -cdef ChannelFormatDesc getChannelDesc(intptr_t array) except*: - cdef ChannelFormatDesc desc - with nogil: - status = cudaGetChannelDesc(&desc, array) - check_status(status) - return desc - -cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t obj) except*: - cdef ResourceDesc desc - with nogil: - status = cudaGetTextureObjectResourceDesc(&desc, obj) - check_status(status) - return desc - -cdef TextureDesc getTextureObjectTextureDesc(uintmax_t obj) except*: - cdef TextureDesc desc - with nogil: - status = cudaGetTextureObjectTextureDesc(&desc, obj) - check_status(status) - return desc - -cdef Extent make_Extent(size_t w, size_t h, size_t d) except*: - return make_cudaExtent(w, h, d) - -cdef Pos make_Pos(size_t x, size_t y, size_t z) except*: - return make_cudaPos(x, y, z) - -cdef PitchedPtr make_PitchedPtr( - intptr_t d, size_t p, size_t xsz, size_t ysz) except*: - return make_cudaPitchedPtr(d, p, xsz, ysz) - - -############################################################################## -# Graph -############################################################################## - -cpdef graphDestroy(intptr_t graph): - with nogil: - status = cudaGraphDestroy(graph) - check_status(status) - -cpdef graphExecDestroy(intptr_t graphExec): - with nogil: - status = cudaGraphExecDestroy(graphExec) - check_status(status) - -cpdef intptr_t graphInstantiate(intptr_t graph) except? 0: - # TODO(leofang): support reporting error log? 
- cdef GraphExec ge - with nogil: - status = cudaGraphInstantiate((&ge), graph, - NULL, NULL, 0) - check_status(status) - return ge - -cpdef graphLaunch(intptr_t graphExec, intptr_t stream): - with nogil: - status = cudaGraphLaunch((graphExec), stream) - check_status(status) - -cpdef graphUpload(intptr_t graphExec, intptr_t stream): - if runtimeGetVersion() < 11010: - raise RuntimeError('graphUpload is supported since CUDA 11.1+') - with nogil: - status = cudaGraphUpload((graphExec), stream) - check_status(status) + cpdef getDeviceProperties(int device): + cdef DeviceProp props + cdef int status = cudaGetDeviceProperties(&props, device) + check_status(status) + + cdef dict properties = {'name': b'UNAVAILABLE'} # for RTD + + # Common properties to CUDA 9.0, 9.2, 10.x, 11.x, and HIP + IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 0: + properties = { + 'name': props.name, + 'totalGlobalMem': props.totalGlobalMem, + 'sharedMemPerBlock': props.sharedMemPerBlock, + 'regsPerBlock': props.regsPerBlock, + 'warpSize': props.warpSize, + 'maxThreadsPerBlock': props.maxThreadsPerBlock, + 'maxThreadsDim': tuple(props.maxThreadsDim), + 'maxGridSize': tuple(props.maxGridSize), + 'clockRate': props.clockRate, + 'totalConstMem': props.totalConstMem, + 'major': props.major, + 'minor': props.minor, + 'textureAlignment': props.textureAlignment, + 'texturePitchAlignment': props.texturePitchAlignment, + 'multiProcessorCount': props.multiProcessorCount, + 'kernelExecTimeoutEnabled': props.kernelExecTimeoutEnabled, + 'integrated': props.integrated, + 'canMapHostMemory': props.canMapHostMemory, + 'computeMode': props.computeMode, + 'maxTexture1D': props.maxTexture1D, + 'maxTexture2D': tuple(props.maxTexture2D), + 'maxTexture3D': tuple(props.maxTexture3D), + 'concurrentKernels': props.concurrentKernels, + 'ECCEnabled': props.ECCEnabled, + 'pciBusID': props.pciBusID, + 'pciDeviceID': props.pciDeviceID, + 'pciDomainID': props.pciDomainID, + 'tccDriver': props.tccDriver, + 'memoryClockRate': props.memoryClockRate, + 'memoryBusWidth': props.memoryBusWidth, + 'l2CacheSize': props.l2CacheSize, + 'maxThreadsPerMultiProcessor': props.maxThreadsPerMultiProcessor, + 'isMultiGpuBoard': props.isMultiGpuBoard, + 'cooperativeLaunch': props.cooperativeLaunch, + 'cooperativeMultiDeviceLaunch': props.cooperativeMultiDeviceLaunch, + } + IF CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 9020: + properties['deviceOverlap'] = props.deviceOverlap + properties['maxTexture1DMipmap'] = props.maxTexture1DMipmap + properties['maxTexture1DLinear'] = props.maxTexture1DLinear + properties['maxTexture1DLayered'] = tuple(props.maxTexture1DLayered) + properties['maxTexture2DMipmap'] = tuple(props.maxTexture2DMipmap) + properties['maxTexture2DLinear'] = tuple(props.maxTexture2DLinear) + properties['maxTexture2DLayered'] = tuple(props.maxTexture2DLayered) + properties['maxTexture2DGather'] = tuple(props.maxTexture2DGather) + properties['maxTexture3DAlt'] = tuple(props.maxTexture3DAlt) + properties['maxTextureCubemap'] = props.maxTextureCubemap + properties['maxTextureCubemapLayered'] = tuple( + props.maxTextureCubemapLayered) + properties['maxSurface1D'] = props.maxSurface1D + properties['maxSurface1DLayered'] = tuple(props.maxSurface1DLayered) + properties['maxSurface2D'] = tuple(props.maxSurface2D) + properties['maxSurface2DLayered'] = tuple(props.maxSurface2DLayered) + properties['maxSurface3D'] = tuple(props.maxSurface3D) + properties['maxSurfaceCubemap'] = props.maxSurfaceCubemap + properties['maxSurfaceCubemapLayered'] = tuple( + 
props.maxSurfaceCubemapLayered) + properties['surfaceAlignment'] = props.surfaceAlignment + properties['asyncEngineCount'] = props.asyncEngineCount + properties['unifiedAddressing'] = props.unifiedAddressing + properties['streamPrioritiesSupported'] = ( + props.streamPrioritiesSupported) + properties['globalL1CacheSupported'] = props.globalL1CacheSupported + properties['localL1CacheSupported'] = props.localL1CacheSupported + properties['sharedMemPerMultiprocessor'] = ( + props.sharedMemPerMultiprocessor) + properties['regsPerMultiprocessor'] = props.regsPerMultiprocessor + properties['managedMemory'] = props.managedMemory + properties['multiGpuBoardGroupID'] = props.multiGpuBoardGroupID + properties['hostNativeAtomicSupported'] = ( + props.hostNativeAtomicSupported) + properties['singleToDoublePrecisionPerfRatio'] = ( + props.singleToDoublePrecisionPerfRatio) + properties['pageableMemoryAccess'] = props.pageableMemoryAccess + properties['concurrentManagedAccess'] = props.concurrentManagedAccess + properties['computePreemptionSupported'] = ( + props.computePreemptionSupported) + properties['canUseHostPointerForRegisteredMem'] = ( + props.canUseHostPointerForRegisteredMem) + properties['sharedMemPerBlockOptin'] = props.sharedMemPerBlockOptin + properties['pageableMemoryAccessUsesHostPageTables'] = ( + props.pageableMemoryAccessUsesHostPageTables) + properties['directManagedMemAccessFromHost'] = ( + props.directManagedMemAccessFromHost) + if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >=10000: + properties['uuid'] = props.uuid.bytes + properties['luid'] = props.luid + properties['luidDeviceNodeMask'] = props.luidDeviceNodeMask + if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 11000: + properties['persistingL2CacheMaxSize'] = props.persistingL2CacheMaxSize + properties['maxBlocksPerMultiProcessor'] = ( + props.maxBlocksPerMultiProcessor) + properties['accessPolicyMaxWindowSize'] = ( + props.accessPolicyMaxWindowSize) + properties['reservedSharedMemPerBlock'] = ( + props.reservedSharedMemPerBlock) + IF CUPY_HIP_VERSION > 0: # HIP-only props + properties['clockInstructionRate'] = props.clockInstructionRate + properties['maxSharedMemoryPerMultiProcessor'] = ( + props.maxSharedMemoryPerMultiProcessor) + properties['hdpMemFlushCntl'] = (props.hdpMemFlushCntl) + properties['hdpRegFlushCntl'] = (props.hdpRegFlushCntl) + properties['memPitch'] = props.memPitch + properties['cooperativeMultiDeviceUnmatchedFunc'] = ( + props.cooperativeMultiDeviceUnmatchedFunc) + properties['cooperativeMultiDeviceUnmatchedGridDim'] = ( + props.cooperativeMultiDeviceUnmatchedGridDim) + properties['cooperativeMultiDeviceUnmatchedBlockDim'] = ( + props.cooperativeMultiDeviceUnmatchedBlockDim) + properties['cooperativeMultiDeviceUnmatchedSharedMem'] = ( + props.cooperativeMultiDeviceUnmatchedSharedMem) + properties['isLargeBar'] = props.isLargeBar + + cdef dict arch = {} # for hipDeviceArch_t + arch['hasGlobalInt32Atomics'] = props.arch.hasGlobalInt32Atomics + arch['hasGlobalFloatAtomicExch'] = props.arch.hasGlobalFloatAtomicExch + arch['hasSharedInt32Atomics'] = props.arch.hasSharedInt32Atomics + arch['hasSharedFloatAtomicExch'] = props.arch.hasSharedFloatAtomicExch + arch['hasFloatAtomicAdd'] = props.arch.hasFloatAtomicAdd + arch['hasGlobalInt64Atomics'] = props.arch.hasGlobalInt64Atomics + arch['hasSharedInt64Atomics'] = props.arch.hasSharedInt64Atomics + arch['hasDoubles'] = props.arch.hasDoubles + arch['hasWarpVote'] = props.arch.hasWarpVote + arch['hasWarpBallot'] = props.arch.hasWarpBallot + arch['hasWarpShuffle'] = 
props.arch.hasWarpShuffle + arch['hasFunnelShift'] = props.arch.hasFunnelShift + arch['hasThreadFenceSystem'] = props.arch.hasThreadFenceSystem + arch['hasSyncThreadsExt'] = props.arch.hasSyncThreadsExt + arch['hasSurfaceFuncs'] = props.arch.hasSurfaceFuncs + arch['has3dGrid'] = props.arch.has3dGrid + arch['hasDynamicParallelism'] = props.arch.hasDynamicParallelism + properties['arch'] = arch + IF 0 < CUPY_HIP_VERSION < 310: # gcnArchName used after ROCm 3.1+ + properties['gcnArch'] = props.gcnArch + IF CUPY_HIP_VERSION >= 310: + properties['gcnArchName'] = props.gcnArchName + properties['asicRevision'] = props.asicRevision + properties['managedMemory'] = props.managedMemory + properties['directManagedMemAccessFromHost'] = ( + props.directManagedMemAccessFromHost) + properties['concurrentManagedAccess'] = props.concurrentManagedAccess + properties['pageableMemoryAccess'] = props.pageableMemoryAccess + properties['pageableMemoryAccessUsesHostPageTables'] = ( + props.pageableMemoryAccessUsesHostPageTables) + return properties + + cpdef int deviceGetByPCIBusId(str pci_bus_id) except? -1: + # Encode the python string before passing to native code + byte_pci_bus_id = pci_bus_id.encode('ascii') + cdef const char* c_pci_bus_id = byte_pci_bus_id + + cdef int device = -1 + cdef int status + status = cudaDeviceGetByPCIBusId(&device, c_pci_bus_id) + check_status(status) + # on ROCm, it might fail silently, so we also need to check if the + # device is meaningful or not + if hip_environment and device == -1: + check_status(cudaErrorInvalidValue) + return device + + cpdef str deviceGetPCIBusId(int device): + # The PCI Bus ID string must be able to store 13 characters including + # the NULL-terminator according to the CUDA documentation. + # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html + cdef char pci_bus_id[13] + status = cudaDeviceGetPCIBusId(pci_bus_id, 13, device) + check_status(status) + return pci_bus_id.decode('ascii') + + cpdef int getDeviceCount() except? -1: + cdef int count + status = cudaGetDeviceCount(&count) + check_status(status) + return count + + cpdef setDevice(int device): + status = cudaSetDevice(device) + check_status(status) + + cpdef deviceSynchronize(): + with nogil: + status = cudaDeviceSynchronize() + check_status(status) + + cpdef int deviceCanAccessPeer(int device, int peerDevice) except? -1: + cdef int ret + status = cudaDeviceCanAccessPeer(&ret, device, peerDevice) + check_status(status) + return ret + + cpdef deviceEnablePeerAccess(int peerDevice): + status = cudaDeviceEnablePeerAccess(peerDevice, 0) + check_status(status) + + cpdef deviceDisablePeerAccess(int peerDevice): + status = cudaDeviceDisablePeerAccess(peerDevice) + check_status(status) + + cpdef _deviceEnsurePeerAccess(int peerDevice): + status = cudaDeviceEnablePeerAccess(peerDevice, 0) + if status == 0: + return + elif status == errorPeerAccessAlreadyEnabled: + cudaGetLastError() # clear error status + return + check_status(status) + + cpdef size_t deviceGetLimit(int limit) except? 
-1: + cdef size_t value + status = cudaDeviceGetLimit(&value, limit) + check_status(status) + return value + + cpdef deviceSetLimit(int limit, size_t value): + status = cudaDeviceSetLimit(limit, value) + check_status(status) + + + ########################################################################### + # IPC operations + ########################################################################### + + cpdef ipcCloseMemHandle(intptr_t devPtr): + status = cudaIpcCloseMemHandle(devPtr) + check_status(status) + + cpdef ipcGetEventHandle(intptr_t event): + cdef IpcEventHandle handle + status = cudaIpcGetEventHandle(&handle, event) + check_status(status) + # We need to do this due to a bug in Cython that + # cuts out the 0 bytes in an array of chars when + # constructing the python object + # resulting in different sizes assignment errors + # when recreating the struct from the python + # array of bytes + reserved = [handle.reserved[i] for i in range(64)] + return bytes(reserved) + + cpdef ipcGetMemHandle(intptr_t devPtr): + cdef IpcMemHandle handle + status = cudaIpcGetMemHandle(&handle, devPtr) + check_status(status) + # We need to do this due to a bug in Cython that + # when converting an array of chars in C to a python object + # it discards the data after the first 0 value + # resulting in a loss of data, as this is not a string + # but a buffer of bytes + reserved = [handle.reserved[i] for i in range(64)] + return bytes(reserved) + + cpdef ipcOpenEventHandle(bytes handle): + cdef driver.Event event + cdef IpcEventHandle handle_ + handle_.reserved = handle + status = cudaIpcOpenEventHandle(&event, handle_) + check_status(status) + return event + + cpdef ipcOpenMemHandle(bytes handle, + unsigned int flags=cudaIpcMemLazyEnablePeerAccess): + cdef void* devPtr + cdef IpcMemHandle handle_ + handle_.reserved = handle + status = cudaIpcOpenMemHandle(&devPtr, handle_, flags) + check_status(status) + return devPtr + + + ########################################################################### + # Memory management + ########################################################################### + + cpdef intptr_t malloc(size_t size) except? 0: + cdef void* ptr + with nogil: + status = cudaMalloc(&ptr, size) + check_status(status) + return ptr + + cpdef intptr_t mallocManaged( + size_t size, unsigned int flags=cudaMemAttachGlobal) except? 0: + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('Managed memory requires ROCm 4.3+') + cdef void* ptr + with nogil: + status = cudaMallocManaged(&ptr, size, flags) + check_status(status) + return ptr + + cpdef intptr_t malloc3DArray(intptr_t descPtr, size_t width, size_t height, + size_t depth, unsigned int flags=0) except? 0: + cdef Array ptr + cdef Extent extent = make_cudaExtent(width, height, depth) + with nogil: + status = cudaMalloc3DArray(&ptr, descPtr, + extent, flags) + check_status(status) + return ptr + + cpdef intptr_t mallocArray(intptr_t descPtr, size_t width, size_t height, + unsigned int flags=0) except? 0: + cdef Array ptr + with nogil: + status = cudaMallocArray(&ptr, descPtr, width, + height, flags) + check_status(status) + return ptr + + cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 
0: + cdef void* ptr + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('mallocAsync requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('mallocAsync is supported since CUDA 11.2') + with nogil: + status = cudaMallocAsync(&ptr, size, stream) + check_status(status) + return ptr + + cpdef intptr_t mallocFromPoolAsync( + size_t size, intptr_t pool, intptr_t stream) except? 0: + cdef void* ptr + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('mallocFromPoolAsync require ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('mallocFromPoolAsync is supported ' + 'since CUDA 11.2') + with nogil: + status = cudaMallocFromPoolAsync( + &ptr, size, pool, stream) + check_status(status) + return ptr + + cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 0: + cdef void* ptr + with nogil: + status = cudaHostAlloc(&ptr, size, flags) + check_status(status) + return ptr + + cpdef hostRegister(intptr_t ptr, size_t size, unsigned int flags): + with nogil: + status = cudaHostRegister(ptr, size, flags) + check_status(status) + + cpdef hostUnregister(intptr_t ptr): + with nogil: + status = cudaHostUnregister(ptr) + check_status(status) + + cpdef free(intptr_t ptr): + with nogil: + status = cudaFree(ptr) + check_status(status) + + cpdef freeHost(intptr_t ptr): + with nogil: + status = cudaFreeHost(ptr) + check_status(status) + + cpdef freeArray(intptr_t ptr): + with nogil: + status = cudaFreeArray(ptr) + check_status(status) + + cpdef freeAsync(intptr_t ptr, intptr_t stream): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('freeAsync requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('freeAsync is supported since CUDA 11.2') + with nogil: + status = cudaFreeAsync(ptr, stream) + check_status(status) + + cpdef memGetInfo(): + cdef size_t free, total + status = cudaMemGetInfo(&free, &total) + check_status(status) + return free, total + + cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind): + with nogil: + status = cudaMemcpy(dst, src, size, kind) + check_status(status) + + cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, + intptr_t stream): + with nogil: + status = cudaMemcpyAsync( + dst, src, size, kind, + stream) + check_status(status) + + cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, + size_t size): + with nogil: + status = cudaMemcpyPeer(dst, dstDevice, src, + srcDevice, size) + check_status(status) + + cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, intptr_t src, + int srcDevice, size_t size, intptr_t stream): + with nogil: + status = cudaMemcpyPeerAsync(dst, dstDevice, src, + srcDevice, size, + stream) + check_status(status) + + cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, + size_t width, size_t height, MemoryKind kind): + with nogil: + status = cudaMemcpy2D(dst, dpitch, src, spitch, + width, height, kind) + check_status(status) + + cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, + size_t spitch, size_t width, size_t height, + MemoryKind kind, intptr_t stream): + with nogil: + status = cudaMemcpy2DAsync(dst, dpitch, src, spitch, + width, height, kind, + stream) + check_status(status) + + cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind): + with nogil: + status = cudaMemcpy2DFromArray(dst, dpitch, src, + wOffset, hOffset, width, height, + kind) + check_status(status) + + cpdef memcpy2DFromArrayAsync(intptr_t dst, 
size_t dpitch, intptr_t src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind, intptr_t stream): + with nogil: + status = cudaMemcpy2DFromArrayAsync(dst, dpitch, src, + wOffset, hOffset, width, + height, kind, + stream) + check_status(status) + + cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, + intptr_t src, size_t spitch, size_t width, + size_t height, int kind): + with nogil: + status = cudaMemcpy2DToArray(dst, wOffset, hOffset, + src, spitch, width, height, + kind) + check_status(status) + + cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, + intptr_t src, size_t spitch, size_t width, + size_t height, int kind, intptr_t stream): + with nogil: + status = cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, + src, spitch, width, + height, kind, + stream) + check_status(status) + + cpdef memcpy3D(intptr_t Memcpy3DParmsPtr): + with nogil: + status = cudaMemcpy3D(Memcpy3DParmsPtr) + check_status(status) + + cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream): + with nogil: + status = cudaMemcpy3DAsync(Memcpy3DParmsPtr, + stream) + check_status(status) + + cpdef memset(intptr_t ptr, int value, size_t size): + with nogil: + status = cudaMemset(ptr, value, size) + check_status(status) + + cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream): + with nogil: + status = cudaMemsetAsync(ptr, value, size, + stream) + check_status(status) + + cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, + intptr_t stream): + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('Managed memory requires ROCm 4.3+') + with nogil: + status = cudaMemPrefetchAsync(devPtr, count, dstDevice, + stream) + check_status(status) + + cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device): + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('Managed memory requires ROCm 4.3+') + with nogil: + status = cudaMemAdvise(devPtr, count, + advice, device) + check_status(status) + + cpdef PointerAttributes pointerGetAttributes(intptr_t ptr): + cdef _PointerAttributes attrs + status = cudaPointerGetAttributes(&attrs, ptr) + check_status(status) + IF CUPY_HIP_VERSION >= 60000000: + if attrs.type == 0: # hipMemoryTypeHost + attrs.type = 1 # cudaMemoryTypeHost + elif attrs.type == 1: # hipMemoryTypeDevice + attrs.type = 2 # cudaMemoryTypeDevice + ELIF CUPY_HIP_VERSION > 0: + if attrs.memoryType == 0: # hipMemoryTypeHost + attrs.memoryType = 1 # cudaMemoryTypeHost + elif attrs.memoryType == 1: # hipMemoryTypeDevice + attrs.memoryType = 2 # cudaMemoryTypeDevice + IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 60000000: + return PointerAttributes( + attrs.device, + attrs.devicePointer, + attrs.hostPointer, + attrs.type) + ELIF 0 < CUPY_HIP_VERSION < 60000000: + return PointerAttributes( + attrs.device, + attrs.devicePointer, + attrs.hostPointer, + attrs.memoryType) + ELSE: # for RTD + return None + + cpdef intptr_t deviceGetDefaultMemPool(int device) except? 0: + '''Get the default mempool on the current device.''' + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('deviceGetDefaultMemPool requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('deviceGetDefaultMemPool is supported since ' + 'CUDA 11.2') + cdef MemPool pool + with nogil: + status = cudaDeviceGetDefaultMemPool(&pool, device) + check_status(status) + return (pool) + + cpdef intptr_t deviceGetMemPool(int device) except? 
0: + '''Get the current mempool on the current device.''' + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('deviceGetMemPool requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('deviceGetMemPool is supported since ' + 'CUDA 11.2') + cdef MemPool pool + with nogil: + status = cudaDeviceGetMemPool(&pool, device) + check_status(status) + return (pool) + + cpdef deviceSetMemPool(int device, intptr_t pool): + '''Set the current mempool on the current device to pool.''' + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('deviceSetMemPool requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('deviceSetMemPool is supported since ' + 'CUDA 11.2') + with nogil: + status = cudaDeviceSetMemPool(device, pool) + check_status(status) + + cpdef intptr_t memPoolCreate(MemPoolProps props) except? 0: + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolCreate requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolCreate is supported since CUDA 11.2') + + cdef MemPool pool + cdef _MemPoolProps props_c + c_memset(&props_c, 0, sizeof(_MemPoolProps)) + props_c.allocType = props.allocType + props_c.handleTypes = props.handleType + props_c.location.type = props.locationType + props_c.location.id = props.devId + + with nogil: + status = cudaMemPoolCreate(&pool, &props_c) + check_status(status) + return pool + + cpdef memPoolDestroy(intptr_t pool): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolDestroy requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolDestroy is supported since CUDA 11.2') + with nogil: + status = cudaMemPoolDestroy(pool) + check_status(status) + + cpdef memPoolTrimTo(intptr_t pool, size_t size): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolTrimTo requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolTrimTo is supported since CUDA 11.2') + with nogil: + status = cudaMemPoolTrimTo(pool, size) + check_status(status) + + cpdef memPoolGetAttribute(intptr_t pool, int attr): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolGetAttribute requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolGetAttribute is supported ' + 'since CUDA 11.2') + cdef int val1 + cdef uint64_t val2 + cdef void* out + # TODO(leofang): check this hack when more cudaMemPoolAttr are added! + out = (&val1) if attr <= 0x3 else (&val2) + with nogil: + status = cudaMemPoolGetAttribute(pool, attr, + out) + check_status(status) + # TODO(leofang): check this hack when more cudaMemPoolAttr are added! + # cast to Python int regardless of C types + return val1 if attr <= 0x3 else val2 + + cpdef memPoolSetAttribute(intptr_t pool, int attr, object value): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('memPoolSetAttribute requires ROCm 5.2+') + if runtimeGetVersion() < 11020: + raise RuntimeError('memPoolSetAttribute is supported ' + 'since CUDA 11.2') + cdef int val1 + cdef uint64_t val2 + cdef void* out + # TODO(leofang): check this hack when more cudaMemPoolAttr are added! + if attr <= 0x3: + val1 = value + out = (&val1) + else: + val2 = value + out = (&val2) + with nogil: + status = cudaMemPoolSetAttribute(pool, attr, + out) + check_status(status) + + + ########################################################################### + # Stream and Event + ########################################################################### + + cpdef intptr_t streamCreate() except? 
0: + cdef driver.Stream stream + status = cudaStreamCreate(&stream) + check_status(status) + return stream + + + cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 0: + cdef driver.Stream stream + status = cudaStreamCreateWithFlags(&stream, flags) + check_status(status) + return stream + + + cpdef streamDestroy(intptr_t stream): + status = cudaStreamDestroy(stream) + check_status(status) + + + cpdef streamSynchronize(intptr_t stream): + with nogil: + status = cudaStreamSynchronize(stream) + check_status(status) + + + cdef _streamCallbackFunc(driver.Stream hStream, int status, + void* func_arg) with gil: + obj = func_arg + func, arg = obj + func(hStream, status, arg) + cpython.Py_DECREF(obj) + + + cdef _HostFnFunc(void* func_arg) with gil: + obj = func_arg + func, arg = obj + func(arg) + cpython.Py_DECREF(obj) + + + cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, + unsigned int flags=0): + if _is_hip_environment and stream == 0: + raise RuntimeError('HIP does not allow adding callbacks to the ' + 'default (null) stream') + func_arg = (callback, arg) + cpython.Py_INCREF(func_arg) + with nogil: + status = cudaStreamAddCallback( + stream, _streamCallbackFunc, + func_arg, flags) + check_status(status) + + + cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg): + if 0 < CUPY_HIP_VERSION < 50200000: + raise RuntimeError('This feature is supported on HIP since ROCm 5.2') + + func_arg = (callback, arg) + cpython.Py_INCREF(func_arg) + with nogil: + status = cudaLaunchHostFunc( + stream, _HostFnFunc, + func_arg) + check_status(status) + + + cpdef streamQuery(intptr_t stream): + return cudaStreamQuery(stream) + + + cpdef streamWaitEvent(intptr_t stream, intptr_t event, + unsigned int flags=0): + with nogil: + status = cudaStreamWaitEvent(stream, + event, flags) + check_status(status) + + + cpdef streamBeginCapture(intptr_t stream, + int mode=streamCaptureModeRelaxed): + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('streamBeginCapture is not supported in ROCm') + # TODO(leofang): check and raise if stream == 0? + with nogil: + status = cudaStreamBeginCapture(stream, + mode) + check_status(status) + + + cpdef intptr_t streamEndCapture(intptr_t stream) except? 0: + # TODO(leofang): check and raise if stream == 0? + cdef Graph g + if 0 < CUPY_HIP_VERSION < 40300000: + raise RuntimeError('streamEndCapture is not supported in ROCm') + with nogil: + status = cudaStreamEndCapture(stream, &g) + check_status(status) + return g + + + cpdef bint streamIsCapturing(intptr_t stream) except*: + cdef StreamCaptureStatus s + if 0 < CUPY_HIP_VERSION < 50000000: + raise RuntimeError('streamIsCapturing is not supported in ROCm') + with nogil: + status = cudaStreamIsCapturing(stream, &s) + check_status(status) # cudaErrorStreamCaptureImplicit could be + # raised here + if s == streamCaptureStatusInvalidated: + raise RuntimeError('the stream was capturing, but an error has ' + 'invalidated the capture sequence') + return s + + + cpdef intptr_t eventCreate() except? 0: + cdef driver.Event event + status = cudaEventCreate(&event) + check_status(status) + return event + + cpdef intptr_t eventCreateWithFlags(unsigned int flags) except? 0: + cdef driver.Event event + status = cudaEventCreateWithFlags(&event, flags) + check_status(status) + return event + + + cpdef eventDestroy(intptr_t event): + status = cudaEventDestroy(event) + check_status(status) + + + cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 
0: + cdef float ms + status = cudaEventElapsedTime(&ms, start, + end) + check_status(status) + return ms + + + cpdef eventQuery(intptr_t event): + return cudaEventQuery(event) + + + cpdef eventRecord(intptr_t event, intptr_t stream): + status = cudaEventRecord(event, stream) + check_status(status) + + + cpdef eventSynchronize(intptr_t event): + with nogil: + status = cudaEventSynchronize(event) + check_status(status) + + + ########################################################################### + # util + ########################################################################### + + cdef _ensure_context(): + """Ensure that CUcontext bound to the calling host thread exists. + + See discussion on https://github.com/cupy/cupy/issues/72 for details. + """ + tls = _ThreadLocal.get() + cdef int dev = getDevice() + if not tls.context_initialized[dev]: + # Call Runtime API to establish context on this host thread. + memGetInfo() + tls.context_initialized[dev] = True + + + ########################################################################### + # Texture + ########################################################################### + + cpdef uintmax_t createTextureObject( + intptr_t ResDescPtr, intptr_t TexDescPtr) except? 0: + cdef uintmax_t texobj = 0 + with nogil: + status = cudaCreateTextureObject((&texobj), + ResDescPtr, + TexDescPtr, + NULL) + check_status(status) + return texobj + + cpdef destroyTextureObject(uintmax_t texObject): + with nogil: + status = cudaDestroyTextureObject(texObject) + check_status(status) + + cpdef uintmax_t createSurfaceObject(intptr_t ResDescPtr) except? 0: + cdef uintmax_t surfobj = 0 + with nogil: + status = cudaCreateSurfaceObject((&surfobj), + ResDescPtr) + check_status(status) + return surfobj + + cpdef destroySurfaceObject(uintmax_t surfObject): + with nogil: + status = cudaDestroySurfaceObject(surfObject) + check_status(status) + + cdef ChannelFormatDesc getChannelDesc(intptr_t array) except*: + cdef ChannelFormatDesc desc + with nogil: + status = cudaGetChannelDesc(&desc, array) + check_status(status) + return desc + + cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t obj) except*: + cdef ResourceDesc desc + with nogil: + status = cudaGetTextureObjectResourceDesc(&desc, + obj) + check_status(status) + return desc + + cdef TextureDesc getTextureObjectTextureDesc(uintmax_t obj) except*: + cdef TextureDesc desc + with nogil: + status = cudaGetTextureObjectTextureDesc(&desc, obj) + check_status(status) + return desc + + cdef Extent make_Extent(size_t w, size_t h, size_t d) except*: + return make_cudaExtent(w, h, d) + + cdef Pos make_Pos(size_t x, size_t y, size_t z) except*: + return make_cudaPos(x, y, z) + + cdef PitchedPtr make_PitchedPtr( + intptr_t d, size_t p, size_t xsz, size_t ysz) except*: + return make_cudaPitchedPtr(d, p, xsz, ysz) + + + ########################################################################### + # Graph + ########################################################################### + + cpdef graphDestroy(intptr_t graph): + with nogil: + status = cudaGraphDestroy(graph) + check_status(status) + + cpdef graphExecDestroy(intptr_t graphExec): + with nogil: + status = cudaGraphExecDestroy(graphExec) + check_status(status) + + cpdef intptr_t graphInstantiate(intptr_t graph) except? 0: + # TODO(leofang): support reporting error log? 
+ cdef GraphExec ge + with nogil: + status = cudaGraphInstantiate((&ge), graph, + NULL, NULL, 0) + check_status(status) + return ge + + cpdef graphLaunch(intptr_t graphExec, intptr_t stream): + with nogil: + status = cudaGraphLaunch((graphExec), + stream) + check_status(status) + + cpdef graphUpload(intptr_t graphExec, intptr_t stream): + if runtimeGetVersion() < 11010: + raise RuntimeError('graphUpload is supported since CUDA 11.1+') + with nogil: + status = cudaGraphUpload((graphExec), + stream) + check_status(status) diff --git a/cupy_backends/hip/cupy_hip_runtime.h b/cupy_backends/hip/cupy_hip_runtime.h index 0d1eb208528..ef43172e23a 100644 --- a/cupy_backends/hip/cupy_hip_runtime.h +++ b/cupy_backends/hip/cupy_hip_runtime.h @@ -8,342 +8,6 @@ extern "C" { bool hip_environment = true; -// Error handling -const char* cudaGetErrorName(cudaError_t hipError) { - return hipGetErrorName(hipError); -} - -const char* cudaGetErrorString(cudaError_t hipError) { - return hipGetErrorString(hipError); -} - -cudaError_t cudaGetLastError() { - return hipGetLastError(); -} - - -// Initialization -cudaError_t cudaDriverGetVersion(int *driverVersion) { - return hipDriverGetVersion(driverVersion); -} - -cudaError_t cudaRuntimeGetVersion(int *runtimeVersion) { - return hipRuntimeGetVersion(runtimeVersion); -} - - -// CUdevice operations -cudaError_t cudaGetDevice(int *deviceId) { - return hipGetDevice(deviceId); -} - -cudaError_t cudaDeviceGetAttribute(int* pi, cudaDeviceAttr attr, - int deviceId) { - return hipDeviceGetAttribute(pi, attr, deviceId); -} - -cudaError_t cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) { - return hipDeviceGetByPCIBusId(device, pciBusId); -} - -cudaError_t cudaDeviceGetPCIBusId(char *pciBusId, int len, int device) { - return hipDeviceGetPCIBusId(pciBusId, len, device); -} - -cudaError_t cudaGetDeviceCount(int *count) { - return hipGetDeviceCount(count); -} - -cudaError_t cudaSetDevice(int deviceId) { - return hipSetDevice(deviceId); -} - -cudaError_t cudaDeviceSynchronize() { - return hipDeviceSynchronize(); -} - -cudaError_t cudaDeviceCanAccessPeer(int* canAccessPeer, int deviceId, - int peerDeviceId) { - return hipDeviceCanAccessPeer(canAccessPeer, deviceId, peerDeviceId); -} - -cudaError_t cudaDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags) { - return hipDeviceEnablePeerAccess(peerDeviceId, flags); -} - -cudaError_t cudaDeviceDisablePeerAccess(int peerDeviceId) { - return hipDeviceDisablePeerAccess(peerDeviceId); -} - -cudaError_t cudaDeviceGetLimit(size_t* pValue, cudaLimit limit) { - return hipDeviceGetLimit(pValue, limit); -} - -cudaError_t cudaDeviceSetLimit(cudaLimit limit, size_t value) { - // see https://github.com/ROCm-Developer-Tools/HIP/issues/1632 - return hipErrorUnknown; -} - -// IPC operations -cudaError_t cudaIpcCloseMemHandle(void* devPtr) { - return hipIpcCloseMemHandle(devPtr); -} - -cudaError_t cudaIpcGetEventHandle(cudaIpcEventHandle_t* handle, cudaEvent_t event) { - return hipErrorUnknown; - - // TODO(leofang): this is supported after ROCm-Developer-Tools/HIP#1996 is released; - // as of ROCm 3.5.0 it is still not supported - //return hipIpcGetEventHandle(handle, event); -} - -cudaError_t cudaIpcGetMemHandle(cudaIpcMemHandle_t* handle, void* devPtr) { - return hipIpcGetMemHandle(handle, devPtr); -} - -cudaError_t cudaIpcOpenEventHandle(cudaEvent_t* event, cudaIpcEventHandle_t handle) { - return hipErrorUnknown; - - // TODO(leofang): this is supported after ROCm-Developer-Tools/HIP#1996 is released; - // as of ROCm 3.5.0 it is 
still not supported - //return hipIpcOpenEventHandle(event, handle); -} - -cudaError_t cudaIpcOpenMemHandle(void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags) { - return hipIpcOpenMemHandle(devPtr, handle, flags); -} - -// Memory management -enum cudaMemAllocationType {}; // stub -enum cudaMemAllocationHandleType {}; // stub -enum cudaMemLocationType {}; // stub -struct cudaMemLocation { // stub - int id; - cudaMemLocationType type; -}; -struct cudaMemPoolProps { // stub - cudaMemAllocationType allocType; - cudaMemAllocationHandleType handleTypes; - struct cudaMemLocation location; - unsigned char reserved[64]; - void* win32SecurityAttributes; -}; - -cudaError_t cudaMalloc(void** ptr, size_t size) { - return hipMalloc(ptr, size); -} - -cudaError_t cudaMalloc3DArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMallocArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMallocAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaHostAlloc(void** ptr, size_t size, unsigned int flags) { - return hipHostMalloc(ptr, size, flags); -} - -cudaError_t cudaHostRegister(...) { - return hipErrorUnknown; -} - -cudaError_t cudaHostUnregister(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMallocManaged(void** ptr, size_t size, unsigned int flags) { -#if HIP_VERSION >= 40300000 - return hipMallocManaged(ptr, size, flags); -#else - return hipErrorUnknown; -#endif -} - -int cudaFree(void* ptr) { - return hipFree(ptr); -} - -cudaError_t cudaFreeArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaFreeHost(void* ptr) { - return hipHostFree(ptr); -} - -cudaError_t cudaFreeAsync(...) { - return hipErrorUnknown; -} - -int cudaMemGetInfo(size_t* free, size_t* total) { - return hipMemGetInfo(free, total); -} - -cudaError_t cudaMemcpy(void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind) { - return hipMemcpy(dst, src, sizeBytes, kind); -} - -cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t sizeBytes, - cudaMemcpyKind kind, cudaStream_t stream) { - return hipMemcpyAsync(dst, src, sizeBytes, kind, stream); -} - -cudaError_t cudaMemcpyPeer(void* dst, int dstDeviceId, const void* src, - int srcDeviceId, size_t sizeBytes) { - return hipMemcpyPeer(dst, dstDeviceId, src, srcDeviceId, sizeBytes); -} - -cudaError_t cudaMemcpyPeerAsync(void* dst, int dstDevice, const void* src, - int srcDevice, size_t sizeBytes, - cudaStream_t stream) { - return hipMemcpyPeerAsync(dst, dstDevice, src, srcDevice, sizeBytes, - stream); -} - -cudaError_t cudaMemcpy2D(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DFromArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DFromArrayAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DToArray(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy2DToArrayAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy3D(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemcpy3DAsync(...) 
{ - return hipErrorUnknown; -} - -cudaError_t cudaMemset(void* dst, int value, size_t sizeBytes) { - return hipMemset(dst, value, sizeBytes); -} - -cudaError_t cudaMemsetAsync(void* dst, int value, size_t sizeBytes, - cudaStream_t stream) { - return hipMemsetAsync(dst, value, sizeBytes, stream); -} - -cudaError_t cudaMemAdvise(const void *devPtr, size_t count, - cudaMemoryAdvise advice, int device) { -#if HIP_VERSION >= 40300000 - return hipMemAdvise(devPtr, count, advice, device); -#else - return hipErrorUnknown; -#endif -} - -cudaError_t cudaMemPrefetchAsync(const void *devPtr, size_t count, - int dstDevice, cudaStream_t stream) { -#if HIP_VERSION >= 40300000 - return hipMemPrefetchAsync(devPtr, count, dstDevice, stream); -#else - return hipErrorUnknown; -#endif -} - -cudaError_t cudaPointerGetAttributes(cudaPointerAttributes *attributes, - const void* ptr) { - cudaError_t status = hipPointerGetAttributes(attributes, ptr); -#if HIP_VERSION >= 60000000 - if (status == cudaSuccess) { - switch (attributes->type) { - case 0 /* hipMemoryTypeHost */: - attributes->type = (hipMemoryType)1; /* cudaMemoryTypeHost */ - return status; - case 1 /* hipMemoryTypeDevice */: - attributes->type = (hipMemoryType)2; /* cudaMemoryTypeDevice */ - return status; - default: - /* we don't care the rest of possibilities */ - return status; - } - } -#else - if (status == cudaSuccess) { - switch (attributes->memoryType) { - case 0 /* hipMemoryTypeHost */: - attributes->memoryType = (hipMemoryType)1; /* cudaMemoryTypeHost */ - return status; - case 1 /* hipMemoryTypeDevice */: - attributes->memoryType = (hipMemoryType)2; /* cudaMemoryTypeDevice */ - return status; - default: - /* we don't care the rest of possibilities */ - return status; - } - } -#endif - else { - return status; - } -} - -cudaError_t cudaGetDeviceProperties(cudaDeviceProp *prop, int device) { - return hipGetDeviceProperties(prop, device); -} - -cudaError_t cudaMallocFromPoolAsync(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolCreate(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolDestroy(...) { - return hipErrorUnknown; -} - -cudaError_t cudaDeviceGetDefaultMemPool(...) { - return hipErrorUnknown; -} - -cudaError_t cudaDeviceGetMemPool(...) { - return hipErrorUnknown; -} - -cudaError_t cudaDeviceSetMemPool(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolTrimTo(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolGetAttribute(...) { - return hipErrorUnknown; -} - -cudaError_t cudaMemPoolSetAttribute(...) 
{ - return hipErrorUnknown; -} - - // Stream and Event #if HIP_VERSION >= 40300000 typedef hipStreamCaptureMode cudaStreamCaptureMode; @@ -353,185 +17,34 @@ enum cudaStreamCaptureMode {}; enum cudaStreamCaptureStatus {}; #endif -cudaError_t cudaStreamCreate(cudaStream_t *stream) { - return hipStreamCreate(stream); -} - -cudaError_t cudaStreamCreateWithFlags(cudaStream_t *stream, - unsigned int flags) { - return hipStreamCreateWithFlags(stream, flags); -} - -cudaError_t cudaStreamDestroy(cudaStream_t stream) { - return hipStreamDestroy(stream); -} - -cudaError_t cudaStreamSynchronize(cudaStream_t stream) { - return hipStreamSynchronize(stream); -} - -cudaError_t cudaStreamAddCallback(cudaStream_t stream, - cudaStreamCallback_t callback, - void *userData, unsigned int flags) { - return hipStreamAddCallback(stream, callback, userData, flags); -} - -cudaError_t cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void* userData) { - return hipErrorUnknown; -} - -cudaError_t cudaStreamQuery(cudaStream_t stream) { - return hipStreamQuery(stream); -} - -cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, - unsigned int flags) { - return hipStreamWaitEvent(stream, event, flags); -} - -cudaError_t cudaEventCreate(cudaEvent_t* event) { - return hipEventCreate(event); -} - -cudaError_t cudaEventCreateWithFlags(cudaEvent_t* event, unsigned flags) { - return hipEventCreateWithFlags(event, flags); -} - -cudaError_t cudaEventDestroy(cudaEvent_t event) { - return hipEventDestroy(event); -} - -cudaError_t cudaEventElapsedTime(float *ms, cudaEvent_t start, - cudaEvent_t stop){ - return hipEventElapsedTime(ms, start, stop); -} - -cudaError_t cudaEventQuery(cudaEvent_t event) { - return hipEventQuery(event); -} - -cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) { - return hipEventRecord(event, stream); -} - -cudaError_t cudaEventSynchronize(cudaEvent_t event) { - return hipEventSynchronize(event); -} - -cudaError_t cudaStreamBeginCapture(cudaStream_t stream, - cudaStreamCaptureMode mode) { -#if HIP_VERSION >= 40300000 - return hipStreamBeginCapture(stream, mode); -#else - return hipErrorUnknown; -#endif -} - -cudaError_t cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) { -#if HIP_VERSION >= 40300000 - return hipStreamEndCapture(stream, pGraph); -#else - return hipErrorUnknown; -#endif -} - -cudaError_t cudaStreamIsCapturing(cudaStream_t stream, - cudaStreamCaptureStatus* pCaptureStatus) { -#if HIP_VERSION >= 50000000 - return hipStreamIsCapturing(stream, pCaptureStatus); -#else - return hipErrorUnknown; -#endif -} - - -// Texture -cudaError_t cudaCreateTextureObject(...) { - return cudaSuccess; -} - -cudaError_t cudaDestroyTextureObject(...) { - return cudaSuccess; -} - -cudaError_t cudaGetChannelDesc(...) { - return cudaSuccess; -} - -cudaError_t cudaGetTextureObjectResourceDesc(...) { - return cudaSuccess; -} - -cudaError_t cudaGetTextureObjectTextureDesc(...) { - return cudaSuccess; -} - -cudaExtent make_cudaExtent(...) { - cudaExtent ex = {}; - return ex; -} - -cudaPitchedPtr make_cudaPitchedPtr(...) { - cudaPitchedPtr ptr = {}; - return ptr; -} - -cudaPos make_cudaPos(...) 
{ - cudaPos pos = {}; - return pos; -} - -// Surface -cudaError_t cudaCreateSurfaceObject(cudaSurfaceObject_t* pSurfObject, - const cudaResourceDesc* pResDesc) { - return hipCreateSurfaceObject(pSurfObject, pResDesc); -} - -cudaError_t cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) { - return hipDestroySurfaceObject(surfObject); -} - // CUDA Graph +#if 0 < HIP_VERSION < 40300000 cudaError_t cudaGraphInstantiate( cudaGraphExec_t* pGraphExec, cudaGraph_t graph, cudaGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize) { -#if HIP_VERSION >= 40300000 - return hipGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize); -#else return hipErrorUnknown; -#endif } cudaError_t cudaGraphDestroy(cudaGraph_t graph) { -#if HIP_VERSION >= 40300000 - return hipGraphDestroy(graph); -#else return hipErrorUnknown; -#endif } cudaError_t cudaGraphExecDestroy(cudaGraphExec_t graphExec) { -#if HIP_VERSION >= 40300000 - return hipGraphExecDestroy(graphExec); -#else return hipErrorUnknown; -#endif } cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) { -#if HIP_VERSION >= 40300000 - return hipGraphLaunch(graphExec, stream); -#else return hipErrorUnknown; -#endif } +#elif 0 < HIP_VERSION < 50300000 cudaError_t cudaGraphUpload(...) { return hipErrorUnknown; } +#endif } // extern "C" diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index 88f30e59c60..75a79fe320c 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -96,6 +96,7 @@ def _from_dict(d: Dict[str, Any], ctx: Context) -> Feature: 'cupy_backends.cuda.api._driver_enum', 'cupy_backends.cuda.api.runtime', 'cupy_backends.cuda.api._runtime_enum', + 'cupy_backends.cuda.api.runtime_hip', 'cupy_backends.cuda.libs.cublas', 'cupy_backends.cuda.libs.curand', 'cupy_backends.cuda.libs.cusparse', From b799e4ef17aeb533d53f37097a54972923eb29d6 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:36:37 +0000 Subject: [PATCH 02/12] Fix error with MemoryType --- cupy_backends/cuda/api/_runtime_typedef.pxi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cupy_backends/cuda/api/_runtime_typedef.pxi b/cupy_backends/cuda/api/_runtime_typedef.pxi index a4a129e0087..8538eaf189e 100644 --- a/cupy_backends/cuda/api/_runtime_typedef.pxi +++ b/cupy_backends/cuda/api/_runtime_typedef.pxi @@ -116,6 +116,9 @@ cdef extern from *: ctypedef int MemAllocationType 'cudaMemAllocationType' ctypedef int MemAllocationHandleType 'cudaMemAllocationHandleType' ctypedef int MemLocationType 'cudaMemLocationType' + IF CUPY_HIP_VERSION > 0: + ctypedef int MemoryType 'hipMemoryType' + IF CUPY_CUDA_VERSION > 0: # This is for the annoying nested struct, which is not # perfectly supprted in Cython From ab74117a0436a7a3bde8730b9a25787a5cdbb8d7 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:37:00 +0000 Subject: [PATCH 03/12] Update runtime enums --- cupy_backends/cuda/api/_runtime_enum.pxd | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cupy_backends/cuda/api/_runtime_enum.pxd b/cupy_backends/cuda/api/_runtime_enum.pxd index 0e29e87d141..df1870d441a 100644 --- a/cupy_backends/cuda/api/_runtime_enum.pxd +++ b/cupy_backends/cuda/api/_runtime_enum.pxd @@ -33,9 +33,6 @@ cpdef enum: cudaLimitStackSize = 0x00 cudaLimitPrintfFifoSize = 0x01 cudaLimitMallocHeapSize = 0x02 - cudaLimitDevRuntimeSyncDepth = 0x03 - cudaLimitDevRuntimePendingLaunchCount = 0x04 - cudaLimitMaxL2FetchGranularity = 0x05 # 
cudaChannelFormatKind cudaChannelFormatKindSigned = 0 @@ -94,7 +91,6 @@ cpdef enum: # cudaMemLocationType cudaMemLocationTypeDevice = 1 - # This was a legacy mistake: the prefix "cuda" should have been removed # so that we can directly assign their C counterparts here. Now because # of backward compatibility and no flexible Cython macro (IF/ELSE), we @@ -228,6 +224,10 @@ IF CUPY_HIP_VERSION > 0: ELSE: # For CUDA/RTD cpdef enum: + cudaLimitDevRuntimeSyncDepth = 0x03 + cudaLimitDevRuntimePendingLaunchCount = 0x04 + cudaLimitMaxL2FetchGranularity = 0x05 + cudaDevAttrMaxThreadsPerBlock = 1 cudaDevAttrMaxBlockDimX cudaDevAttrMaxBlockDimY @@ -343,3 +343,8 @@ ELSE: cudaDevAttrGPUDirectRDMAFlushWritesOptions cudaDevAttrGPUDirectRDMAWritesOrdering cudaDevAttrMemoryPoolSupportedHandleTypes + +IF CUPY_HIP_VERSION > 0: + cpdef enum: + hipIpcMemLazyEnablePeerAccess = 1 + hipMemAttachGlobal = 1 From 3e6b9b6b260577463c58b187481f3914f66e1325 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:38:17 +0000 Subject: [PATCH 04/12] Update runtime api declaration for HIP --- cupy_backends/cuda/api/runtime.pxd | 357 +++++++++++---------- install/amd_build/rocm_custom_mapping.json | 3 +- 2 files changed, 182 insertions(+), 178 deletions(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 40f2968e446..32b0b7dea00 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -1,26 +1,6 @@ from libc.stdint cimport intptr_t, uintmax_t -############################################################################### -# Classes -############################################################################### - -cdef class PointerAttributes: - cdef: - readonly int device - readonly intptr_t devicePointer - readonly intptr_t hostPointer - readonly int type - -cdef class MemPoolProps: - # flatten the struct & list meaningful members - cdef: - int allocType - int handleType - int locationType - int devId - - ############################################################################### # Types and Enums ############################################################################### @@ -89,11 +69,15 @@ IF CUPY_USE_CUDA_PYTHON: ctypedef cudaMemAllocationHandleType MemAllocationHandleType ctypedef cudaMemLocationType MemLocationType + IF CUPY_HIP_VERSION > 0: + ctypedef hipMemoryType MemoryType ELSE: - include "_runtime_typedef.pxi" + IF CUPY_HIP_VERSION > 0: + include "_runtime_typedef_hip.pxi" + ELSE: + include "_runtime_typedef.pxi" from cupy_backends.cuda.api._runtime_enum cimport * - # For backward compatibility, keep APIs not prefixed with "cuda". cpdef enum: memcpyHostToHost = 0 @@ -158,164 +142,183 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment - -############################################################################### -# Error handling -############################################################################### - -cpdef check_status(int status) - - -############################################################################### -# Initialization -############################################################################### - -cpdef int driverGetVersion() except? -1 -cpdef int runtimeGetVersion() except? -1 - - -############################################################################### -# Device and context operations -############################################################################### - -cpdef int getDevice() except? 
-1 -cpdef int deviceGetAttribute(int attrib, int device) except? -1 -cpdef int deviceGetByPCIBusId(str pci_bus_id) except? -1 -cpdef str deviceGetPCIBusId(int device) -cpdef int getDeviceCount() except? -1 -cpdef setDevice(int device) -cpdef deviceSynchronize() -cpdef getDeviceProperties(int device) - -cpdef int deviceCanAccessPeer(int device, int peerDevice) except? -1 -cpdef deviceEnablePeerAccess(int peerDevice) -cpdef _deviceEnsurePeerAccess(int peerDevice) - -cpdef size_t deviceGetLimit(int limit) except? -1 -cpdef deviceSetLimit(int limit, size_t value) - - -############################################################################### -# Memory management -############################################################################### - -cpdef intptr_t malloc(size_t size) except? 0 -cpdef intptr_t mallocManaged(size_t size, unsigned int flags=*) except? 0 -cpdef intptr_t malloc3DArray(intptr_t desc, size_t width, size_t height, - size_t depth, unsigned int flags=*) except? 0 -cpdef intptr_t mallocArray(intptr_t desc, size_t width, size_t height, - unsigned int flags=*) except? 0 -cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 0 -cpdef intptr_t mallocFromPoolAsync(size_t, intptr_t, intptr_t) except? 0 -cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 0 -cpdef hostRegister(intptr_t ptr, size_t size, unsigned int flags) -cpdef hostUnregister(intptr_t ptr) -cpdef free(intptr_t ptr) -cpdef freeHost(intptr_t ptr) -cpdef freeArray(intptr_t ptr) -cpdef freeAsync(intptr_t ptr, intptr_t stream) -cpdef memGetInfo() -cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind) -cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, - intptr_t stream) -cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, - size_t size) -cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, - intptr_t src, int srcDevice, - size_t size, intptr_t stream) -cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind) -cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind, - intptr_t stream) -cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind) -cpdef memcpy2DFromArrayAsync(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind, intptr_t stream) -cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, size_t height, - int kind) -cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, - size_t height, int kind, intptr_t stream) -cpdef memcpy3D(intptr_t Memcpy3DParmsPtr) -cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream) -cpdef memset(intptr_t ptr, int value, size_t size) -cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream) -cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, - intptr_t stream) -cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device) -cpdef PointerAttributes pointerGetAttributes(intptr_t ptr) -cpdef intptr_t deviceGetDefaultMemPool(int) except? 0 -cpdef intptr_t deviceGetMemPool(int) except? 0 -cpdef deviceSetMemPool(int, intptr_t) -cpdef intptr_t memPoolCreate(MemPoolProps) except? 
0 -cpdef memPoolDestroy(intptr_t) -cpdef memPoolTrimTo(intptr_t, size_t) -cpdef memPoolGetAttribute(intptr_t, int) -cpdef memPoolSetAttribute(intptr_t, int, object) - - -############################################################################### -# Stream and Event -############################################################################### - -cpdef intptr_t streamCreate() except? 0 -cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 0 -cpdef streamDestroy(intptr_t stream) -cpdef streamSynchronize(intptr_t stream) -cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, - unsigned int flags=*) -cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg) -cpdef streamQuery(intptr_t stream) -cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=*) -cpdef streamBeginCapture(intptr_t stream, int mode=*) -cpdef intptr_t streamEndCapture(intptr_t stream) except? 0 -cpdef bint streamIsCapturing(intptr_t stream) except* -cpdef intptr_t eventCreate() except? 0 -cpdef intptr_t eventCreateWithFlags(unsigned int flags) except? 0 -cpdef eventDestroy(intptr_t event) -cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 0 -cpdef eventQuery(intptr_t event) -cpdef eventRecord(intptr_t event, intptr_t stream) -cpdef eventSynchronize(intptr_t event) - - -############################################################################## -# util -############################################################################## - -cdef _ensure_context() - - -############################################################################## -# Texture -############################################################################## - -cpdef uintmax_t createTextureObject( +IF CUPY_DONT_USE_GEN_HIP_CODE: + ############################################################################### + # Classes + ############################################################################### + + cdef class PointerAttributes: + cdef: + readonly int device + readonly intptr_t devicePointer + readonly intptr_t hostPointer + readonly int type + + cdef class MemPoolProps: + # flatten the struct & list meaningful members + cdef: + int allocType + int handleType + int locationType + int devId + + ############################################################################### + # Error handling + ############################################################################### + + cpdef check_status(int status) + + + ############################################################################### + # Initialization + ############################################################################### + + cpdef int driverGetVersion() except? -1 + cpdef int runtimeGetVersion() except? -1 + + + ############################################################################### + # Device and context operations + ############################################################################### + + cpdef int getDevice() except? -1 + cpdef int deviceGetAttribute(int attrib, int device) except? -1 + cpdef int deviceGetByPCIBusId(str pci_bus_id) except? -1 + cpdef str deviceGetPCIBusId(int device) + cpdef int getDeviceCount() except? -1 + cpdef setDevice(int device) + cpdef deviceSynchronize() + cpdef getDeviceProperties(int device) + + cpdef int deviceCanAccessPeer(int device, int peerDevice) except? -1 + cpdef deviceEnablePeerAccess(int peerDevice) + cpdef _deviceEnsurePeerAccess(int peerDevice) + + cpdef size_t deviceGetLimit(int limit) except? 
-1 + cpdef deviceSetLimit(int limit, size_t value) + + + ############################################################################### + # Memory management + ############################################################################### + + cpdef intptr_t malloc(size_t size) except? 0 + cpdef intptr_t mallocManaged(size_t size, unsigned int flags=*) except? 0 + cpdef intptr_t malloc3DArray(intptr_t desc, size_t width, size_t height, + size_t depth, unsigned int flags=*) except? 0 + cpdef intptr_t mallocArray(intptr_t desc, size_t width, size_t height, + unsigned int flags=*) except? 0 + cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 0 + cpdef intptr_t mallocFromPoolAsync(size_t, intptr_t, intptr_t) except? 0 + cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 0 + cpdef hostRegister(intptr_t ptr, size_t size, unsigned int flags) + cpdef hostUnregister(intptr_t ptr) + cpdef free(intptr_t ptr) + cpdef freeHost(intptr_t ptr) + cpdef freeArray(intptr_t ptr) + cpdef freeAsync(intptr_t ptr, intptr_t stream) + cpdef memGetInfo() + cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind) + cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, + intptr_t stream) + cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, + size_t size) + cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, + intptr_t src, int srcDevice, + size_t size, intptr_t stream) + cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, + size_t width, size_t height, MemoryKind kind) + cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, + size_t width, size_t height, MemoryKind kind, + intptr_t stream) + cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind) + cpdef memcpy2DFromArrayAsync(intptr_t dst, size_t dpitch, intptr_t src, + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind, intptr_t stream) + cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, + intptr_t src, size_t spitch, size_t width, size_t height, + int kind) + cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, + intptr_t src, size_t spitch, size_t width, + size_t height, int kind, intptr_t stream) + cpdef memcpy3D(intptr_t Memcpy3DParmsPtr) + cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream) + cpdef memset(intptr_t ptr, int value, size_t size) + cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream) + cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, + intptr_t stream) + cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device) + cpdef PointerAttributes pointerGetAttributes(intptr_t ptr) + cpdef intptr_t deviceGetDefaultMemPool(int) except? 0 + cpdef intptr_t deviceGetMemPool(int) except? 0 + cpdef deviceSetMemPool(int, intptr_t) + cpdef intptr_t memPoolCreate(MemPoolProps) except? 0 + cpdef memPoolDestroy(intptr_t) + cpdef memPoolTrimTo(intptr_t, size_t) + cpdef memPoolGetAttribute(intptr_t, int) + cpdef memPoolSetAttribute(intptr_t, int, object) + + + ############################################################################### + # Stream and Event + ############################################################################### + + cpdef intptr_t streamCreate() except? 0 + cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 
0 + cpdef streamDestroy(intptr_t stream) + cpdef streamSynchronize(intptr_t stream) + cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, + unsigned int flags=*) + cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg) + cpdef streamQuery(intptr_t stream) + cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=*) + cpdef streamBeginCapture(intptr_t stream, int mode=*) + cpdef intptr_t streamEndCapture(intptr_t stream) except? 0 + cpdef bint streamIsCapturing(intptr_t stream) except* + cpdef intptr_t eventCreate() except? 0 + cpdef intptr_t eventCreateWithFlags(unsigned int flags) except? 0 + cpdef eventDestroy(intptr_t event) + cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 0 + cpdef eventQuery(intptr_t event) + cpdef eventRecord(intptr_t event, intptr_t stream) + cpdef eventSynchronize(intptr_t event) + + + ############################################################################## + # util + ############################################################################## + + cdef _ensure_context() + + + ############################################################################## + # Texture + ############################################################################## + + cpdef uintmax_t createTextureObject( intptr_t ResDesc, intptr_t TexDesc) except? 0 -cpdef destroyTextureObject(uintmax_t texObject) -cdef ChannelFormatDesc getChannelDesc(intptr_t array) except* -cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t texobj) except* -cdef TextureDesc getTextureObjectTextureDesc(uintmax_t texobj) except* -cdef Extent make_Extent(size_t w, size_t h, size_t d) except* -cdef Pos make_Pos(size_t x, size_t y, size_t z) except* -cdef PitchedPtr make_PitchedPtr( + cpdef destroyTextureObject(uintmax_t texObject) + cdef ChannelFormatDesc getChannelDesc(intptr_t array) except* + cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t texobj) except* + cdef TextureDesc getTextureObjectTextureDesc(uintmax_t texobj) except* + cdef Extent make_Extent(size_t w, size_t h, size_t d) except* + cdef Pos make_Pos(size_t x, size_t y, size_t z) except* + cdef PitchedPtr make_PitchedPtr( intptr_t d, size_t p, size_t xsz, size_t ysz) except* -cpdef uintmax_t createSurfaceObject(intptr_t ResDesc) except? 0 -cpdef destroySurfaceObject(uintmax_t surfObject) -# TODO(leofang): add cudaGetSurfaceObjectResourceDesc + cpdef uintmax_t createSurfaceObject(intptr_t ResDesc) except? 0 + cpdef destroySurfaceObject(uintmax_t surfObject) + # TODO(leofang): add cudaGetSurfaceObjectResourceDesc -############################################################################## -# Graph -############################################################################## + ############################################################################## + # Graph + ############################################################################## -cpdef graphDestroy(intptr_t graph) -cpdef graphExecDestroy(intptr_t graphExec) -cpdef intptr_t graphInstantiate(intptr_t graph) except? 0 -cpdef graphLaunch(intptr_t graphExec, intptr_t stream) -cpdef graphUpload(intptr_t graphExec, intptr_t stream) + cpdef graphDestroy(intptr_t graph) + cpdef graphExecDestroy(intptr_t graphExec) + cpdef intptr_t graphInstantiate(intptr_t graph) except? 
0 + cpdef graphLaunch(intptr_t graphExec, intptr_t stream) + cpdef graphUpload(intptr_t graphExec, intptr_t stream) diff --git a/install/amd_build/rocm_custom_mapping.json b/install/amd_build/rocm_custom_mapping.json index 728ed421b85..4ac4e0b978e 100644 --- a/install/amd_build/rocm_custom_mapping.json +++ b/install/amd_build/rocm_custom_mapping.json @@ -1,5 +1,6 @@ { "custom_map": { - "CUPY_USE_GEN_HIP_CODE" : "CUPY_DONT_USE_GEN_HIP_CODE" + "CUPY_USE_GEN_HIP_CODE" : "CUPY_DONT_USE_GEN_HIP_CODE", + "CUPY_DONT_USE_GEN_HIP_CODE" : "CUPY_USE_GEN_HIP_CODE" } } From 0afc224434728ee9b4b3f5761a37f2ddeefa84bf Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 27 Mar 2024 21:39:12 +0000 Subject: [PATCH 05/12] Fix cimport error with runtime --- cupy/_core/_accelerator.pyx | 5 ++++- cupy/_core/_cub_reduction.pyx | 5 ++++- cupy/_core/_dtype.pyx | 5 ++++- cupy/_core/_fusion_kernel.pyx | 5 ++++- cupy/_core/_kernel.pyx | 5 ++++- cupy/_core/_reduction.pyx | 5 ++++- cupy/_core/_routines_linalg.pyx | 5 ++++- cupy/_core/_routines_math.pyx | 5 ++++- cupy/_core/core.pyx | 5 ++++- cupy/_core/dlpack.pyx | 5 ++++- cupy/_core/raw.pyx | 5 ++++- cupy/cuda/common.pyx | 5 ++++- cupy/cuda/cub.pyx | 5 ++++- cupy/cuda/device.pyx | 5 ++++- cupy/cuda/function.pyx | 5 ++++- cupy/cuda/graph.pyx | 5 ++++- cupy/cuda/memory.pyx | 5 ++++- cupy/cuda/pinned_memory.pyx | 5 ++++- cupy/cuda/stream.pyx | 5 ++++- cupy/cuda/texture.pyx | 5 ++++- cupy/fft/_cache.pyx | 5 ++++- cupy/fft/_callback.pyx | 5 ++++- cupy_backends/cuda/libs/_cnvrtc.pxi | 5 ++++- cupy_backends/cuda/libs/cublas.pyx | 5 ++++- cupy_backends/cuda/libs/cudnn.pyx | 5 ++++- cupy_backends/cuda/libs/curand.pyx | 5 ++++- cupy_backends/cuda/libs/cusolver.pyx | 5 ++++- cupy_backends/cuda/libs/cusparse.pyx | 5 ++++- cupy_backends/cuda/libs/nccl.pyx | 5 ++++- cupy_backends/cuda/libs/nvrtc.pyx | 5 ++++- cupy_backends/cuda/libs/profiler.pyx | 5 ++++- cupy_backends/cuda/stream.pyx | 5 ++++- 32 files changed, 128 insertions(+), 32 deletions(-) diff --git a/cupy/_core/_accelerator.pyx b/cupy/_core/_accelerator.pyx index 6ce2991e786..00e2bfc6ebf 100644 --- a/cupy/_core/_accelerator.pyx +++ b/cupy/_core/_accelerator.pyx @@ -1,6 +1,9 @@ import os -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime cdef list _elementwise_accelerators = [] diff --git a/cupy/_core/_cub_reduction.pyx b/cupy/_core/_cub_reduction.pyx index 95ed9bce145..d3336fb1a4a 100644 --- a/cupy/_core/_cub_reduction.pyx +++ b/cupy/_core/_cub_reduction.pyx @@ -10,7 +10,10 @@ from cupy._core cimport internal from cupy.cuda cimport cub from cupy.cuda cimport function from cupy.cuda cimport memory -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime import math import string diff --git a/cupy/_core/_dtype.pyx b/cupy/_core/_dtype.pyx index e5bc7251c14..57963c40792 100644 --- a/cupy/_core/_dtype.pyx +++ b/cupy/_core/_dtype.pyx @@ -2,7 +2,10 @@ cimport cython # NOQA import numpy import warnings -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime all_type_chars = '?bhilqBHILQefdFD' diff --git a/cupy/_core/_fusion_kernel.pyx b/cupy/_core/_fusion_kernel.pyx index ed8f83738b4..928dbb43c5b 100644 --- a/cupy/_core/_fusion_kernel.pyx +++ 
b/cupy/_core/_fusion_kernel.pyx @@ -10,7 +10,10 @@ from cupy._core.core cimport _ndarray_base from cupy._core cimport internal from cupy._core cimport _routines_manipulation as _manipulation from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime import cupy as _cupy from cupy._core import _dtype diff --git a/cupy/_core/_kernel.pyx b/cupy/_core/_kernel.pyx index 1e719f6f6fc..f17af4120a1 100644 --- a/cupy/_core/_kernel.pyx +++ b/cupy/_core/_kernel.pyx @@ -27,7 +27,10 @@ from cupy._core.core cimport _ndarray_init from cupy._core.core cimport compile_with_cache from cupy._core.core cimport _ndarray_base from cupy._core cimport internal -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime try: import cupy_backends.cuda.libs.cutensor as cuda_cutensor diff --git a/cupy/_core/_reduction.pyx b/cupy/_core/_reduction.pyx index 0046ce801ca..374a9978b48 100644 --- a/cupy/_core/_reduction.pyx +++ b/cupy/_core/_reduction.pyx @@ -25,7 +25,10 @@ from cupy._core.core cimport _ndarray_base from cupy._core cimport internal from cupy.cuda cimport device from cupy.cuda cimport function -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime import math import string diff --git a/cupy/_core/_routines_linalg.pyx b/cupy/_core/_routines_linalg.pyx index 25fde4cf839..7b1373a6e55 100644 --- a/cupy/_core/_routines_linalg.pyx +++ b/cupy/_core/_routines_linalg.pyx @@ -26,7 +26,10 @@ from cupy._core cimport _memory_range from cupy._core cimport _routines_manipulation as _manipulation from cupy._core cimport _routines_math as _math from cupy.cuda cimport device -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime cdef extern from '../../cupy_backends/cupy_complex.h': diff --git a/cupy/_core/_routines_math.pyx b/cupy/_core/_routines_math.pyx index 440c8cbfa70..b040a34b6cc 100644 --- a/cupy/_core/_routines_math.pyx +++ b/cupy/_core/_routines_math.pyx @@ -11,7 +11,10 @@ import cupy._core.core as core from cupy._core cimport internal from cupy import _util -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy._core cimport _accelerator from cupy._core._dtype cimport get_dtype from cupy._core.core cimport _ndarray_init diff --git a/cupy/_core/core.pyx b/cupy/_core/core.pyx index 8f44ceafe5b..7276b8fda47 100644 --- a/cupy/_core/core.pyx +++ b/cupy/_core/core.pyx @@ -47,7 +47,10 @@ from cupy.cuda cimport pinned_memory from cupy.cuda cimport memory from cupy.cuda cimport stream as stream_module from cupy_backends.cuda cimport stream as _stream_module -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime # If rop of cupy.ndarray is called, cupy's op is the last chance. 
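The cimport hunks in this patch all apply one compile-time dispatch: when the build targets ROCm (CUPY_HIP_VERSION > 0), the `cimport` of cupy_backends.cuda.api.runtime is redirected to the generated cupy_backends.cuda.api.runtime_hip module (registered in install/cupy_builder/_features.py by PATCH 01), and the hand-written bindings are used otherwise. The Cython IF/ELSE blocks are resolved when the extensions are cythonized, so there is no run-time branching. A rough pure-Python sketch of the same selection, for illustration only (the module paths and runtimeGetVersion are taken from the patch; the try/except fallback is an assumption, since the real choice is made at compile time):

    # Illustrative sketch, not part of the patch: the actual selection happens
    # at Cython compile time via "IF CUPY_HIP_VERSION > 0", not at import time.
    try:
        # Generated, hipified runtime bindings.
        from cupy_backends.cuda.api import runtime_hip as runtime
    except ImportError:
        # CUDA (or non-hipified) build: fall back to the hand-written bindings.
        from cupy_backends.cuda.api import runtime

    print(runtime.runtimeGetVersion())
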
diff --git a/cupy/_core/dlpack.pyx b/cupy/_core/dlpack.pyx index 1da7155b87a..68eac765c87 100644 --- a/cupy/_core/dlpack.pyx +++ b/cupy/_core/dlpack.pyx @@ -9,7 +9,10 @@ from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t from libcpp.vector cimport vector -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module from cupy._core.core cimport _ndarray_base from cupy.cuda cimport memory diff --git a/cupy/_core/raw.pyx b/cupy/_core/raw.pyx index c9905382d96..2a766037e33 100644 --- a/cupy/_core/raw.pyx +++ b/cupy/_core/raw.pyx @@ -3,7 +3,10 @@ import pickle import cupy from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy.cuda.function cimport Function, Module diff --git a/cupy/cuda/common.pyx b/cupy/cuda/common.pyx index 120c2399da7..caa1f2db0ac 100644 --- a/cupy/cuda/common.pyx +++ b/cupy/cuda/common.pyx @@ -1,4 +1,7 @@ -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy.cuda cimport device import numpy diff --git a/cupy/cuda/cub.pyx b/cupy/cuda/cub.pyx index 8b62e7bc5b6..ebc7b84c4d3 100644 --- a/cupy/cuda/cub.pyx +++ b/cupy/cuda/cub.pyx @@ -5,7 +5,10 @@ from cpython cimport sequence from libc.stdint cimport intptr_t -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy._core.core cimport _internal_ascontiguousarray from cupy._core.internal cimport _contig_axes, is_in from cupy.cuda cimport common diff --git a/cupy/cuda/device.pyx b/cupy/cuda/device.pyx index 034d3f6aefd..cdde85dd397 100644 --- a/cupy/cuda/device.pyx +++ b/cupy/cuda/device.pyx @@ -3,7 +3,10 @@ import threading from cupy._core import syncdetect -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda.api import runtime as runtime_module from cupy import _util diff --git a/cupy/cuda/function.pyx b/cupy/cuda/function.pyx index 9d7c4db7fbe..b1389d86e01 100644 --- a/cupy/cuda/function.pyx +++ b/cupy/cuda/function.pyx @@ -14,7 +14,10 @@ from libcpp cimport vector from cupy._core cimport _carray from cupy._core.core cimport _ndarray_base from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy.cuda cimport stream as stream_module from cupy.cuda.memory cimport MemoryPointer from cupy.cuda.texture cimport TextureObject, SurfaceObject diff --git a/cupy/cuda/graph.pyx b/cupy/cuda/graph.pyx index 005cf0be11b..a5a2a9a5597 100644 --- a/cupy/cuda/graph.pyx +++ b/cupy/cuda/graph.pyx @@ -1,4 +1,7 @@ -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module diff --git 
a/cupy/cuda/memory.pyx b/cupy/cuda/memory.pyx index e17cbcdd2e7..1c0a7f2251d 100644 --- a/cupy/cuda/memory.pyx +++ b/cupy/cuda/memory.pyx @@ -20,7 +20,10 @@ from cupy.cuda cimport device from cupy.cuda cimport memory_hook from cupy.cuda cimport stream as stream_module from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda.api.runtime import CUDARuntimeError from cupy import _util diff --git a/cupy/cuda/pinned_memory.pyx b/cupy/cuda/pinned_memory.pyx index 3e88580364c..4e3e0220510 100644 --- a/cupy/cuda/pinned_memory.pyx +++ b/cupy/cuda/pinned_memory.pyx @@ -8,7 +8,10 @@ from fastrlock cimport rlock from cupy_backends.cuda.api import runtime from cupy._core cimport internal -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy import _util diff --git a/cupy/cuda/stream.pyx b/cupy/cuda/stream.pyx index 96e34e6618f..6809d34e399 100644 --- a/cupy/cuda/stream.pyx +++ b/cupy/cuda/stream.pyx @@ -1,6 +1,9 @@ import threading -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as backends_stream from cupy.cuda cimport graph diff --git a/cupy/cuda/texture.pyx b/cupy/cuda/texture.pyx index d66cbb1ad82..33a2d3f7ec9 100644 --- a/cupy/cuda/texture.pyx +++ b/cupy/cuda/texture.pyx @@ -5,7 +5,10 @@ import numpy from cupy._core.core cimport _ndarray_base from cupy._core.core cimport _internal_ascontiguousarray -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda.api.runtime cimport Array, \ ChannelFormatDesc, ChannelFormatKind, \ Memcpy3DParms, MemoryKind, PitchedPtr, ResourceDesc, ResourceType, \ diff --git a/cupy/fft/_cache.pyx b/cupy/fft/_cache.pyx index 9eae6e273d0..8160877c829 100644 --- a/cupy/fft/_cache.pyx +++ b/cupy/fft/_cache.pyx @@ -4,7 +4,10 @@ import gc import warnings import weakref -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime import threading diff --git a/cupy/fft/_callback.pyx b/cupy/fft/_callback.pyx index 443835680d4..80a274cce32 100644 --- a/cupy/fft/_callback.pyx +++ b/cupy/fft/_callback.pyx @@ -1,6 +1,9 @@ from libc.stdint cimport intptr_t -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy._core.core cimport _ndarray_base from cupy.cuda.device cimport get_compute_capability diff --git a/cupy_backends/cuda/libs/_cnvrtc.pxi b/cupy_backends/cuda/libs/_cnvrtc.pxi index 0d22cd26486..4184189c53c 100644 --- a/cupy_backends/cuda/libs/_cnvrtc.pxi +++ b/cupy_backends/cuda/libs/_cnvrtc.pxi @@ -1,6 +1,9 @@ import sys as _sys -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda._softlink cimport SoftLink diff --git 
a/cupy_backends/cuda/libs/cublas.pyx b/cupy_backends/cuda/libs/cublas.pyx index 05690ad3bf5..5aea68b0412 100644 --- a/cupy_backends/cuda/libs/cublas.pyx +++ b/cupy_backends/cuda/libs/cublas.pyx @@ -4,7 +4,10 @@ cimport cython # NOQA -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module IF CUPY_USE_GEN_HIP_CODE: diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 464c59d8a00..479c19cc772 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -6,7 +6,10 @@ cimport cython # NOQA from libcpp cimport vector from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module ############################################################################### diff --git a/cupy_backends/cuda/libs/curand.pyx b/cupy_backends/cuda/libs/curand.pyx index 8f254382392..05e099032ba 100644 --- a/cupy_backends/cuda/libs/curand.pyx +++ b/cupy_backends/cuda/libs/curand.pyx @@ -3,7 +3,10 @@ """Thin wrapper of cuRAND.""" cimport cython # NOQA -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module ############################################################################### diff --git a/cupy_backends/cuda/libs/cusolver.pyx b/cupy_backends/cuda/libs/cusolver.pyx index a421fe46f46..c4f6c58ec1d 100644 --- a/cupy_backends/cuda/libs/cusolver.pyx +++ b/cupy_backends/cuda/libs/cusolver.pyx @@ -4,7 +4,10 @@ cimport cython # NOQA -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module IF CUPY_USE_GEN_HIP_CODE: diff --git a/cupy_backends/cuda/libs/cusparse.pyx b/cupy_backends/cuda/libs/cusparse.pyx index 0b7a99c7018..ad499ffe4ca 100644 --- a/cupy_backends/cuda/libs/cusparse.pyx +++ b/cupy_backends/cuda/libs/cusparse.pyx @@ -1,7 +1,10 @@ import sys as _sys # no-cython-lint cimport cython # NOQA -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda.api.runtime cimport _is_hip_environment from cupy_backends.cuda cimport stream as stream_module from cupy_backends.cuda._softlink cimport SoftLink diff --git a/cupy_backends/cuda/libs/nccl.pyx b/cupy_backends/cuda/libs/nccl.pyx index 5dd2b9c40ec..daa07aace8d 100644 --- a/cupy_backends/cuda/libs/nccl.pyx +++ b/cupy_backends/cuda/libs/nccl.pyx @@ -9,7 +9,10 @@ from libc.stdint cimport intptr_t from libcpp cimport vector from cupy_backends.cuda.api cimport driver -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime cdef extern from '../../cupy_nccl.h': ctypedef struct ncclComm: diff --git a/cupy_backends/cuda/libs/nvrtc.pyx b/cupy_backends/cuda/libs/nvrtc.pyx index 
2199837be36..788ee86598a 100644 --- a/cupy_backends/cuda/libs/nvrtc.pyx +++ b/cupy_backends/cuda/libs/nvrtc.pyx @@ -14,7 +14,10 @@ There are four differences compared to the original C API. cimport cython # NOQA from libcpp cimport vector -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime ############################################################################### diff --git a/cupy_backends/cuda/libs/profiler.pyx b/cupy_backends/cuda/libs/profiler.pyx index db27cedc2bb..0188a729947 100644 --- a/cupy_backends/cuda/libs/profiler.pyx +++ b/cupy_backends/cuda/libs/profiler.pyx @@ -1,7 +1,10 @@ # distutils: language = c++ """Thin wrapper of cuda profiler.""" -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime # TODO(kmaehashi): cudaProfilerInitialize is deprecated thus unsupported in diff --git a/cupy_backends/cuda/stream.pyx b/cupy_backends/cuda/stream.pyx index ce8340089a0..9bd7448ad3a 100755 --- a/cupy_backends/cuda/stream.pyx +++ b/cupy_backends/cuda/stream.pyx @@ -1,7 +1,10 @@ import os as _os import threading as _threading -from cupy_backends.cuda.api cimport runtime +IF CUPY_HIP_VERSION > 0: + from cupy_backends.cuda.api cimport runtime_hip as runtime +ELSE: + from cupy_backends.cuda.api cimport runtime cdef object _thread_local = _threading.local() From 792e31219adc744135573f07a2423190ec937e08 Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:15:06 -0500 Subject: [PATCH 06/12] Fix api declaration for cuda --- cupy_backends/cuda/api/runtime.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 32b0b7dea00..45623ba2d5f 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -142,7 +142,7 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment -IF CUPY_DONT_USE_GEN_HIP_CODE: +IF CUPY_DONT_USE_GEN_HIP_CODE or CUPY_CUDA_VERSION != 0: ############################################################################### # Classes ############################################################################### From beabf96464f1fdee219e77c869405f777cee51cf Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:28:35 -0500 Subject: [PATCH 07/12] Update runtime.pxd --- cupy_backends/cuda/api/runtime.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 45623ba2d5f..20b1889969e 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -142,7 +142,7 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment -IF CUPY_DONT_USE_GEN_HIP_CODE or CUPY_CUDA_VERSION != 0: +IF CUPY_CUDA_VERSION != 0 or CUPY_DONT_USE_GEN_HIP_CODE: ############################################################################### # Classes ############################################################################### From a3cefaf0fa1f77e57d148220874b6fae46218f62 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 28 Mar 2024 14:02:42 +0000 Subject: [PATCH 08/12] Define DONT_USE env var for cuda and stub --- cupy_backends/cuda/api/runtime.pxd | 2 +- 
install/cupy_builder/_command.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 20b1889969e..32b0b7dea00 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -142,7 +142,7 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment -IF CUPY_CUDA_VERSION != 0 or CUPY_DONT_USE_GEN_HIP_CODE: +IF CUPY_DONT_USE_GEN_HIP_CODE: ############################################################################### # Classes ############################################################################### diff --git a/install/cupy_builder/_command.py b/install/cupy_builder/_command.py index 5526a926d63..b31c17488d6 100644 --- a/install/cupy_builder/_command.py +++ b/install/cupy_builder/_command.py @@ -123,6 +123,7 @@ def _cythonize(self, nthreads: int) -> None: compile_time_env['CUPY_CUDA_VERSION'] = 0 compile_time_env['CUPY_HIP_VERSION'] = 0 compile_time_env['CUPY_USE_GEN_HIP_CODE'] = 0 + compile_time_env['CUPY_DONT_USE_GEN_HIP_CODE'] = 1 elif ctx.use_hip: # on ROCm/HIP compile_time_env['CUPY_CUDA_VERSION'] = 0 compile_time_env['CUPY_HIP_VERSION'] = build.get_hip_version() @@ -133,6 +134,7 @@ def _cythonize(self, nthreads: int) -> None: ctx.features['cuda'].get_version()) compile_time_env['CUPY_HIP_VERSION'] = 0 compile_time_env['CUPY_USE_GEN_HIP_CODE'] = 0 + compile_time_env['CUPY_DONT_USE_GEN_HIP_CODE'] = 1 print('Compile-time constants: ' + json.dumps(compile_time_env, indent=4)) From b91eed488bfaa0b387b4027d99e93b279791d3f4 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 28 Mar 2024 15:57:13 +0000 Subject: [PATCH 09/12] Add runtime error for missing functions --- cupy_backends/cuda/api/runtime.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cupy_backends/cuda/api/runtime.pyx b/cupy_backends/cuda/api/runtime.pyx index 34f12afecb0..06ea5bff1d1 100644 --- a/cupy_backends/cuda/api/runtime.pyx +++ b/cupy_backends/cuda/api/runtime.pyx @@ -415,6 +415,8 @@ ELSE: return value cpdef deviceSetLimit(int limit, size_t value): + if 0 < CUPY_HIP_VERSION < 50300000: + raise RuntimeError('deviceSetLimit requires ROCm 5.3+') status = cudaDeviceSetLimit(limit, value) check_status(status) @@ -1120,6 +1122,8 @@ ELSE: check_status(status) cpdef graphUpload(intptr_t graphExec, intptr_t stream): + if 0 < CUPY_HIP_VERSION < 50300000: + raise RuntimeError('graphUpload requires ROCm 5.3+') if runtimeGetVersion() < 11010: raise RuntimeError('graphUpload is supported since CUDA 11.1+') with nogil: From 126dbae8e44e15bc1d7d3618a97e073dd633210d Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 28 Mar 2024 17:58:23 +0000 Subject: [PATCH 10/12] Fix pre-commit errors --- cupy_backends/cuda/api/runtime.pxd | 90 ++++++++++++++---------------- cupy_backends/cuda/api/runtime.pyx | 87 +++++++++++------------------ 2 files changed, 74 insertions(+), 103 deletions(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 32b0b7dea00..88ff564bfd7 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -143,9 +143,9 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment IF CUPY_DONT_USE_GEN_HIP_CODE: - ############################################################################### + ########################################################################### # Classes - ############################################################################### + 
########################################################################### cdef class PointerAttributes: cdef: @@ -162,24 +162,22 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: int locationType int devId - ############################################################################### + ########################################################################### # Error handling - ############################################################################### + ########################################################################### cpdef check_status(int status) - - ############################################################################### + ########################################################################### # Initialization - ############################################################################### + ########################################################################### cpdef int driverGetVersion() except? -1 cpdef int runtimeGetVersion() except? -1 - - ############################################################################### + ########################################################################### # Device and context operations - ############################################################################### + ########################################################################### cpdef int getDevice() except? -1 cpdef int deviceGetAttribute(int attrib, int device) except? -1 @@ -197,17 +195,16 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cpdef size_t deviceGetLimit(int limit) except? -1 cpdef deviceSetLimit(int limit, size_t value) - - ############################################################################### + ########################################################################### # Memory management - ############################################################################### + ########################################################################### cpdef intptr_t malloc(size_t size) except? 0 cpdef intptr_t mallocManaged(size_t size, unsigned int flags=*) except? 0 cpdef intptr_t malloc3DArray(intptr_t desc, size_t width, size_t height, - size_t depth, unsigned int flags=*) except? 0 + size_t depth, unsigned int flags=*) except? 0 cpdef intptr_t mallocArray(intptr_t desc, size_t width, size_t height, - unsigned int flags=*) except? 0 + unsigned int flags=*) except? 0 cpdef intptr_t mallocAsync(size_t size, intptr_t stream) except? 0 cpdef intptr_t mallocFromPoolAsync(size_t, intptr_t, intptr_t) except? 0 cpdef intptr_t hostAlloc(size_t size, unsigned int flags) except? 
0 @@ -220,35 +217,35 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cpdef memGetInfo() cpdef memcpy(intptr_t dst, intptr_t src, size_t size, int kind) cpdef memcpyAsync(intptr_t dst, intptr_t src, size_t size, int kind, - intptr_t stream) + intptr_t stream) cpdef memcpyPeer(intptr_t dst, int dstDevice, intptr_t src, int srcDevice, - size_t size) + size_t size) cpdef memcpyPeerAsync(intptr_t dst, int dstDevice, - intptr_t src, int srcDevice, - size_t size, intptr_t stream) + intptr_t src, int srcDevice, + size_t size, intptr_t stream) cpdef memcpy2D(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind) - cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, size_t spitch, - size_t width, size_t height, MemoryKind kind, - intptr_t stream) + size_t width, size_t height, MemoryKind kind) + cpdef memcpy2DAsync(intptr_t dst, size_t dpitch, intptr_t src, + size_t spitch, size_t width, size_t height, + MemoryKind kind, intptr_t stream) cpdef memcpy2DFromArray(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind) + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind) cpdef memcpy2DFromArrayAsync(intptr_t dst, size_t dpitch, intptr_t src, - size_t wOffset, size_t hOffset, size_t width, - size_t height, int kind, intptr_t stream) + size_t wOffset, size_t hOffset, size_t width, + size_t height, int kind, intptr_t stream) cpdef memcpy2DToArray(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, size_t height, - int kind) + intptr_t src, size_t spitch, size_t width, + size_t height, int kind) cpdef memcpy2DToArrayAsync(intptr_t dst, size_t wOffset, size_t hOffset, - intptr_t src, size_t spitch, size_t width, - size_t height, int kind, intptr_t stream) + intptr_t src, size_t spitch, size_t width, + size_t height, int kind, intptr_t stream) cpdef memcpy3D(intptr_t Memcpy3DParmsPtr) cpdef memcpy3DAsync(intptr_t Memcpy3DParmsPtr, intptr_t stream) cpdef memset(intptr_t ptr, int value, size_t size) cpdef memsetAsync(intptr_t ptr, int value, size_t size, intptr_t stream) cpdef memPrefetchAsync(intptr_t devPtr, size_t count, int dstDevice, - intptr_t stream) + intptr_t stream) cpdef memAdvise(intptr_t devPtr, size_t count, int advice, int device) cpdef PointerAttributes pointerGetAttributes(intptr_t ptr) cpdef intptr_t deviceGetDefaultMemPool(int) except? 0 @@ -260,20 +257,20 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cpdef memPoolGetAttribute(intptr_t, int) cpdef memPoolSetAttribute(intptr_t, int, object) - - ############################################################################### + ########################################################################### # Stream and Event - ############################################################################### + ########################################################################### cpdef intptr_t streamCreate() except? 0 cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 
0 cpdef streamDestroy(intptr_t stream) cpdef streamSynchronize(intptr_t stream) cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, - unsigned int flags=*) + unsigned int flags=*) cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg) cpdef streamQuery(intptr_t stream) - cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=*) + cpdef streamWaitEvent(intptr_t stream, intptr_t event, + unsigned int flags=*) cpdef streamBeginCapture(intptr_t stream, int mode=*) cpdef intptr_t streamEndCapture(intptr_t stream) except? 0 cpdef bint streamIsCapturing(intptr_t stream) except* @@ -285,20 +282,18 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cpdef eventRecord(intptr_t event, intptr_t stream) cpdef eventSynchronize(intptr_t event) - - ############################################################################## + ########################################################################### # util - ############################################################################## + ########################################################################### cdef _ensure_context() - - ############################################################################## + ########################################################################### # Texture - ############################################################################## + ########################################################################### cpdef uintmax_t createTextureObject( - intptr_t ResDesc, intptr_t TexDesc) except? 0 + intptr_t ResDesc, intptr_t TexDesc) except? 0 cpdef destroyTextureObject(uintmax_t texObject) cdef ChannelFormatDesc getChannelDesc(intptr_t array) except* cdef ResourceDesc getTextureObjectResourceDesc(uintmax_t texobj) except* @@ -306,16 +301,15 @@ IF CUPY_DONT_USE_GEN_HIP_CODE: cdef Extent make_Extent(size_t w, size_t h, size_t d) except* cdef Pos make_Pos(size_t x, size_t y, size_t z) except* cdef PitchedPtr make_PitchedPtr( - intptr_t d, size_t p, size_t xsz, size_t ysz) except* + intptr_t d, size_t p, size_t xsz, size_t ysz) except* cpdef uintmax_t createSurfaceObject(intptr_t ResDesc) except? 
0 cpdef destroySurfaceObject(uintmax_t surfObject) # TODO(leofang): add cudaGetSurfaceObjectResourceDesc - - ############################################################################## + ########################################################################### # Graph - ############################################################################## + ########################################################################### cpdef graphDestroy(intptr_t graph) cpdef graphExecDestroy(intptr_t graphExec) diff --git a/cupy_backends/cuda/api/runtime.pyx b/cupy_backends/cuda/api/runtime.pyx index 06ea5bff1d1..c2988853aef 100644 --- a/cupy_backends/cuda/api/runtime.pyx +++ b/cupy_backends/cuda/api/runtime.pyx @@ -45,14 +45,12 @@ ELSE: self.locationType = locationType self.devId = devId - ########################################################################### # Thread-local storage ########################################################################### cdef object _thread_local = _threading.local() - cdef class _ThreadLocal: cdef list context_initialized @@ -69,7 +67,6 @@ ELSE: tls = _thread_local.tls = _ThreadLocal() return <_ThreadLocal>tls - ########################################################################### # Extern ########################################################################### @@ -86,7 +83,6 @@ ELSE: cdef extern from '../../cupy_backend_runtime.h' nogil: bint hip_environment - ########################################################################### # Constants ########################################################################### @@ -100,7 +96,6 @@ ELSE: deviceAttributeComputeCapabilityMajor = cudaDevAttrComputeCapabilityMajor deviceAttributeComputeCapabilityMinor = cudaDevAttrComputeCapabilityMinor - # Provide access to constants from Python. 
# TODO(kmaehashi): Deprecate aliases above so that we can just do: # from cupy_backends.cuda.api._runtime_enum import * @@ -112,10 +107,8 @@ ELSE: if not key.startswith('_'): setattr(this, key, getattr(_runtime_enum, key)) - _export_enum() - ########################################################################### # Constants (CuPy) ########################################################################### @@ -123,7 +116,6 @@ ELSE: _is_hip_environment = hip_environment # for runtime being cimport'd is_hip = hip_environment # for runtime being import'd - ########################################################################### # Error handling ########################################################################### @@ -140,7 +132,6 @@ ELSE: def __reduce__(self): return (type(self), (self.status,)) - @cython.profile(False) cpdef inline check_status(int status): if status != 0: @@ -148,7 +139,6 @@ ELSE: cudaGetLastError() raise CUDARuntimeError(status) - ########################################################################### # Initialization ########################################################################### @@ -173,7 +163,6 @@ ELSE: check_status(status) return version - ########################################################################### # Device and context operations ########################################################################### @@ -231,28 +220,34 @@ ELSE: 'memoryClockRate': props.memoryClockRate, 'memoryBusWidth': props.memoryBusWidth, 'l2CacheSize': props.l2CacheSize, - 'maxThreadsPerMultiProcessor': props.maxThreadsPerMultiProcessor, + 'maxThreadsPerMultiProcessor': ( + props.maxThreadsPerMultiProcessor), 'isMultiGpuBoard': props.isMultiGpuBoard, 'cooperativeLaunch': props.cooperativeLaunch, - 'cooperativeMultiDeviceLaunch': props.cooperativeMultiDeviceLaunch, + 'cooperativeMultiDeviceLaunch': ( + props.cooperativeMultiDeviceLaunch), } IF CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 9020: properties['deviceOverlap'] = props.deviceOverlap properties['maxTexture1DMipmap'] = props.maxTexture1DMipmap properties['maxTexture1DLinear'] = props.maxTexture1DLinear - properties['maxTexture1DLayered'] = tuple(props.maxTexture1DLayered) + properties['maxTexture1DLayered'] = tuple( + props.maxTexture1DLayered) properties['maxTexture2DMipmap'] = tuple(props.maxTexture2DMipmap) properties['maxTexture2DLinear'] = tuple(props.maxTexture2DLinear) - properties['maxTexture2DLayered'] = tuple(props.maxTexture2DLayered) + properties['maxTexture2DLayered'] = tuple( + props.maxTexture2DLayered) properties['maxTexture2DGather'] = tuple(props.maxTexture2DGather) properties['maxTexture3DAlt'] = tuple(props.maxTexture3DAlt) properties['maxTextureCubemap'] = props.maxTextureCubemap properties['maxTextureCubemapLayered'] = tuple( props.maxTextureCubemapLayered) properties['maxSurface1D'] = props.maxSurface1D - properties['maxSurface1DLayered'] = tuple(props.maxSurface1DLayered) + properties['maxSurface1DLayered'] = tuple( + props.maxSurface1DLayered) properties['maxSurface2D'] = tuple(props.maxSurface2D) - properties['maxSurface2DLayered'] = tuple(props.maxSurface2DLayered) + properties['maxSurface2DLayered'] = tuple( + props.maxSurface2DLayered) properties['maxSurface3D'] = tuple(props.maxSurface3D) properties['maxSurfaceCubemap'] = props.maxSurfaceCubemap properties['maxSurfaceCubemapLayered'] = tuple( @@ -274,7 +269,8 @@ ELSE: properties['singleToDoublePrecisionPerfRatio'] = ( props.singleToDoublePrecisionPerfRatio) properties['pageableMemoryAccess'] = 
props.pageableMemoryAccess - properties['concurrentManagedAccess'] = props.concurrentManagedAccess + properties['concurrentManagedAccess'] = ( + props.concurrentManagedAccess) properties['computePreemptionSupported'] = ( props.computePreemptionSupported) properties['canUseHostPointerForRegisteredMem'] = ( @@ -289,7 +285,8 @@ ELSE: properties['luid'] = props.luid properties['luidDeviceNodeMask'] = props.luidDeviceNodeMask if CUPY_USE_CUDA_PYTHON or CUPY_CUDA_VERSION >= 11000: - properties['persistingL2CacheMaxSize'] = props.persistingL2CacheMaxSize + properties['persistingL2CacheMaxSize'] = ( + props.persistingL2CacheMaxSize) properties['maxBlocksPerMultiProcessor'] = ( props.maxBlocksPerMultiProcessor) properties['accessPolicyMaxWindowSize'] = ( @@ -315,9 +312,11 @@ ELSE: cdef dict arch = {} # for hipDeviceArch_t arch['hasGlobalInt32Atomics'] = props.arch.hasGlobalInt32Atomics - arch['hasGlobalFloatAtomicExch'] = props.arch.hasGlobalFloatAtomicExch + arch['hasGlobalFloatAtomicExch'] = ( + props.arch.hasGlobalFloatAtomicExch) arch['hasSharedInt32Atomics'] = props.arch.hasSharedInt32Atomics - arch['hasSharedFloatAtomicExch'] = props.arch.hasSharedFloatAtomicExch + arch['hasSharedFloatAtomicExch'] = ( + props.arch.hasSharedFloatAtomicExch) arch['hasFloatAtomicAdd'] = props.arch.hasFloatAtomicAdd arch['hasGlobalInt64Atomics'] = props.arch.hasGlobalInt64Atomics arch['hasSharedInt64Atomics'] = props.arch.hasSharedInt64Atomics @@ -340,7 +339,8 @@ ELSE: properties['managedMemory'] = props.managedMemory properties['directManagedMemAccessFromHost'] = ( props.directManagedMemAccessFromHost) - properties['concurrentManagedAccess'] = props.concurrentManagedAccess + properties['concurrentManagedAccess'] = ( + props.concurrentManagedAccess) properties['pageableMemoryAccess'] = props.pageableMemoryAccess properties['pageableMemoryAccessUsesHostPageTables'] = ( props.pageableMemoryAccessUsesHostPageTables) @@ -420,7 +420,6 @@ ELSE: status = cudaDeviceSetLimit(limit, value) check_status(status) - ########################################################################### # IPC operations ########################################################################### @@ -471,7 +470,6 @@ ELSE: check_status(status) return devPtr - ########################################################################### # Memory management ########################################################################### @@ -624,7 +622,7 @@ ELSE: MemoryKind kind, intptr_t stream): with nogil: status = cudaMemcpy2DAsync(dst, dpitch, src, spitch, - width, height, kind, + width, height, kind, stream) check_status(status) @@ -711,14 +709,14 @@ ELSE: check_status(status) IF CUPY_HIP_VERSION >= 60000000: if attrs.type == 0: # hipMemoryTypeHost - attrs.type = 1 # cudaMemoryTypeHost + attrs.type = 1 # cudaMemoryTypeHost elif attrs.type == 1: # hipMemoryTypeDevice - attrs.type = 2 # cudaMemoryTypeDevice + attrs.type = 2 # cudaMemoryTypeDevice ELIF CUPY_HIP_VERSION > 0: if attrs.memoryType == 0: # hipMemoryTypeHost - attrs.memoryType = 1 # cudaMemoryTypeHost - elif attrs.memoryType == 1: # hipMemoryTypeDevice - attrs.memoryType = 2 # cudaMemoryTypeDevice + attrs.memoryType = 1 # cudaMemoryTypeHost + elif attrs.memoryType == 1: # hipMemoryTypeDevice + attrs.memoryType = 2 # cudaMemoryTypeDevice IF CUPY_CUDA_VERSION > 0 or CUPY_HIP_VERSION > 60000000: return PointerAttributes( attrs.device, @@ -848,7 +846,6 @@ ELSE: out) check_status(status) - ########################################################################### # Stream and Event 
########################################################################### @@ -859,25 +856,21 @@ ELSE: check_status(status) return stream - cpdef intptr_t streamCreateWithFlags(unsigned int flags) except? 0: cdef driver.Stream stream status = cudaStreamCreateWithFlags(&stream, flags) check_status(status) return stream - cpdef streamDestroy(intptr_t stream): status = cudaStreamDestroy(stream) check_status(status) - cpdef streamSynchronize(intptr_t stream): with nogil: status = cudaStreamSynchronize(stream) check_status(status) - cdef _streamCallbackFunc(driver.Stream hStream, int status, void* func_arg) with gil: obj = func_arg @@ -885,14 +878,12 @@ ELSE: func(hStream, status, arg) cpython.Py_DECREF(obj) - cdef _HostFnFunc(void* func_arg) with gil: obj = func_arg func, arg = obj func(arg) cpython.Py_DECREF(obj) - cpdef streamAddCallback(intptr_t stream, callback, intptr_t arg, unsigned int flags=0): if _is_hip_environment and stream == 0: @@ -906,10 +897,10 @@ ELSE: func_arg, flags) check_status(status) - cpdef launchHostFunc(intptr_t stream, callback, intptr_t arg): if 0 < CUPY_HIP_VERSION < 50200000: - raise RuntimeError('This feature is supported on HIP since ROCm 5.2') + raise RuntimeError('This feature is supported on ' + 'HIP since ROCm 5.2') func_arg = (callback, arg) cpython.Py_INCREF(func_arg) @@ -919,11 +910,9 @@ ELSE: func_arg) check_status(status) - cpdef streamQuery(intptr_t stream): return cudaStreamQuery(stream) - cpdef streamWaitEvent(intptr_t stream, intptr_t event, unsigned int flags=0): with nogil: @@ -931,7 +920,6 @@ ELSE: event, flags) check_status(status) - cpdef streamBeginCapture(intptr_t stream, int mode=streamCaptureModeRelaxed): if 0 < CUPY_HIP_VERSION < 40300000: @@ -942,7 +930,6 @@ ELSE: mode) check_status(status) - cpdef intptr_t streamEndCapture(intptr_t stream) except? 0: # TODO(leofang): check and raise if stream == 0? cdef Graph g @@ -953,21 +940,19 @@ ELSE: check_status(status) return g - cpdef bint streamIsCapturing(intptr_t stream) except*: cdef StreamCaptureStatus s if 0 < CUPY_HIP_VERSION < 50000000: raise RuntimeError('streamIsCapturing is not supported in ROCm') with nogil: status = cudaStreamIsCapturing(stream, &s) - check_status(status) # cudaErrorStreamCaptureImplicit could be - # raised here + # cudaErrorStreamCaptureImplicit could be raised here + check_status(status) if s == streamCaptureStatusInvalidated: raise RuntimeError('the stream was capturing, but an error has ' 'invalidated the capture sequence') return s - cpdef intptr_t eventCreate() except? 0: cdef driver.Event event status = cudaEventCreate(&event) @@ -980,12 +965,10 @@ ELSE: check_status(status) return event - cpdef eventDestroy(intptr_t event): status = cudaEventDestroy(event) check_status(status) - cpdef float eventElapsedTime(intptr_t start, intptr_t end) except? 
0: cdef float ms status = cudaEventElapsedTime(&ms, start, @@ -993,22 +976,18 @@ ELSE: check_status(status) return ms - cpdef eventQuery(intptr_t event): return cudaEventQuery(event) - cpdef eventRecord(intptr_t event, intptr_t stream): status = cudaEventRecord(event, stream) check_status(status) - cpdef eventSynchronize(intptr_t event): with nogil: status = cudaEventSynchronize(event) check_status(status) - ########################################################################### # util ########################################################################### @@ -1025,7 +1004,6 @@ ELSE: memGetInfo() tls.context_initialized[dev] = True - ########################################################################### # Texture ########################################################################### @@ -1091,7 +1069,6 @@ ELSE: intptr_t d, size_t p, size_t xsz, size_t ysz) except*: return make_cudaPitchedPtr(d, p, xsz, ysz) - ########################################################################### # Graph ########################################################################### From 4224f3291d6daa1a904d15249ecee9cd2e9538bd Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 10 Apr 2024 18:00:31 +0000 Subject: [PATCH 11/12] Remove dont_use to use mapping in custom json --- cupy_backends/cuda/api/runtime.pxd | 2 +- install/amd_build/rocm_custom_mapping.json | 3 +-- install/cupy_builder/_command.py | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cupy_backends/cuda/api/runtime.pxd b/cupy_backends/cuda/api/runtime.pxd index 88ff564bfd7..3dd3d45356a 100644 --- a/cupy_backends/cuda/api/runtime.pxd +++ b/cupy_backends/cuda/api/runtime.pxd @@ -142,7 +142,7 @@ cdef int deviceAttributeComputeCapabilityMinor cdef bint _is_hip_environment -IF CUPY_DONT_USE_GEN_HIP_CODE: +IF not CUPY_USE_GEN_HIP_CODE: ########################################################################### # Classes ########################################################################### diff --git a/install/amd_build/rocm_custom_mapping.json b/install/amd_build/rocm_custom_mapping.json index 4ac4e0b978e..728ed421b85 100644 --- a/install/amd_build/rocm_custom_mapping.json +++ b/install/amd_build/rocm_custom_mapping.json @@ -1,6 +1,5 @@ { "custom_map": { - "CUPY_USE_GEN_HIP_CODE" : "CUPY_DONT_USE_GEN_HIP_CODE", - "CUPY_DONT_USE_GEN_HIP_CODE" : "CUPY_USE_GEN_HIP_CODE" + "CUPY_USE_GEN_HIP_CODE" : "CUPY_DONT_USE_GEN_HIP_CODE" } } diff --git a/install/cupy_builder/_command.py b/install/cupy_builder/_command.py index b31c17488d6..5526a926d63 100644 --- a/install/cupy_builder/_command.py +++ b/install/cupy_builder/_command.py @@ -123,7 +123,6 @@ def _cythonize(self, nthreads: int) -> None: compile_time_env['CUPY_CUDA_VERSION'] = 0 compile_time_env['CUPY_HIP_VERSION'] = 0 compile_time_env['CUPY_USE_GEN_HIP_CODE'] = 0 - compile_time_env['CUPY_DONT_USE_GEN_HIP_CODE'] = 1 elif ctx.use_hip: # on ROCm/HIP compile_time_env['CUPY_CUDA_VERSION'] = 0 compile_time_env['CUPY_HIP_VERSION'] = build.get_hip_version() @@ -134,7 +133,6 @@ def _cythonize(self, nthreads: int) -> None: ctx.features['cuda'].get_version()) compile_time_env['CUPY_HIP_VERSION'] = 0 compile_time_env['CUPY_USE_GEN_HIP_CODE'] = 0 - compile_time_env['CUPY_DONT_USE_GEN_HIP_CODE'] = 1 print('Compile-time constants: ' + json.dumps(compile_time_env, indent=4)) From 8d7cc141af520e89bf42e282e1d7ecf19ee3bfd5 Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Wed, 10 Apr 2024 13:17:57 -0500 Subject: [PATCH 12/12] Move 
runtime_hip to HIP list

---
 install/cupy_builder/_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py
index 75a79fe320c..f2f91cdf258 100644
--- a/install/cupy_builder/_features.py
+++ b/install/cupy_builder/_features.py
@@ -96,7 +96,6 @@ def _from_dict(d: Dict[str, Any], ctx: Context) -> Feature:
     'cupy_backends.cuda.api._driver_enum',
     'cupy_backends.cuda.api.runtime',
     'cupy_backends.cuda.api._runtime_enum',
-    'cupy_backends.cuda.api.runtime_hip',
     'cupy_backends.cuda.libs.cublas',
     'cupy_backends.cuda.libs.curand',
     'cupy_backends.cuda.libs.cusparse',
@@ -159,6 +158,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]:
         'name': 'cuda',
         'required': True,
         'file': _cuda_files + [
+            'cupy_backends.cuda.api.runtime_hip',
             'cupy_backends.cuda.libs.nvtx',
             'cupy_backends.cuda.libs.cusolver',
             'cupy_backends.cuda.libs.cusolver_hip',
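The `IF` blocks introduced throughout this series depend on compile-time constants that `install/cupy_builder/_command.py` assembles (see PATCH 08 and PATCH 11). A simplified, hypothetical sketch of that plumbing is shown below; the literal values and the direct `cythonize()` call are illustrative only, since the real builder derives them from the detected CUDA or ROCm toolchain.

```python
# Hypothetical sketch: how the compile-time constants reach the IF branches.
from Cython.Build import cythonize

compile_time_env = {
    'CUPY_CUDA_VERSION': 0,        # 0 on ROCm/HIP builds
    'CUPY_HIP_VERSION': 60000000,  # e.g. ROCm 6.0; 0 on CUDA and stub builds
    'CUPY_USE_GEN_HIP_CODE': 1,    # select the generated runtime_hip bindings
}

# Cython evaluates statements such as `IF CUPY_HIP_VERSION > 0:` and
# `IF not CUPY_USE_GEN_HIP_CODE:` against this environment while generating
# C++, so the unused backend never appears in the compiled extensions.
extensions = cythonize(
    ['cupy/_core/core.pyx'],
    compile_time_env=compile_time_env,
)
```

After PATCH 11 the CUDA and stub builds no longer define `CUPY_DONT_USE_GEN_HIP_CODE`; `runtime.pxd` tests `IF not CUPY_USE_GEN_HIP_CODE:` instead, so a single flag is sufficient, and `rocm_custom_mapping.json` retains only the one-way `CUPY_USE_GEN_HIP_CODE` to `CUPY_DONT_USE_GEN_HIP_CODE` entry.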