From 51f1c664b78e2f282d05aa25aa193158d4d1544d Mon Sep 17 00:00:00 2001 From: gfardell Date: Thu, 28 Jan 2021 14:15:12 +0000 Subject: [PATCH] added backward compatibility for compute 3.0 --- MATLAB/Source/POCS_TV.cu | 8 +++++--- MATLAB/Source/POCS_TV2.cu | 8 +++++--- MATLAB/Source/Siddon_projection.cu | 4 +++- MATLAB/Source/ray_interpolated_projection.cu | 4 +++- MATLAB/Source/tvdenoising.cu | 4 +++- MATLAB/Source/voxel_backprojection.cu | 4 +++- MATLAB/Source/voxel_backprojection2.cu | 4 +++- MATLAB/Source/voxel_backprojection_parallel.cu | 4 +++- Python/setup.py | 2 +- Python/tigre/Source/POCS_TV.cu | 8 +++++--- Python/tigre/Source/POCS_TV2.cu | 8 +++++--- Python/tigre/Source/Siddon_projection.cu | 4 +++- Python/tigre/Source/ray_interpolated_projection.cu | 4 +++- Python/tigre/Source/tvdenoising.cu | 4 +++- Python/tigre/Source/voxel_backprojection.cu | 4 +++- Python/tigre/Source/voxel_backprojection2.cu | 5 ++++- Python/tigre/Source/voxel_backprojection_parallel.cu | 4 +++- 17 files changed, 58 insertions(+), 25 deletions(-) diff --git a/MATLAB/Source/POCS_TV.cu b/MATLAB/Source/POCS_TV.cu index ad975cfc..a51e198f 100644 --- a/MATLAB/Source/POCS_TV.cu +++ b/MATLAB/Source/POCS_TV.cu @@ -183,7 +183,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -231,7 +231,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -394,8 +394,10 @@ do { \ // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/POCS_TV2.cu b/MATLAB/Source/POCS_TV2.cu index f0402e6f..7ec9b004 100644 --- a/MATLAB/Source/POCS_TV2.cu +++ b/MATLAB/Source/POCS_TV2.cu @@ -204,7 +204,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -252,7 +252,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -415,8 +415,10 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/Siddon_projection.cu b/MATLAB/Source/Siddon_projection.cu index 543cbeb3..5b825cd5 100644 --- a/MATLAB/Source/Siddon_projection.cu +++ b/MATLAB/Source/Siddon_projection.cu @@ -342,8 +342,10 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons //Pagelock memory for syncronous copy. 
// Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ diff --git a/MATLAB/Source/ray_interpolated_projection.cu b/MATLAB/Source/ray_interpolated_projection.cu index c6499f08..d7c73bed 100644 --- a/MATLAB/Source/ray_interpolated_projection.cu +++ b/MATLAB/Source/ray_interpolated_projection.cu @@ -291,8 +291,10 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & splits>1){ diff --git a/MATLAB/Source/tvdenoising.cu b/MATLAB/Source/tvdenoising.cu index 9ea7df4d..1b89724c 100644 --- a/MATLAB/Source/tvdenoising.cu +++ b/MATLAB/Source/tvdenoising.cu @@ -263,8 +263,10 @@ do { \ // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported & splits>1){ cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/voxel_backprojection.cu b/MATLAB/Source/voxel_backprojection.cu index d48a6249..db41046b 100644 --- a/MATLAB/Source/voxel_backprojection.cu +++ b/MATLAB/Source/voxel_backprojection.cu @@ -312,8 +312,10 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. 
if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ diff --git a/MATLAB/Source/voxel_backprojection2.cu b/MATLAB/Source/voxel_backprojection2.cu index b11ffd59..a9090e11 100644 --- a/MATLAB/Source/voxel_backprojection2.cu +++ b/MATLAB/Source/voxel_backprojection2.cu @@ -353,8 +353,10 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & split_image>1){ diff --git a/MATLAB/Source/voxel_backprojection_parallel.cu b/MATLAB/Source/voxel_backprojection_parallel.cu index cc8ab2aa..6ea5ca00 100644 --- a/MATLAB/Source/voxel_backprojection_parallel.cu +++ b/MATLAB/Source/voxel_backprojection_parallel.cu @@ -299,8 +299,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported){ cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); } diff --git a/Python/setup.py b/Python/setup.py index 03efb2d0..5b388b84 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -16,7 +16,7 @@ # Code from https://github.com/pytorch/pytorch/blob/master/torch/utils/cpp_extension.py COMPUTE_CAPABILITY_ARGS = [ # '-gencode=arch=compute_20,code=sm_20', #deprecated - #'-gencode=arch=compute_30,code=sm_30',#deprecated + '-gencode=arch=compute_30,code=sm_30', '-gencode=arch=compute_37,code=sm_37', '-gencode=arch=compute_52,code=sm_52', '-gencode=arch=compute_60,code=sm_60', diff --git a/Python/tigre/Source/POCS_TV.cu b/Python/tigre/Source/POCS_TV.cu index ad975cfc..a51e198f 100644 --- a/Python/tigre/Source/POCS_TV.cu +++ b/Python/tigre/Source/POCS_TV.cu @@ -183,7 +183,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -231,7 +231,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -394,8 +394,10 @@ do { \ // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. 
- int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/POCS_TV2.cu b/Python/tigre/Source/POCS_TV2.cu index f0402e6f..7ec9b004 100644 --- a/Python/tigre/Source/POCS_TV2.cu +++ b/Python/tigre/Source/POCS_TV2.cu @@ -204,7 +204,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -252,7 +252,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -415,8 +415,10 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/Siddon_projection.cu b/Python/tigre/Source/Siddon_projection.cu index 7e142153..3303c6c1 100644 --- a/Python/tigre/Source/Siddon_projection.cu +++ b/Python/tigre/Source/Siddon_projection.cu @@ -346,8 +346,10 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ diff --git a/Python/tigre/Source/ray_interpolated_projection.cu b/Python/tigre/Source/ray_interpolated_projection.cu index a1b08a83..7afc3dbc 100644 --- a/Python/tigre/Source/ray_interpolated_projection.cu +++ b/Python/tigre/Source/ray_interpolated_projection.cu @@ -296,8 +296,10 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. 
if (isHostRegisterSupported & splits>1){ diff --git a/Python/tigre/Source/tvdenoising.cu b/Python/tigre/Source/tvdenoising.cu index 9ea7df4d..1b89724c 100644 --- a/Python/tigre/Source/tvdenoising.cu +++ b/Python/tigre/Source/tvdenoising.cu @@ -263,8 +263,10 @@ do { \ // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported & splits>1){ cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/voxel_backprojection.cu b/Python/tigre/Source/voxel_backprojection.cu index d870e9be..7345494a 100644 --- a/Python/tigre/Source/voxel_backprojection.cu +++ b/Python/tigre/Source/voxel_backprojection.cu @@ -318,8 +318,10 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ diff --git a/Python/tigre/Source/voxel_backprojection2.cu b/Python/tigre/Source/voxel_backprojection2.cu index 84742b81..b036be2c 100644 --- a/Python/tigre/Source/voxel_backprojection2.cu +++ b/Python/tigre/Source/voxel_backprojection2.cu @@ -357,8 +357,11 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & split_image>1){ diff --git a/Python/tigre/Source/voxel_backprojection_parallel.cu b/Python/tigre/Source/voxel_backprojection_parallel.cu index 6bfcc269..52e7755b 100644 --- a/Python/tigre/Source/voxel_backprojection_parallel.cu +++ b/Python/tigre/Source/voxel_backprojection_parallel.cu @@ -303,8 +303,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. 
- int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported){ cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); }
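
The two guards introduced throughout this patch follow one pattern for keeping the CUDA sources buildable on older toolkits that still target compute 3.0: cudaDevAttrHostRegisterSupported only exists from CUDA 9.2, and the __shfl_down_sync() intrinsic only from CUDA 9.0, so both are wrapped in runtime-version checks with a conservative default (no host pinning, shared-memory reduction) when the toolkit is too old. The standalone sketch below shows that pattern in isolation; it is illustrative only, not the TIGRE kernels themselves. The names compat_sketch.cu, hostRegisterSupported, maybePinHostBuffer and reduceSum are hypothetical, CUDART_VERSION is the runtime-version macro defined by cuda_runtime_api.h, and the kernel assumes a power-of-two block size of at least 64, as in the patched reductions.

// compat_sketch.cu -- illustrative sketch of the version guards used above.
#include <cuda_runtime.h>
#include <cstddef>

// cudaDevAttrHostRegisterSupported was added in CUDA 9.2; on older toolkits
// the variable keeps its default of 0 and pinning is simply skipped.
static int hostRegisterSupported(int device)
{
    int supported = 0;
#if CUDART_VERSION >= 9020
    cudaDeviceGetAttribute(&supported, cudaDevAttrHostRegisterSupported, device);
#endif
    return supported;
}

// Pin a host buffer only when the query says the device supports it;
// otherwise the copies stay pageable, which is the fallback behaviour here.
static void maybePinHostBuffer(float* buf, size_t bytes, int device)
{
    if (hostRegisterSupported(device))
        cudaHostRegister(buf, bytes, cudaHostRegisterPortable);
}

// Block-sum reduction whose final warp is guarded the same way: the
// __shfl_down_sync() intrinsic requires CUDA 9.0, so pre-9 toolkits fall back
// to the classic volatile shared-memory loop. Assumes blockDim.x is a power
// of two and at least 64.
__global__ void reduceSum(const float* in, float* out, unsigned int n)
{
    extern __shared__ float smem[];
    volatile float* sdata = smem;

    unsigned int tid = threadIdx.x;
    unsigned int i   = blockIdx.x * blockDim.x + tid;

    float mySum = (i < n) ? in[i] : 0.0f;
    sdata[tid] = mySum;
    __syncthreads();

    // Shared-memory reduction down to the last 64 partial sums.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s)
            sdata[tid] = mySum = mySum + sdata[tid + s];
        __syncthreads();
    }

#if CUDART_VERSION >= 9000
    if (tid < 32) {
        mySum = sdata[tid] + sdata[tid + 32];
        // Shuffle-based finish: no shared memory needed for the last warp.
        for (int offset = 16; offset > 0; offset >>= 1)
            mySum += __shfl_down_sync(0xFFFFFFFFu, mySum, offset);
    }
#else
    // Pre-CUDA-9 finish: stay in volatile shared memory, relying on the
    // warp-synchronous execution of older architectures.
    if (tid < 32) {
        sdata[tid] = mySum = mySum + sdata[tid + 32];
        for (int offset = 16; offset > 0; offset >>= 1)
            sdata[tid] = mySum = mySum + sdata[tid + offset];
    }
#endif

    if (tid == 0)
        out[blockIdx.x] = mySum;
}

A caller would launch the kernel with one float of dynamic shared memory per thread, e.g. reduceSum<<<blocks, 512, 512 * sizeof(float)>>>(d_in, d_out, n), and could pass any large host buffer through maybePinHostBuffer before the copies, mirroring the splits>1 heuristic used in the patched files.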