From 51f1c664b78e2f282d05aa25aa193158d4d1544d Mon Sep 17 00:00:00 2001 From: gfardell Date: Thu, 28 Jan 2021 14:15:12 +0000 Subject: [PATCH] added backward compatibility for compute 3.0 --- MATLAB/Source/POCS_TV.cu | 8 +++++--- MATLAB/Source/POCS_TV2.cu | 8 +++++--- MATLAB/Source/Siddon_projection.cu | 4 +++- MATLAB/Source/ray_interpolated_projection.cu | 4 +++- MATLAB/Source/tvdenoising.cu | 4 +++- MATLAB/Source/voxel_backprojection.cu | 4 +++- MATLAB/Source/voxel_backprojection2.cu | 4 +++- MATLAB/Source/voxel_backprojection_parallel.cu | 4 +++- Python/setup.py | 2 +- Python/tigre/Source/POCS_TV.cu | 8 +++++--- Python/tigre/Source/POCS_TV2.cu | 8 +++++--- Python/tigre/Source/Siddon_projection.cu | 4 +++- Python/tigre/Source/ray_interpolated_projection.cu | 4 +++- Python/tigre/Source/tvdenoising.cu | 4 +++- Python/tigre/Source/voxel_backprojection.cu | 4 +++- Python/tigre/Source/voxel_backprojection2.cu | 5 ++++- Python/tigre/Source/voxel_backprojection_parallel.cu | 4 +++- 17 files changed, 58 insertions(+), 25 deletions(-) diff --git a/MATLAB/Source/POCS_TV.cu b/MATLAB/Source/POCS_TV.cu index ad975cfc..a51e198f 100644 --- a/MATLAB/Source/POCS_TV.cu +++ b/MATLAB/Source/POCS_TV.cu @@ -183,7 +183,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -231,7 +231,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -394,8 +394,10 @@ do { \ // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/POCS_TV2.cu b/MATLAB/Source/POCS_TV2.cu index f0402e6f..7ec9b004 100644 --- a/MATLAB/Source/POCS_TV2.cu +++ b/MATLAB/Source/POCS_TV2.cu @@ -204,7 +204,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -252,7 +252,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -415,8 +415,10 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/Siddon_projection.cu b/MATLAB/Source/Siddon_projection.cu index 543cbeb3..5b825cd5 100644 --- a/MATLAB/Source/Siddon_projection.cu +++ b/MATLAB/Source/Siddon_projection.cu @@ -342,8 +342,10 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons //Pagelock memory for syncronous copy. 
// Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ diff --git a/MATLAB/Source/ray_interpolated_projection.cu b/MATLAB/Source/ray_interpolated_projection.cu index c6499f08..d7c73bed 100644 --- a/MATLAB/Source/ray_interpolated_projection.cu +++ b/MATLAB/Source/ray_interpolated_projection.cu @@ -291,8 +291,10 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & splits>1){ diff --git a/MATLAB/Source/tvdenoising.cu b/MATLAB/Source/tvdenoising.cu index 9ea7df4d..1b89724c 100644 --- a/MATLAB/Source/tvdenoising.cu +++ b/MATLAB/Source/tvdenoising.cu @@ -263,8 +263,10 @@ do { \ // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported & splits>1){ cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/MATLAB/Source/voxel_backprojection.cu b/MATLAB/Source/voxel_backprojection.cu index d48a6249..db41046b 100644 --- a/MATLAB/Source/voxel_backprojection.cu +++ b/MATLAB/Source/voxel_backprojection.cu @@ -312,8 +312,10 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. 
if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ diff --git a/MATLAB/Source/voxel_backprojection2.cu b/MATLAB/Source/voxel_backprojection2.cu index b11ffd59..a9090e11 100644 --- a/MATLAB/Source/voxel_backprojection2.cu +++ b/MATLAB/Source/voxel_backprojection2.cu @@ -353,8 +353,10 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & split_image>1){ diff --git a/MATLAB/Source/voxel_backprojection_parallel.cu b/MATLAB/Source/voxel_backprojection_parallel.cu index cc8ab2aa..6ea5ca00 100644 --- a/MATLAB/Source/voxel_backprojection_parallel.cu +++ b/MATLAB/Source/voxel_backprojection_parallel.cu @@ -299,8 +299,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported){ cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); } diff --git a/Python/setup.py b/Python/setup.py index 03efb2d0..5b388b84 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -16,7 +16,7 @@ # Code from https://github.com/pytorch/pytorch/blob/master/torch/utils/cpp_extension.py COMPUTE_CAPABILITY_ARGS = [ # '-gencode=arch=compute_20,code=sm_20', #deprecated - #'-gencode=arch=compute_30,code=sm_30',#deprecated + '-gencode=arch=compute_30,code=sm_30', '-gencode=arch=compute_37,code=sm_37', '-gencode=arch=compute_52,code=sm_52', '-gencode=arch=compute_60,code=sm_60', diff --git a/Python/tigre/Source/POCS_TV.cu b/Python/tigre/Source/POCS_TV.cu index ad975cfc..a51e198f 100644 --- a/Python/tigre/Source/POCS_TV.cu +++ b/Python/tigre/Source/POCS_TV.cu @@ -183,7 +183,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -231,7 +231,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -394,8 +394,10 @@ do { \ // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. 
- int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/POCS_TV2.cu b/Python/tigre/Source/POCS_TV2.cu index f0402e6f..7ec9b004 100644 --- a/Python/tigre/Source/POCS_TV2.cu +++ b/Python/tigre/Source/POCS_TV2.cu @@ -204,7 +204,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -252,7 +252,7 @@ do { \ __syncthreads(); -#if (__CUDA_ARCH__ >= 300) +#if (__CUDART_VERSION >= 9000) if ( tid < 32 ) { mySum = sdata[tid] + sdata[tid + 32]; @@ -415,8 +415,10 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/Siddon_projection.cu b/Python/tigre/Source/Siddon_projection.cu index 7e142153..3303c6c1 100644 --- a/Python/tigre/Source/Siddon_projection.cu +++ b/Python/tigre/Source/Siddon_projection.cu @@ -346,8 +346,10 @@ int siddon_ray_projection(float * img, Geometry geo, float** result,float cons //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ diff --git a/Python/tigre/Source/ray_interpolated_projection.cu b/Python/tigre/Source/ray_interpolated_projection.cu index a1b08a83..7afc3dbc 100644 --- a/Python/tigre/Source/ray_interpolated_projection.cu +++ b/Python/tigre/Source/ray_interpolated_projection.cu @@ -296,8 +296,10 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. 
if (isHostRegisterSupported & splits>1){ diff --git a/Python/tigre/Source/tvdenoising.cu b/Python/tigre/Source/tvdenoising.cu index 9ea7df4d..1b89724c 100644 --- a/Python/tigre/Source/tvdenoising.cu +++ b/Python/tigre/Source/tvdenoising.cu @@ -263,8 +263,10 @@ do { \ // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported & splits>1){ cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); diff --git a/Python/tigre/Source/voxel_backprojection.cu b/Python/tigre/Source/voxel_backprojection.cu index d870e9be..7345494a 100644 --- a/Python/tigre/Source/voxel_backprojection.cu +++ b/Python/tigre/Source/voxel_backprojection.cu @@ -318,8 +318,10 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ diff --git a/Python/tigre/Source/voxel_backprojection2.cu b/Python/tigre/Source/voxel_backprojection2.cu index 84742b81..b036be2c 100644 --- a/Python/tigre/Source/voxel_backprojection2.cu +++ b/Python/tigre/Source/voxel_backprojection2.cu @@ -357,8 +357,11 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. - int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Syncronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & split_image>1){ diff --git a/Python/tigre/Source/voxel_backprojection_parallel.cu b/Python/tigre/Source/voxel_backprojection_parallel.cu index 6bfcc269..52e7755b 100644 --- a/Python/tigre/Source/voxel_backprojection_parallel.cu +++ b/Python/tigre/Source/voxel_backprojection_parallel.cu @@ -303,8 +303,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re //Pagelock memory for syncronous copy. // Lets try to make the host memory pinned: // We laredy queried the GPU and assuemd they are the same, thus shoudl have the same attributes. 
- int isHostRegisterSupported; + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,0); +#endif if (isHostRegisterSupported){ cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); }
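
The two guards introduced throughout this patch follow one pattern for keeping the CUDA sources buildable on older toolkits that still target compute 3.0: cudaDevAttrHostRegisterSupported only exists from CUDA 9.2, and the __shfl_down_sync() intrinsic only from CUDA 9.0, so both are wrapped in runtime-version checks with a conservative default (no host pinning, shared-memory reduction) when the toolkit is too old. The standalone sketch below shows that pattern in isolation; it is illustrative only, not the TIGRE kernels themselves. The names compat_sketch.cu, hostRegisterSupported, maybePinHostBuffer and reduceSum are hypothetical, CUDART_VERSION is the runtime-version macro defined by cuda_runtime_api.h, and the kernel assumes a power-of-two block size of at least 64, as in the patched reductions.

// compat_sketch.cu -- illustrative sketch of the version guards used above.
#include <cuda_runtime.h>
#include <cstddef>

// cudaDevAttrHostRegisterSupported was added in CUDA 9.2; on older toolkits
// the variable keeps its default of 0 and pinning is simply skipped.
static int hostRegisterSupported(int device)
{
    int supported = 0;
#if CUDART_VERSION >= 9020
    cudaDeviceGetAttribute(&supported, cudaDevAttrHostRegisterSupported, device);
#endif
    return supported;
}

// Pin a host buffer only when the query says the device supports it;
// otherwise the copies stay pageable, which is the fallback behaviour here.
static void maybePinHostBuffer(float* buf, size_t bytes, int device)
{
    if (hostRegisterSupported(device))
        cudaHostRegister(buf, bytes, cudaHostRegisterPortable);
}

// Block-sum reduction whose final warp is guarded the same way: the
// __shfl_down_sync() intrinsic requires CUDA 9.0, so pre-9 toolkits fall back
// to the classic volatile shared-memory loop. Assumes blockDim.x is a power
// of two and at least 64.
__global__ void reduceSum(const float* in, float* out, unsigned int n)
{
    extern __shared__ float smem[];
    volatile float* sdata = smem;

    unsigned int tid = threadIdx.x;
    unsigned int i   = blockIdx.x * blockDim.x + tid;

    float mySum = (i < n) ? in[i] : 0.0f;
    sdata[tid] = mySum;
    __syncthreads();

    // Shared-memory reduction down to the last 64 partial sums.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s)
            sdata[tid] = mySum = mySum + sdata[tid + s];
        __syncthreads();
    }

#if CUDART_VERSION >= 9000
    if (tid < 32) {
        mySum = sdata[tid] + sdata[tid + 32];
        // Shuffle-based finish: no shared memory needed for the last warp.
        for (int offset = 16; offset > 0; offset >>= 1)
            mySum += __shfl_down_sync(0xFFFFFFFFu, mySum, offset);
    }
#else
    // Pre-CUDA-9 finish: stay in volatile shared memory, relying on the
    // warp-synchronous execution of older architectures.
    if (tid < 32) {
        sdata[tid] = mySum = mySum + sdata[tid + 32];
        for (int offset = 16; offset > 0; offset >>= 1)
            sdata[tid] = mySum = mySum + sdata[tid + offset];
    }
#endif

    if (tid == 0)
        out[blockIdx.x] = mySum;
}

A caller would launch the kernel with one float of dynamic shared memory per thread, e.g. reduceSum<<<blocks, 512, 512 * sizeof(float)>>>(d_in, d_out, n), and could pass any large host buffer through maybePinHostBuffer before the copies, mirroring the splits>1 heuristic used in the patched files.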